<?php

/**
 * @file classes/tasks/PKPUsageStatsLoader.php
 *
 * Copyright (c) 2022 Simon Fraser University
 * Copyright (c) 2022 John Willinsky
 * Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
 *
 * @class PKPUsageStatsLoader
 *
 * @ingroup tasks
 *
 * @brief Scheduled task to extract transform and load usage statistics data into database.
 */

namespace PKP\task;

use APP\core\Application;
use APP\core\Services;
use APP\statistics\StatisticsHelper;
use Illuminate\Support\Facades\Bus;
use PKP\file\FileManager;
use PKP\jobs\statistics\CompileMonthlyMetrics;
use PKP\scheduledTask\ScheduledTaskHelper;
use PKP\site\Site;
use Throwable;

abstract class PKPUsageStatsLoader extends FileLoader
{
    /**
     * If the log files should be automatically moved to te stage folder.
     * This is the case for daily log file processing.
     * This is not the case if the whole month is reprocessed - all log files for the given month should be manually placed in the stage folder.
     */
    private bool $autoStage;

    /** List of months the processed daily log files are from, to consider for monthly aggregation */
    private array $months = [];

    /** List of log files that needs to be processed within this scheduled task, and the jobs needs to be chained for. */
    private array $logFiles = [];

    /**
     * Constructor.
     */
    public function __construct(array $args)
    {
        $this->autoStage = true;

        // if log files for a whole month should be reprocessed,
        // the month is given as parameter
        if (!empty($args)) {
            $reprocessMonth = current($args);
            $reprocessFiles = $this->getStagedFilesByMonth($reprocessMonth);
            $this->setOnlyConsiderFiles($reprocessFiles);
            $this->autoStage = false;
        }

        // shall the archived log files be compressed
        $site = Application::get()->getRequest()->getSite();
        if ($site->getData('compressStatsLogs')) {
            $this->setCompressArchives(true);
        }

        // Define the base filesystem path.
        $basePath = StatisticsHelper::getUsageStatsDirPath();
        $args[0] = $basePath;
        parent::__construct($args);

        $this->checkFolderStructure(true);
    }

    /**
     * @copydoc FileLoader::getName()
     */
    public function getName(): string
    {
        return __('admin.scheduledTask.usageStatsLoader');
    }

    /**
     * Get the jobs needed to process a usage stats log file and compile the stats.
     * The jobs have to be in the right execution order.
     *
     * @return BaseJob[]
     */
    abstract protected function getFileJobs(string $filePath, Site $site): array;

    /**
     * @copydoc FileLoader::executeActions()
     */
    protected function executeActions(): bool
    {
        // It's possible that the processing directory has files that
        // were being processed but the php process was stopped before
        // finishing the processing, or there may be a concurrent process running.
        // Warn the user if this is the case.
        $processingDirFiles = glob($this->getProcessingPath() . '/' . '*');
        $processingDirError = is_array($processingDirFiles) && count($processingDirFiles);
        // If the processing directory is not empty (and this is not the reprocessing of the older log files)
        // log that message
        if ($processingDirError && !empty($this->getOnlyConsiderFiles())) {
            $this->addExecutionLogEntry(__('admin.scheduledTask.usageStatsLoader.processingPathNotEmpty', ['directory' => $this->getProcessingPath()]), ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_ERROR);
        }
        if ($this->autoStage) {
            $this->autoStage();
        }
        $processFilesResult = parent::executeActions();
        if (!$processFilesResult) {
            return false;
        }

        $site = Application::get()->getRequest()->getSite();
        $jobs = [];
        foreach ($this->logFiles as $filePath) {
            $jobsPerFile = $this->getFileJobs($filePath, $site);
            $jobs = array_merge($jobs, $jobsPerFile);
        }
        foreach ($this->months as $month) {
            $compileMonthlyMetricsJob = new CompileMonthlyMetrics($month, $site);
            $jobs = array_merge($jobs, [$compileMonthlyMetricsJob]);
        }
        // Bus::chain() cannot accept an empty array
        if (!empty($jobs)) {
            Bus::chain($jobs)
                ->catch(function (Throwable $e) {
                })
                ->dispatch();

            $this->addExecutionLogEntry(__(
                'admin.scheduledTask.usageStatsLoader.jobDispatched'
            ), ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_NOTICE);
        }

        return (!$processingDirError);
    }

    /**
     * Check if the log file's date is later than the first installation of the new log file format,
     * so that the log file can be processed.
     */
    protected function isDateValid(string $loadId): bool
    {
        $date = substr($loadId, -12, 8);
        // Get the date when the version that uses the new log file format (and COUNTER R5) is installed.
        // Only the log files later than that day can be (regularly) processed here.
        $statsService = Services::get('sushiStats');
        $dateR5Installed = date('Ymd', strtotime($statsService->getEarliestDate()));
        if ($date < $dateR5Installed) {
            // the log file is in old log file format
            // return the file to staging and
            // log the error
            $this->addExecutionLogEntry(__(
                'admin.scheduledTask.usageStatsLoader.veryOldLogFile',
                ['file' => $loadId]
            ), ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_ERROR);
            return false;
        }
        return true;
    }

    /**
     * Check if stats for the log file's month do not already exist.
     * Return true if they do not exist, so that log file can be processed.
     * Else, return the file to staging and log the error that
     * the CLI script for reprocessing should be called.
     * If the log files of the month are being reprocessed,
     * the CLI reprocessing script will first remove all the stats for the month,
     * so that this function will return true in that case.
     */
    protected function isMonthValid(string $loadId, string $month): bool
    {
        $currentMonth = date('Ym');
        $lastMonth = date('Ym', strtotime('last month'));
        $site = Application::get()->getRequest()->getSite();
        // If the daily metrics are not kept, and this is not the current month (which is kept in the DB)
        // the CLI script to reprocess the whole month should be called.
        if (!$site->getData('keepDailyUsageStats') && $month != $currentMonth && $month != $lastMonth) {
            $statsService = Services::get('sushiStats');
            $counterMonthExists = $statsService->monthExists($month);
            $geoService = Services::get('geoStats');
            $geoMonthExists = $geoService->monthExists($month);
            if ($counterMonthExists || $geoMonthExists) {
                $this->addExecutionLogEntry(__(
                    'admin.scheduledTask.usageStatsLoader.monthExists',
                    ['file' => $loadId]
                ), ScheduledTaskHelper::SCHEDULED_TASK_MESSAGE_TYPE_ERROR);
                return false;
            }
        }
        return true;
    }

    /**
     * Add the log file's month to the list of months to be considered for the
     * stats aggregation after the current log files are processed.
     */
    protected function considerMonthForStatsAggregation(string $month): void
    {
        if (!in_array($month, $this->months)) {
            $this->months[] = $month;
        }
    }
    /**
     * @copydoc FileLoader::processFile()
     * The file name MUST be of form usage_events_YYYYMMDD.log
     * If the function successfully finishes, the file will be archived.
     */
    protected function processFile(string $filePath): bool|int
    {
        $loadId = basename($filePath);
        $month = substr($loadId, -12, 6);
        // if the file is not being reprocessed using the CLI tool
        if (!in_array($loadId, $this->getOnlyConsiderFiles())) {
            // Check if the log file is an old log file and if the stats for the month already exist
            if (!$this->isDateValid($loadId) || !$this->isMonthValid($loadId, $month)) {
                return self::FILE_LOADER_RETURN_TO_STAGING;
            }
        }
        // Add this log file to the list, so that all jobs, for all files can be chained.
        $this->logFiles[] = $loadId;
        // Add this log file's month to the list of months the stats need to be aggregated for.
        $this->considerMonthForStatsAggregation($month);
        return self::FILE_LOADER_RETURN_TO_DISPATCH;
    }

    /**
     * Auto stage usage stats log files, also moving files that
     * might be in processing folder to stage folder.
     */
    protected function autoStage(): void
    {
        // Copy all log files to stage directory, except the current day one.
        $fileManager = new FileManager();
        $logFiles = [];
        $logsDirFiles = glob($this->getUsageEventLogsPath() . '/*');
        if (is_array($logsDirFiles)) {
            $logFiles = array_merge($logFiles, $logsDirFiles);
        }
        // It's possible that the processing directory have files that
        // were being processed but the php process was stopped before
        // finishing the processing. Just copy them to the stage directory too.
        $processingDirFiles = glob($this->getProcessingPath() . '/*');
        if (is_array($processingDirFiles)) {
            $logFiles = array_merge($logFiles, $processingDirFiles);
        }

        foreach ($logFiles as $filePath) {
            if ($fileManager->fileExists($filePath)) {
                $filename = pathinfo($filePath, PATHINFO_BASENAME);
                $currentDayFilename = $this->getUsageEventCurrentDayLogName();
                if ($filename == $currentDayFilename) {
                    continue;
                }
                $this->moveFile(pathinfo($filePath, PATHINFO_DIRNAME), $this->getStagePath(), $filename);
            }
        }
    }

    /**
     * Get staged usage log files belonging to a month, that should be reprocessed
     */
    protected function getStagedFilesByMonth(string $month): array
    {
        $files = [];
        $stagePath = StatisticsHelper::getUsageStatsDirPath() . '/' . self::FILE_LOADER_PATH_STAGING;
        $stageDir = opendir($stagePath);
        while ($filename = readdir($stageDir)) {
            if (str_starts_with($filename, 'usage_events_' . $month)) {
                $files[] = $filename;
            }
        }
        return $files;
    }

    /**
     * Get the usage event logs directory path.
     */
    protected function getUsageEventLogsPath(): string
    {
        return StatisticsHelper::getUsageStatsDirPath() . '/usageEventLogs';
    }

    /**
     * Get current day usage event log name.
     */
    protected function getUsageEventCurrentDayLogName(): string
    {
        return 'usage_events_' . date('Ymd') . '.log';
    }
}
