name : Crawler.php
<?php
/**
 * @package   admintools
 * @copyright Copyright (c)2010-2025 Nicholas K. Dionysopoulos / Akeeba Ltd
 * @license   GNU General Public License version 3, or later
 */

namespace Akeeba\Component\AdminTools\Administrator\Scanner;

defined('_JEXEC') || die;

use Akeeba\Component\AdminTools\Administrator\Scanner\Exception\WarningException;
use Akeeba\Component\AdminTools\Administrator\Scanner\Logger\Logger;
use Akeeba\Component\AdminTools\Administrator\Scanner\Util\Configuration;
use Akeeba\Component\AdminTools\Administrator\Scanner\Util\Filesystem;
use Akeeba\Component\AdminTools\Administrator\Scanner\Util\Session;
use Akeeba\Component\AdminTools\Administrator\Scanner\Util\Timer;
use DirectoryIterator;
use Exception;
use RuntimeException;

/**
 * Filesystem crawler
 */
class Crawler extends Part
{
	/**
	 * Directories currently queued for scanning
	 *
	 * @var   array
	 */
	private $directoryQueue = [];

	/**
	 * Files queued for processing
	 *
	 * @var array
	 */
	private $fileQueue = [];

	/**
	 * True when we have finished listing all folders inside the current directory
	 *
	 * @var   boolean
	 */
	private $hasScannedFolders = false;

	/**
	 * True when we have finished listing  all files inside the current directory
	 *
	 * @var   boolean
	 */
	private $hasScannedFiles = false;

	/**
	 * Current directory being scanned
	 *
	 * @var   string
	 */
	private $currentDirectory = '';

	/**
	 * Directory and file filtering engine
	 *
	 * @var   Filter
	 */
	private $filter;

	/**
	 * The file scanner engine
	 *
	 * @var   FileScanner
	 */
	private $fileScanner;

	/**
	 * Crawler constructor.
	 *
	 * @param   Configuration  $configuration
	 * @param   Session        $session
	 * @param   Logger         $logger
	 * @param   Timer          $timer
	 */
	public function __construct(Configuration $configuration, Session $session, Logger $logger, Timer $timer)
	{
		parent::__construct($configuration, $session, $logger, $timer);

		$this->filter      = new Filter($this->configuration);
		$this->fileScanner = new FileScanner($configuration, $session, $logger);

		// Load the state from the session
		$this->directoryQueue    = $this->session->get('directoryQueue', []);
		$this->fileQueue         = $this->session->get('fileQueue', []);
		$this->hasScannedFiles   = $this->session->get('hasScannedFiles', false);
		$this->hasScannedFolders = $this->session->get('hasScannedFolders', false);
		$this->currentDirectory  = $this->session->get('currentDirectory', '');

		$this->setState($this->session->get('crawlerState', self::STATE_INIT));
	}

	public function __destruct()
	{
		// Save the state to the session
		$this->session->set('directoryQueue', $this->directoryQueue);
		$this->session->set('fileQueue', $this->fileQueue);
		$this->session->set('hasScannedFiles', $this->hasScannedFiles);
		$this->session->set('hasScannedFolders', $this->hasScannedFolders);
		$this->session->set('currentDirectory', $this->currentDirectory);
		$this->session->set('crawlerState', $this->getState());
	}

	/**
	 * Initializes the crawler
	 *
	 * @return  void
	 */
	protected function _prepare()
	{
		$this->logger->debug(__CLASS__ . " :: Starting _prepare()");

		// Push the site's root as the first directory to scan
		$this->directoryQueue    = [];
		$this->fileQueue         = [];
		$this->currentDirectory  = Filesystem::normalizePath(JPATH_ROOT);
		$this->currentDirectory  = empty($this->currentDirectory) ? '/' : $this->currentDirectory;
		$this->hasScannedFolders = false;
		$this->hasScannedFiles   = false;

		$this->setState(self::STATE_PREPARED);

		$this->logger->debug(__CLASS__ . " :: prepared");
	}

	/**
	 * Runs a step of the crawler
	 *
	 * @return  void
	 * @throws  Exception
	 */
	protected function _run()
	{
		if ($this->getState() == self::STATE_POSTRUN)
		{
			$this->logger->debug(__CLASS__ . " :: Already finished");
			$this->setStep("-");
			$this->setSubstep("");

			return;
		}

		/**
		 * I am done scanning files and subdirectories and there are no more files to pack. I need to get the next
		 * directory to scan from my queue. If my queue is empty I am all done.
		 */
		if (empty($this->fileQueue) && $this->hasScannedFolders && $this->hasScannedFiles)
		{
			if (!$this->getNextDirectory())
			{
				$this->setState(self::STATE_POSTRUN);

				return;
			}
		}

		// I have no queued files and I haven't yet finished scanning for files. List the files.
		if (!$this->hasScannedFiles && empty($this->fileQueue))
		{
			$this->scanFiles();
		}
		/**
		 * I have queued files. Process them.
		 *
		 * Note that at this point I may have NOT finished listing all files. This is intentional. If a folder has two
		 * thousand files and my fileThreshold is 100 I need to run 20 batches of 100 files each. This lets me be very
		 * efficient in processing files without requesting too much session storage and memory for storing a massive
		 * number of files in the queue.
		 */
		elseif (!empty($this->fileQueue))
		{
			$this->processFiles();
		}
		/**
		 * I am done processing files. Find the subdirectories of the current directory and add them to my queue.
		 */
		elseif (!$this->hasScannedFolders)
		{
			$this->scanSubdirs();
		}
	}

	/**
	 * Finalization.
	 *
	 * I have currently nothing to do.
	 */
	protected function _finalize()
	{
		$this->setState(self::STATE_FINISHED);
	}

	/**
	 * Gets the next directory to scan from the queue.
	 *
	 * It also applies folder filters. Skipped folders are logged as such and are marked as having been fully scanned
	 * for files and folders. This means the next tick will call getNextDirectory once again.
	 *
	 * @return   boolean  True if we found a directory (even if it's filtered), false if the directory queue is empty.
	 */
	protected function getNextDirectory(): bool
	{
		// Reset the file / folder scanning positions
		$this->hasScannedFiles   = false;
		$this->hasScannedFolders = false;

		if (count($this->directoryQueue) == 0)
		{
			// No directories left to scan
			return false;
		}

		// Get and remove the last entry from the $directory_list array
		$this->currentDirectory = array_pop($this->directoryQueue);
		$this->setStep($this->currentDirectory);

		if ($this->filter->isExcludedFolder($this->currentDirectory))
		{
			$this->logger->info("Skipping directory " . $this->currentDirectory);
			$this->hasScannedFolders = true;
			$this->hasScannedFiles   = true;
		}

		return true;
	}

	/**
	 * Try to add some files from the $file_list into the archive
	 *
	 * @return   boolean   True if there were files packed, false otherwise
	 *                     (empty filelist or fatal error)
	 */
	protected function processFiles()
	{
		// Normal file backup loop; we keep on processing the file list, packing files as we go.
		if (count($this->fileQueue) == 0)
		{
			// No files left to pack. Return true and let the engine loop
			return true;
		}

		$this->logger->debug("Processing files");
		$counter = 0;

		while ((count($this->fileQueue) > 0) && ($this->timer->getTimeLeft() >= 0))
		{
			$counter++;
			$file = @array_shift($this->fileQueue);

			// Is this file skipped or simply does not exist anymore?
			if (!@file_exists($file) || $this->filter->isExcludedFile($file))
			{
				continue;
			}

			// Step break before processing a large file (unless it's the FIRST file we process in this step)
			if ($counter != 1)
			{
				$size               = @filesize($file);
				$largeFileThreshold = $this->configuration->get('largeFileThreshold');

				if ($size >= $largeFileThreshold)
				{
					$this->fileQueue[] = $file;
					$this->setBreakFlag();

					return true;
				}
			}

			// Mark another file scanned...
			$scannedFiles = $this->session->get('scannedFiles', 0);
			$this->session->set('scannedFiles', ++$scannedFiles);
			// ...and actually scan the file
			$this->fileScanner->processFile($file);
		}

		// True if we have more files, false if we're done packing
		return (count($this->fileQueue) > 0);
	}

	/**
	 * Steps the subdirectory scanning of the current directory
	 *
	 * @return  void
	 * @throws  Exception
	 */
	protected function scanSubdirs()
	{
		$dirPosition = $this->session->get('dirPosition', null);

		if (is_null($dirPosition))
		{
			$this->logger->info("Scanning directories of " . $this->currentDirectory);
		}
		else
		{
			$this->logger->info("Resuming scanning directories of " . $this->currentDirectory);
		}

		// Get subdirectories
		$exception      = null;
		$subdirectories = [];

		try
		{
			$subdirectories = $this->scanContents($this->currentDirectory, true);
		}
		catch (WarningException $e)
		{
			$this->logger->warning($e->getMessage());
		}
		catch (Exception $e)
		{
			$exception = $e;
		}

		// If the list contains "too many" items, please break this step!
		if ($this->session->get('breakFlag', false))
		{
			// Log the step break decision, for debugging reasons
			$this->logger->info(sprintf("Large directory %s while scanning for subdirectories; I will resume scanning in next step.", $this->currentDirectory));

			// Return immediately
			return;
		}

		// Error control
		if (!is_null($exception))
		{
			throw $exception;
		}

		// Make sure we do not follow directory symlinks because on some sites it could cause an infinite loop
		foreach ($subdirectories as $subdirectory)
		{
			if (is_link($subdirectory))
			{
				$this->logger->info(sprintf("Skipping directory symlink %s/%s", $this->currentDirectory, $subdirectory));

				continue;
			}

			$this->directoryQueue[] = $subdirectory;
		}

		// If the scanner nullified the next position to scan, we're done scanning for subdirectories
		$dirPosition = $this->session->get('dirPosition', null);

		if (is_null($dirPosition))
		{
			$this->hasScannedFolders = true;
		}
	}

	/**
	 * Steps the files scanning of the current directory
	 *
	 * @return  void
	 * @throws  Exception
	 */
	protected function scanFiles()
	{
		$filePosition = $this->session->get('filePosition', null);

		if (is_null($filePosition))
		{
			$this->logger->info("Scanning files of " . $this->currentDirectory);
		}
		else
		{
			$this->logger->info("Resuming scanning files of " . $this->currentDirectory);
		}

		// Get file listing
		$exception = null;
		$fileList  = false;

		try
		{
			$fileList = $this->scanContents($this->currentDirectory, false);
		}
		catch (WarningException $e)
		{
			$this->logger->warning($e->getMessage());
		}
		catch (Exception $e)
		{
			$exception = $e;
		}

		// If the list contains "too many" items, please break this step!
		if ($this->session->get('breakFlag', false))
		{
			// Log the step break decision, for debugging reasons
			$this->logger->info(sprintf("Large directory %s while scanning for files; I will resume scanning in next step.", $this->currentDirectory));

			// Return immediately, marking that we are not done yet!
			return;
		}

		// Error control
		if (!is_null($exception))
		{
			throw $exception;
		}

		// Do I have an unreadable directory?
		if ($fileList === false)
		{
			$this->logger->warning('Unreadable directory ' . $this->currentDirectory);

			$this->hasScannedFiles = true;
		}
		// Directory was readable, process the file list
		elseif (is_array($fileList) && !empty($fileList))
		{
			// Scan all directory entries
			foreach ($fileList as $fileName)
			{
				$skipThisFile = $this->filter->isExcludedFile($fileName);

				if ($skipThisFile)
				{
					$this->logger->info(sprintf("Skipping file %s", $this->currentDirectory));

					continue;
				}

				$this->fileQueue[] = $fileName;
			}
		}

		// If the scanner engine nullified the next position we are done scanning for files
		$filePosition = $this->session->get('filePosition', null);

		if (is_null($filePosition))
		{
			$this->hasScannedFiles = true;
		}

		return;
	}

	/**
	 * Scans a folder for files or folders
	 *
	 * @param   string  $scanFolder  Return the contents of this folder
	 * @param   bool    $folders     True to return contained folders, false to return contained files
	 *
	 * @return  array
	 */
	protected function scanContents($scanFolder, $folders = true)
	{
		$listing = [];

		// Sanity checks
		if (!is_dir($scanFolder) && !is_dir($scanFolder . '/'))
		{
			throw new WarningException('Cannot list contents of directory ' . $scanFolder . ' -- PHP reports it as not a folder.');
		}

		if (!@is_readable($scanFolder))
		{
			throw new WarningException('Cannot list contents of directory ' . $scanFolder . ' -- PHP reports it as not readable.');
		}

		try
		{
			$di = new DirectoryIterator($scanFolder);
		}
		catch (Exception $e)
		{
			throw new WarningException('Cannot list contents of directory ' . $scanFolder . ' -- PHP\'s DirectoryIterator reports the path cannot be opened.', 0, $e);
		}

		// Do I need to resume scanning from a position other than the beginning of the list?
		$sessionKey = ($folders ? 'dir' : 'file') . 'Position';
		$position   = $this->session->get($sessionKey, null);

		if (!empty($position))
		{
			$di->seek($position);

			if ($di->key() != $position)
			{
				$position = null;

				return $listing;
			}
		}

		// Get the maximum number of files / folders to scan
		$maxCounter = $this->configuration->get(sprintf("%sThreshold", $folders ? 'dir' : 'file'));
		$counter    = 0;

		while ($di->valid())
		{
			/**
			 * If the directory entry is a link pointing somewhere outside the allowed directories per open_basedir we
			 * will get a RuntimeException (tested on PHP 5.3 onwards). Catching it lets us report the link as
			 * unreadable without suffering a PHP Fatal Error.
			 */
			try
			{
				if ($di->isDot())
				{
					$di->next();

					continue;
				}

				$di->isLink();
			}
			catch (RuntimeException $e)
			{
				if (!in_array($di->getFilename(), ['.', '..']))
				{
					$this->logger->warning(sprintf("Link %s is inaccessible. Check the open_basedir restrictions in your server's PHP configuration", $di->getPathname()));
				}

				$di->next();

				continue;
			}

			if ($di->isDot())
			{
				$di->next();

				continue;
			}

			if ($di->isDir() != $folders)
			{
				$di->next();

				continue;
			}

			$counter++;
			$addToListing = true;

			if (!$folders)
			{
				$addToListing = !$this->filter->isExcludedByExtension($di->getBasename());
			}

			if ($addToListing)
			{
				$listing[] = Filesystem::normalizePath($scanFolder . '/' . $di->getFilename());
			}

			if ($counter == $maxCounter)
			{
				break;
			}

			$di->next();
		}

		// Determine the new value for the position
		$di->next();

		// Update our position within the DirectoryIterator
		$this->session->set($sessionKey, $di->valid() ? ($di->key() - 1) : null);

		return $listing;
	}

}

© 2025 Cubjrnet7