├── .coveralls.yml ├── .phive └── phars.xml ├── .run └── Xdebug.run.xml ├── CHANGELOG.md ├── CONTRIBUTERS.md ├── CONTRIBUTING.md ├── Classes ├── Command │ ├── BuildQueueCommand.php │ ├── FlushQueueCommand.php │ └── ProcessQueueCommand.php ├── Configuration │ └── ExtensionConfigurationProvider.php ├── ContextMenu │ └── ItemProvider.php ├── Controller │ ├── Backend │ │ ├── AbstractBackendModuleController.php │ │ ├── BackendModuleControllerInterface.php │ │ ├── BackendModuleCrawlerLogController.php │ │ ├── BackendModuleCrawlerProcessController.php │ │ ├── BackendModuleStartCrawlingController.php │ │ └── Helper │ │ │ ├── ResultHandler.php │ │ │ └── UrlBuilder.php │ └── CrawlerController.php ├── Converter │ └── JsonCompatibilityConverter.php ├── CrawlStrategy │ ├── CallbackExecutionStrategy.php │ ├── CrawlStrategyFactory.php │ ├── CrawlStrategyInterface.php │ ├── GuzzleExecutionStrategy.php │ └── SubProcessExecutionStrategy.php ├── Crawler.php ├── Domain │ ├── Model │ │ ├── Configuration.php │ │ ├── Process.php │ │ ├── ProcessCollection.php │ │ ├── Queue.php │ │ └── Reason.php │ └── Repository │ │ ├── ConfigurationRepository.php │ │ ├── ProcessRepository.php │ │ └── QueueRepository.php ├── Event │ ├── AfterQueueItemAddedEvent.php │ ├── AfterUrlAddedToQueueEvent.php │ ├── AfterUrlCrawledEvent.php │ ├── BeforeQueueItemAddedEvent.php │ ├── InvokeQueueChangeEvent.php │ └── ModifySkipPageEvent.php ├── EventListener │ ├── AfterQueueItemAddedEventListener.php │ └── ShouldUseCachedPageDataIfAvailableEventListener.php ├── Exception │ ├── CommandNotFoundException.php │ ├── CrawlerObjectException.php │ ├── ExtensionSettingsException.php │ ├── NoIndexFoundException.php │ ├── ProcessException.php │ └── TimeStampException.php ├── Helper │ └── Sleeper │ │ ├── NullSleeper.php │ │ ├── SleeperInterface.php │ │ └── SystemSleeper.php ├── Hooks │ ├── CrawlerHookInterface.php │ ├── DataHandlerHook.php │ └── ProcessCleanUpHook.php ├── Middleware │ ├── CrawlerInitialization.php │ └── FrontendUserAuthenticator.php ├── QueueExecutor.php ├── Service │ ├── BackendModuleLinkService.php │ ├── BackendModuleLogService.php │ ├── BackendModuleScriptUrlService.php │ ├── ConfigurationService.php │ ├── PageService.php │ ├── ProcessInstructionService.php │ ├── ProcessService.php │ ├── QueueService.php │ ├── UrlService.php │ └── UserService.php ├── Utility │ ├── HookUtility.php │ ├── MessageUtility.php │ ├── PhpBinaryUtility.php │ └── TcaUtility.php ├── Value │ ├── CrawlAction.php │ ├── QueueFilter.php │ └── QueueRow.php └── Writer │ └── FileWriter │ └── CsvWriter │ ├── CrawlerCsvWriter.php │ └── CsvWriterInterface.php ├── Configuration ├── Backend │ └── Modules.php ├── Extbase │ └── Persistence │ │ └── Classes.php ├── Icons.php ├── RequestMiddlewares.php ├── Services.yaml └── TCA │ └── tx_crawler_configuration.php ├── Documentation ├── Configuration │ ├── ConfigurationRecords │ │ └── Index.rst │ ├── Examples │ │ ├── Index.rst │ │ └── News │ │ │ ├── Index.rst │ │ │ ├── _NewsDetailEventListener.php │ │ │ ├── _page.tsconfig │ │ │ ├── _services.yaml │ │ │ └── _setup.typoscript │ ├── ExtensionManagerConfiguration │ │ └── Index.rst │ ├── HttpAuthentication │ │ └── Index.rst │ ├── Index.rst │ └── PageTsconfigReference(txCrawlercrawlercfg) │ │ ├── Index.rst │ │ ├── _page.tsconfig │ │ └── _paramSets_page.tsconfig ├── ExecutingTheQueue │ ├── BuildingAndExecutingQueueRightAway(fromCli) │ │ ├── Index.rst │ │ ├── _output_buildQueue_6_default.txt │ │ ├── _output_buildQueue_6_default_mode_exec.txt │ │ └── _output_buildQueue_6_default_mode_url.txt 
│ ├── ExecutingQueueWithCron-job │ │ └── Index.rst │ ├── Index.rst │ ├── RunViaBackend │ │ └── Index.rst │ └── RunningViaCommandController │ │ └── Index.rst ├── Features │ ├── AutomaticAddPagesToQueue │ │ └── Index.rst │ ├── Events │ │ ├── Index.rst │ │ ├── _AfterQueueItemAddedEventListener.php │ │ ├── _AfterQueueItemAddedEventListener_services.yaml │ │ ├── _AfterUrlAddedToQueueEventListener.php │ │ ├── _AfterUrlAddedToQueueEventListener_services.yaml │ │ ├── _AfterUrlCrawledEventListener.php │ │ ├── _AfterUrlCrawledEventListener_services.yaml │ │ ├── _BeforeQueueItemAddedEventListener.php │ │ ├── _BeforeQueueItemAddedEventListener_services.yaml │ │ ├── _InvokeQueueChangeEventListener.php │ │ ├── _InvokeQueueChangeEvent_services.yaml │ │ ├── _ModifySkipPageEventListener.php │ │ └── _ModifySkipPageEventListener_services.yaml │ ├── Hooks │ │ ├── Index.rst │ │ └── _PageVeto.php │ ├── Index.rst │ ├── MultiprocessSupport │ │ └── Index.rst │ ├── PollableProcessingInstructions │ │ └── Index.rst │ └── PriorityCrawling │ │ └── Index.rst ├── Images │ ├── backend_addfromcontextmenu.png │ ├── backend_clear_cache.png │ ├── backend_clear_cache_queue.png │ ├── backend_configuration_deployment.png │ ├── backend_configuration_queue.png │ ├── backend_configuration_settings.png │ ├── backend_configurationrecord_access.png │ ├── backend_configurationrecord_general.png │ ├── backend_crawler_seo_priority_v10.png │ ├── backend_crawler_seo_v10.png │ ├── backend_crawlerlog.png │ ├── backend_crawlerlog_recrawl.png │ ├── backend_info_php_error.png │ ├── backend_pendingurls.png │ ├── backend_php_path_configuration.png │ ├── backend_processlist.png │ ├── backend_processlist_add_process.png │ ├── backend_recrawl.png │ ├── backend_scheduler_overview.png │ ├── backend_scheduler_processqueue.png │ ├── backend_scheduler_record.png │ ├── backend_startcrawling.png │ ├── backend_startnewprocess.png │ ├── cli_addtoque.png │ ├── cli_processque.png │ ├── crawler_settings_processLimit.png │ └── ext_news_pagetree.png ├── Includes.rst.txt ├── Index.rst ├── Introduction │ └── Index.rst ├── Links │ └── Links.rst ├── Scheduler │ └── Index.rst ├── Sitemap.rst ├── Troubleshooting │ ├── Index.rst │ └── _htaccess.txt ├── UseCases │ ├── CacheWarmup │ │ ├── Index.rst │ │ └── _commands.bash │ ├── Index.rst │ └── IndexedSearch │ │ └── Index.rst └── guides.xml ├── LICENSE ├── Makefile ├── README.md ├── Resources ├── Private │ ├── Language │ │ ├── Backend.xlf │ │ ├── af.Backend.xlf │ │ ├── af.locallang.xlf │ │ ├── af.locallang_csh_tx_crawler_configuration.xlf │ │ ├── ar.Backend.xlf │ │ ├── ar.locallang.xlf │ │ ├── ar.locallang_csh_tx_crawler_configuration.xlf │ │ ├── ca.Backend.xlf │ │ ├── ca.locallang.xlf │ │ ├── ca.locallang_csh_tx_crawler_configuration.xlf │ │ ├── cs.Backend.xlf │ │ ├── cs.locallang.xlf │ │ ├── cs.locallang_csh_tx_crawler_configuration.xlf │ │ ├── da.Backend.xlf │ │ ├── da.locallang.xlf │ │ ├── da.locallang_csh_tx_crawler_configuration.xlf │ │ ├── de.Backend.xlf │ │ ├── de.locallang.xlf │ │ ├── de.locallang_csh_tx_crawler_configuration.xlf │ │ ├── el.Backend.xlf │ │ ├── el.locallang.xlf │ │ ├── el.locallang_csh_tx_crawler_configuration.xlf │ │ ├── es.Backend.xlf │ │ ├── es.locallang.xlf │ │ ├── es.locallang_csh_tx_crawler_configuration.xlf │ │ ├── fi.Backend.xlf │ │ ├── fi.locallang.xlf │ │ ├── fi.locallang_csh_tx_crawler_configuration.xlf │ │ ├── fr.Backend.xlf │ │ ├── fr.locallang.xlf │ │ ├── fr.locallang_csh_tx_crawler_configuration.xlf │ │ ├── he.Backend.xlf │ │ ├── he.locallang.xlf │ │ ├── 
he.locallang_csh_tx_crawler_configuration.xlf │ │ ├── hu.Backend.xlf │ │ ├── hu.locallang.xlf │ │ ├── hu.locallang_csh_tx_crawler_configuration.xlf │ │ ├── it.Backend.xlf │ │ ├── it.locallang.xlf │ │ ├── it.locallang_csh_tx_crawler_configuration.xlf │ │ ├── ja.Backend.xlf │ │ ├── ja.locallang.xlf │ │ ├── ja.locallang_csh_tx_crawler_configuration.xlf │ │ ├── ko.Backend.xlf │ │ ├── ko.locallang.xlf │ │ ├── ko.locallang_csh_tx_crawler_configuration.xlf │ │ ├── locallang.xlf │ │ ├── locallang_csh_tx_crawler_configuration.xlf │ │ ├── nl.Backend.xlf │ │ ├── nl.locallang.xlf │ │ ├── nl.locallang_csh_tx_crawler_configuration.xlf │ │ ├── no.Backend.xlf │ │ ├── no.locallang.xlf │ │ ├── no.locallang_csh_tx_crawler_configuration.xlf │ │ ├── pl.Backend.xlf │ │ ├── pl.locallang.xlf │ │ ├── pl.locallang_csh_tx_crawler_configuration.xlf │ │ ├── pt.Backend.xlf │ │ ├── pt.locallang.xlf │ │ ├── pt.locallang_csh_tx_crawler_configuration.xlf │ │ ├── ro.Backend.xlf │ │ ├── ro.locallang.xlf │ │ ├── ro.locallang_csh_tx_crawler_configuration.xlf │ │ ├── ru.Backend.xlf │ │ ├── ru.locallang.xlf │ │ ├── ru.locallang_csh_tx_crawler_configuration.xlf │ │ ├── sr.Backend.xlf │ │ ├── sr.locallang.xlf │ │ ├── sr.locallang_csh_tx_crawler_configuration.xlf │ │ ├── sv.Backend.xlf │ │ ├── sv.locallang.xlf │ │ ├── sv.locallang_csh_tx_crawler_configuration.xlf │ │ ├── tr.Backend.xlf │ │ ├── tr.locallang.xlf │ │ ├── tr.locallang_csh_tx_crawler_configuration.xlf │ │ ├── uk.Backend.xlf │ │ ├── uk.locallang.xlf │ │ ├── uk.locallang_csh_tx_crawler_configuration.xlf │ │ ├── vi.Backend.xlf │ │ ├── vi.locallang.xlf │ │ ├── vi.locallang_csh_tx_crawler_configuration.xlf │ │ ├── zh.Backend.xlf │ │ ├── zh.locallang.xlf │ │ └── zh.locallang_csh_tx_crawler_configuration.xlf │ ├── Layouts │ │ └── BackendModule.html │ ├── Php │ │ └── Libraries │ │ │ └── composer.json │ └── Templates │ │ └── Backend │ │ ├── ProcessOverview.html │ │ ├── ShowCrawlerInformation.html │ │ └── ShowLog.html └── Public │ ├── Css │ └── backend_crawler.css │ └── Icons │ ├── Extension.svg │ ├── bullet_green.svg │ ├── bullet_orange.svg │ ├── bullet_red.svg │ ├── crawler_configuration.svg │ ├── crawler_start.svg │ └── crawler_stop.svg ├── SECURITY.md ├── cli ├── bootstrap.php └── conf.php ├── composer.json ├── ext_conf_template.txt ├── ext_emconf.php ├── ext_localconf.php └── ext_tables.sql /.coveralls.yml: -------------------------------------------------------------------------------- 1 | coverage_clover: "*-coverage.clover" 2 | -------------------------------------------------------------------------------- /.phive/phars.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /.run/Xdebug.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /CONTRIBUTERS.md: -------------------------------------------------------------------------------- 1 | # Contributers 2 | 3 | List of contributers to the Crawler TYPO3 V9 Compatibility. 4 | 5 | Adding the name to the list is optional, email as well if you want your name on the list. 6 | Thanks for helping out. 7 | 8 | PS: Please add in alphabetical order. 
9 | 10 | * Benni Mack 11 | * Sebastian Mazza 12 | * Chris Müller 13 | * Tizian Schmidlin 14 | * Tobias Stahn 15 | * Tomas Norre Mikkelsen 16 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ### Contributing 2 | 3 | When you have a PR, please run the following checks first. 4 | 5 | * `composer test:all` 6 | * Requires a MySQL database; you can boot one with `docker-compose` from the `.Docker` directory 7 | * `composer cs-fix` 8 | * Ensures that coding standards are respected 9 | * `composer analyse` 10 | * Will run PHPStan and do a static code analysis; this is not fully adjusted in the build yet, but please try to avoid adding new violations. ;) 11 | 12 | ### Writing documentation 13 | 14 | You can render the documentation in this extension with the command 15 | 16 | ``` 17 | make docs 18 | ``` 19 | 20 | #### Devbox 21 | 22 | If you don't already have a setup where you can do development, bugfixing etc. for the crawler, don't worry. 23 | 24 | We have included a [ddev](https://www.ddev.com) devbox to help with development. 25 | 26 | ##### Prerequisites 27 | 28 | * [DDEV](https://www.ddev.com) 29 | * Docker 30 | 31 | ##### How to use the devbox? 32 | 33 | ```shell script 34 | $ git clone git@github.com:tomasnorre/crawler.git 35 | $ cd .devbox 36 | $ ddev start 37 | ``` 38 | 39 | Username/password: `admin`/`password` 40 | 41 | And start working. 42 | 43 | **INFO** 44 | Xdebug is disabled by default to speed up the devbox when Xdebug isn't needed. 45 | 46 | This can be activated with `ddev xdebug on`. 47 | 48 | #### Running tests without local development environment 49 | If you don't have `php` and/or `composer` installed on your host machine, 50 | you can run the tests from within the `ddev` Docker container. 51 | 52 | To do that, go into the `.devbox` folder and run `ddev ssh`. 53 | From there you need to change into the `/public/typo3conf/ext/crawler` folder 54 | and run `composer` commands from there (see above). 55 | -------------------------------------------------------------------------------- /Classes/Command/FlushQueueCommand.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Domain\Repository\QueueRepository; 23 | use AOE\Crawler\Value\QueueFilter; 24 | use Symfony\Component\Console\Command\Command; 25 | use Symfony\Component\Console\Input\InputArgument; 26 | use Symfony\Component\Console\Input\InputInterface; 27 | use Symfony\Component\Console\Output\OutputInterface; 28 | use TYPO3\CMS\Core\Utility\GeneralUtility; 29 | 30 | /** 31 | * @internal since v12.0.0 32 | */ 33 | class FlushQueueCommand extends Command 34 | { 35 | protected function configure(): void 36 | { 37 | $this->setDescription('Remove queue entries and perform a cleanup'); 38 | 39 | $this->setHelp( 40 | 'Try "typo3 help crawler:flushQueue" to see your options' . chr(10) . chr(10) .
41 | 'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module; 42 | It will remove queue entries and perform a cleanup.' . chr(10) . chr(10) . 43 | ' 44 | Examples: 45 | --- Remove all finished queue-entries 46 | $ typo3 crawler:flushQueue finished 47 | 48 | --- Remove all pending queue-entries 49 | $ typo3 crawler:flushQueue pending 50 | ' 51 | ); 52 | $this->addArgument('mode', InputArgument::REQUIRED, 'What to clear: all, finished, pending'); 53 | } 54 | 55 | /** 56 | * Crawler Command - Cleaning up the queue. 57 | * 58 | * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module; 59 | * It will remove queue entries and perform a cleanup. 60 | * 61 | * Examples: 62 | * 63 | * --- Remove all finished queue-entries 64 | * $ typo3 crawler:flushQueue finished 65 | * 66 | * --- Remove all pending queue-entries for all pages 67 | * $ typo3 crawler:flushQueue pending 68 | */ 69 | protected function execute(InputInterface $input, OutputInterface $output): int 70 | { 71 | $queueFilter = new QueueFilter($input->getArgument('mode')); 72 | 73 | /** @var QueueRepository $queueRepository */ 74 | $queueRepository = GeneralUtility::makeInstance(QueueRepository::class); 75 | 76 | switch ($queueFilter) { 77 | case 'all': 78 | $queueRepository->flushQueue($queueFilter); 79 | $output->writeln('All entries in Crawler queue have been flushed'); 80 | break; 81 | case 'finished': 82 | case 'pending': 83 | $queueRepository->flushQueue($queueFilter); 84 | $output->writeln( 85 | 'All entries in Crawler queue with status "' . $queueFilter . '" have been flushed' 86 | ); 87 | break; 88 | default: 89 | $output->writeln( 90 | 'No matching parameters found.' . PHP_EOL . 'Try "typo3 help crawler:flushQueue" to see your options' 91 | ); 92 | break; 93 | } 94 | 95 | return Command::SUCCESS; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /Classes/Configuration/ExtensionConfigurationProvider.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use Psr\Log\LoggerAwareInterface; 23 | use Psr\Log\LoggerAwareTrait; 24 | use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationExtensionNotConfiguredException; 25 | use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationPathDoesNotExistException; 26 | use TYPO3\CMS\Core\Configuration\ExtensionConfiguration; 27 | use TYPO3\CMS\Core\Utility\GeneralUtility; 28 | 29 | /** 30 | * @internal since v9.2.5 31 | */ 32 | class ExtensionConfigurationProvider implements LoggerAwareInterface 33 | { 34 | use LoggerAwareTrait; 35 | 36 | /** 37 | * Return full extension configuration array. 
38 | */ 39 | public function getExtensionConfiguration(): array 40 | { 41 | try { 42 | return GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('crawler'); 43 | } catch (ExtensionConfigurationExtensionNotConfiguredException|ExtensionConfigurationPathDoesNotExistException $e) { 44 | $this->logger?->error($e->getMessage()); 45 | } 46 | return []; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /Classes/ContextMenu/ItemProvider.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Domain\Repository\ConfigurationRepository; 23 | use TYPO3\CMS\Backend\ContextMenu\ItemProviders\AbstractProvider; 24 | use TYPO3\CMS\Backend\Utility\BackendUtility; 25 | use TYPO3\CMS\Core\Utility\GeneralUtility; 26 | 27 | /** 28 | * Provides a ContextMenu item 29 | * @internal since v9.2.5 30 | */ 31 | class ItemProvider extends AbstractProvider 32 | { 33 | /** 34 | * @var array 35 | */ 36 | protected $itemsConfiguration = [ 37 | 'crawler' => [ 38 | 'type' => 'item', 39 | 'label' => 'LLL:EXT:crawler/Resources/Private/Language/Backend.xlf:contextMenu.label', 40 | 'iconIdentifier' => 'tx-crawler', 41 | 'callbackAction' => 'crawler', 42 | ], 43 | ]; 44 | 45 | /** 46 | * Item is added only for crawler configurations 47 | */ 48 | public function canHandle(): bool 49 | { 50 | return $this->table === ConfigurationRepository::TABLE_NAME; 51 | } 52 | 53 | /** 54 | * This needs to be lower than the priority of the RecordProvider 55 | */ 56 | public function getPriority(): int 57 | { 58 | return 50; 59 | } 60 | 61 | /** 62 | * Adds the crawler info 63 | */ 64 | public function addItems(array $items): array 65 | { 66 | $localItems = $this->prepareItems($this->itemsConfiguration); 67 | return $items + $localItems; 68 | } 69 | 70 | protected function getAdditionalAttributes(string $itemName): array 71 | { 72 | $crawlerConfiguration = BackendUtility::getRecordWSOL($this->table, (int) $this->identifier); 73 | if ($crawlerConfiguration === null) { 74 | return []; 75 | } 76 | 77 | if (!array_key_exists('name', $crawlerConfiguration)) { 78 | $crawlerConfiguration['name'] = 'No Name found in configuration'; 79 | } 80 | 81 | $additionalParameters = []; 82 | $additionalParameters[] = 'SET[function]=AOE\Crawler\Backend\BackendModule'; 83 | $additionalParameters[] = 'SET[crawlaction]=start'; 84 | $additionalParameters[] = 'configurationSelection[]=' . $crawlerConfiguration['name']; 85 | return [ 86 | 'data-dispatch-action' => 'TYPO3.ModuleMenu.showModule', 87 | 'data-dispatch-args-list' => 'web_site_crawler_start,&' . GeneralUtility::quoteJSvalue( 88 | '&' . implode('&', $additionalParameters) 89 | ), 90 | ]; 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /Classes/Controller/Backend/Helper/ResultHandler.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension.
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Converter\JsonCompatibilityConverter; 23 | use TYPO3\CMS\Core\Utility\GeneralUtility; 24 | 25 | /** 26 | * @internal since v9.2.5 27 | */ 28 | class ResultHandler 29 | { 30 | /** 31 | * Extract the log information from the current row and retrieve it as formatted string. 32 | */ 33 | public static function getResultLog(array $resultRow): string 34 | { 35 | $content = ''; 36 | if (is_array($resultRow) && array_key_exists('result_data', $resultRow)) { 37 | $requestContent = self::getJsonCompatibilityConverter()->convert($resultRow['result_data']) ?: []; 38 | if (is_bool($requestContent) || !array_key_exists('content', $requestContent)) { 39 | return $content; 40 | } 41 | $requestResult = self::getJsonCompatibilityConverter()->convert($requestContent['content']); 42 | 43 | if (is_array($requestResult) && array_key_exists('log', $requestResult)) { 44 | $content = implode(chr(10), $requestResult['log']); 45 | } 46 | } 47 | return $content; 48 | } 49 | 50 | public static function getResStatus(array|bool $requestContent): string 51 | { 52 | if (empty($requestContent)) { 53 | return '-'; 54 | } 55 | if (is_bool($requestContent) || !array_key_exists('content', $requestContent)) { 56 | return 'Content index does not exists in requestContent array'; 57 | } 58 | 59 | $requestResult = self::getJsonCompatibilityConverter()->convert($requestContent['content']); 60 | if (is_array($requestResult)) { 61 | if (empty($requestResult['errorlog'])) { 62 | return 'OK'; 63 | } 64 | return implode("\n", $requestResult['errorlog']); 65 | } 66 | 67 | return 'Error - no info, sorry!'; 68 | } 69 | 70 | /** 71 | * Find Fe vars 72 | */ 73 | public static function getResFeVars(array $resultData): array 74 | { 75 | if (empty($resultData)) { 76 | return []; 77 | } 78 | $requestResult = self::getJsonCompatibilityConverter()->convert($resultData['content']); 79 | if (is_bool($requestResult)) { 80 | return []; 81 | } 82 | return $requestResult['vars'] ?? []; 83 | } 84 | 85 | private static function getJsonCompatibilityConverter(): JsonCompatibilityConverter 86 | { 87 | return GeneralUtility::makeInstance(JsonCompatibilityConverter::class); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /Classes/Controller/Backend/Helper/UrlBuilder.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | use TYPO3\CMS\Backend\Routing\Exception\RouteNotFoundException; 22 | use TYPO3\CMS\Backend\Routing\UriBuilder; 23 | use TYPO3\CMS\Core\Http\Uri; 24 | use TYPO3\CMS\Core\Utility\GeneralUtility; 25 | 26 | /** 27 | * @internal since v9.2.5 28 | */ 29 | class UrlBuilder 30 | { 31 | /** 32 | * Returns the URL to the current module, including $_GET['id']. 33 | * 34 | * @param array $uriParameters optional parameters to add to the URL 35 | * 36 | * @throws RouteNotFoundException 37 | */ 38 | public static function getBackendModuleUrl(array $uriParameters = [], string $module = 'web_site_crawler'): Uri 39 | { 40 | $id = $GLOBALS['TYPO3_REQUEST']->getParsedBody()['id'] ?? $GLOBALS['TYPO3_REQUEST']->getQueryParams()['id'] ?? null; 41 | if ($id) { 42 | $uriParameters['id'] = $id; 43 | } 44 | /** @var UriBuilder $uriBuilder */ 45 | $uriBuilder = GeneralUtility::makeInstance(UriBuilder::class); 46 | return $uriBuilder->buildUriFromRoute($module, $uriParameters); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /Classes/Converter/JsonCompatibilityConverter.php: -------------------------------------------------------------------------------- 1 | 9 | * (c) 2023- Tomas Norre Mikkelsen 10 | * 11 | * This file is part of the TYPO3 Crawler Extension. 12 | * 13 | * It is free software; you can redistribute it and/or modify it under 14 | * the terms of the GNU General Public License, either version 2 15 | * of the License, or any later version. 16 | * 17 | * For the full copyright and license information, please read the 18 | * LICENSE.txt file that was distributed with this source code. 19 | * 20 | * The TYPO3 project - inspiring people to share! 21 | */ 22 | 23 | use Exception; 24 | 25 | /** 26 | * @internal since v9.2.5 27 | */ 28 | class JsonCompatibilityConverter 29 | { 30 | /** 31 | * This is implemented as we want to switch away from serialized data to json data, when the crawler is storing 32 | * in the database. To ensure that older crawler entries, which have already been stored as serialized data 33 | * still works, we have added this converter that can be used for the reading part. The writing part will be done 34 | * in json from now on. 35 | * @see https://github.com/tomasnorre/crawler/issues/417 36 | * 37 | * @throws Exception 38 | */ 39 | public function convert(string $dataString): array|bool 40 | { 41 | $decoded = ''; 42 | try { 43 | $decoded = json_decode($dataString, true, 512, JSON_THROW_ON_ERROR); 44 | } catch (\JsonException) { 45 | // Do nothing as we want to continue with unserialize as a test. 46 | } 47 | 48 | if (is_array($decoded)) { 49 | return $decoded; 50 | } 51 | 52 | try { 53 | $deserialized = unserialize($dataString, [ 54 | 'allowed_classes' => false, 55 | ]); 56 | } catch (\Throwable) { 57 | return false; 58 | } 59 | 60 | if (is_object($deserialized)) { 61 | throw new \RuntimeException('Objects are not allowed: ' . var_export($deserialized, true), 1_593_758_307); 62 | } 63 | 64 | if (is_array($deserialized)) { 65 | return $deserialized; 66 | } 67 | 68 | return false; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /Classes/CrawlStrategy/CallbackExecutionStrategy.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Controller\CrawlerController; 23 | use TYPO3\CMS\Core\Utility\GeneralUtility; 24 | 25 | /** 26 | * Used for hooks (e.g. crawling external files) 27 | * @internal since v12.0.0 28 | */ 29 | class CallbackExecutionStrategy 30 | { 31 | /** 32 | * In the future, the callback should implement an interface. 33 | */ 34 | public function fetchByCallback(string $callbackClassName, array $parameters, CrawlerController $crawlerController) 35 | { 36 | // Calling custom object 37 | $callBackObj = GeneralUtility::makeInstance($callbackClassName); 38 | return $callBackObj->crawler_execute($parameters, $crawlerController); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /Classes/CrawlStrategy/CrawlStrategyFactory.php: -------------------------------------------------------------------------------- 1 | configurationProvider = $configurationProvider ?? GeneralUtility::makeInstance( 20 | ExtensionConfigurationProvider::class 21 | ); 22 | } 23 | 24 | public function create(): CrawlStrategyInterface 25 | { 26 | $settings = $this->configurationProvider->getExtensionConfiguration(); 27 | $extensionSettings = is_array($settings) ? $settings : []; 28 | 29 | if ($extensionSettings['makeDirectRequests'] ?? false) { 30 | /** @var CrawlStrategyInterface $instance */ 31 | $instance = GeneralUtility::makeInstance(SubProcessExecutionStrategy::class, $this->configurationProvider); 32 | } else { 33 | $instance = GeneralUtility::makeInstance(GuzzleExecutionStrategy::class, $this->configurationProvider); 34 | } 35 | 36 | return $instance; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /Classes/CrawlStrategy/CrawlStrategyInterface.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use Psr\Http\Message\UriInterface; 23 | 24 | /** 25 | * @internal since v12.0.0 26 | */ 27 | interface CrawlStrategyInterface 28 | { 29 | public function fetchUrlContents(UriInterface $url, string $crawlerId); 30 | } 31 | -------------------------------------------------------------------------------- /Classes/CrawlStrategy/GuzzleExecutionStrategy.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 
15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use GuzzleHttp\Exception\ConnectException; 23 | use GuzzleHttp\Exception\RequestException; 24 | use Psr\Http\Message\ResponseInterface; 25 | use Psr\Http\Message\UriInterface; 26 | use Psr\Log\LoggerAwareInterface; 27 | use Psr\Log\LoggerAwareTrait; 28 | use TYPO3\CMS\Core\Http\Client\GuzzleClientFactory; 29 | use TYPO3\CMS\Core\Http\RequestFactory; 30 | use TYPO3\CMS\Core\Utility\GeneralUtility; 31 | 32 | /** 33 | * Calls Guzzle / CURL (based on TYPO3 settings) for fetching a URL. 34 | * @internal since v12.0.0 35 | */ 36 | class GuzzleExecutionStrategy implements LoggerAwareInterface, CrawlStrategyInterface 37 | { 38 | use LoggerAwareTrait; 39 | 40 | /** 41 | * Sets up a CURL / Guzzle Request for fetching the request. 42 | * 43 | * @return bool|mixed 44 | */ 45 | public function fetchUrlContents(UriInterface $url, string $crawlerId) 46 | { 47 | $reqHeaders = $this->buildRequestHeaders($crawlerId); 48 | 49 | $options = [ 50 | 'headers' => $reqHeaders, 51 | ]; 52 | if ($url->getUserInfo()) { 53 | $options['auth'] = explode(':', $url->getUserInfo()); 54 | } 55 | try { 56 | $url = (string) $url; 57 | $response = $this->getResponse($url, $options); 58 | return unserialize($response->getHeaderLine('X-T3Crawler-Meta')); 59 | } catch (RequestException $e) { 60 | $response = $e->getResponse(); 61 | $message = ($response ? $response->getStatusCode() : 0) 62 | . chr(32) 63 | . ($response ? $response->getReasonPhrase() : $e->getMessage()); 64 | 65 | $this->logger->debug( 66 | sprintf('Error while opening "%s" - ' . $message, $url), 67 | [ 68 | 'crawlerId' => $crawlerId, 69 | ] 70 | ); 71 | return $message; 72 | } catch (ConnectException $e) { 73 | $message = $e->getCode() . chr(32) . $e->getMessage(); 74 | 75 | $this->logger->debug( 76 | sprintf('Error while opening "%s" - ' . $message, $url), 77 | [ 78 | 'crawlerId' => $crawlerId, 79 | ] 80 | ); 81 | return $message; 82 | } 83 | } 84 | 85 | protected function getResponse(string $url, array $options): ResponseInterface 86 | { 87 | $guzzleClientFactory = GeneralUtility::makeInstance(GuzzleClientFactory::class); 88 | return GeneralUtility::makeInstance(RequestFactory::class, $guzzleClientFactory) 89 | ->request($url, 'GET', $options); 90 | } 91 | 92 | /** 93 | * Builds HTTP request headers. 94 | */ 95 | private function buildRequestHeaders(string $crawlerId): array 96 | { 97 | return [ 98 | 'Connection' => 'close', 99 | 'X-T3Crawler' => $crawlerId, 100 | 'User-Agent' => 'TYPO3 crawler', 101 | ]; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /Classes/Crawler.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | use TYPO3\CMS\Core\Core\Environment; 23 | use TYPO3\CMS\Core\SingletonInterface; 24 | use TYPO3\CMS\Core\Utility\GeneralUtility; 25 | 26 | /** 27 | * @internal since v9.2.5 28 | */ 29 | final class Crawler implements SingletonInterface 30 | { 31 | private readonly string $processFilename; 32 | 33 | public function __construct(?string $processFilename = null) 34 | { 35 | $this->processFilename = $processFilename ?: Environment::getVarPath() . '/lock/tx_crawler.proc'; 36 | $this->setDisabled(false); 37 | $pathInfo = pathinfo($this->processFilename); 38 | GeneralUtility::mkdir_deep($pathInfo['dirname']); 39 | } 40 | 41 | public function setDisabled(bool $disabled = true): void 42 | { 43 | if ($disabled) { 44 | GeneralUtility::writeFile($this->processFilename, ''); 45 | } elseif (is_file($this->processFilename)) { 46 | unlink($this->processFilename); 47 | } 48 | } 49 | 50 | public function isDisabled(): bool 51 | { 52 | return is_file($this->processFilename); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /Classes/Domain/Model/ProcessCollection.php: -------------------------------------------------------------------------------- 1 | 14 | * 15 | * All rights reserved 16 | * 17 | * This script is part of the TYPO3 project. The TYPO3 project is 18 | * free software; you can redistribute it and/or modify 19 | * it under the terms of the GNU General Public License as published by 20 | * the Free Software Foundation; either version 3 of the License, or 21 | * (at your option) any later version. 22 | * 23 | * The GNU General Public License can be found at 24 | * http://www.gnu.org/copyleft/gpl.html. 25 | * 26 | * This script is distributed in the hope that it will be useful, 27 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | * GNU General Public License for more details. 30 | * 31 | * This copyright notice MUST APPEAR in all copies of the script! 32 | ***************************************************************/ 33 | 34 | /** 35 | * @internal since v9.2.5 36 | */ 37 | class ProcessCollection extends \ArrayObject 38 | { 39 | /** 40 | * Method to retrieve an element from the collection. 41 | * @throws NoIndexFoundException 42 | */ 43 | public function offsetGet(mixed $index): Process 44 | { 45 | if (!parent::offsetExists($index)) { 46 | throw new NoIndexFoundException('Index "' . var_export( 47 | $index, 48 | true 49 | ) . 
'" for \AOE\Crawler\Domain\Model\Process are not available', 1_593_714_823); 50 | } 51 | return parent::offsetGet($index); 52 | } 53 | 54 | /** 55 | * Method to add an element to the collection- 56 | * 57 | * @param Process $subject 58 | * @throws InvalidArgumentException 59 | */ 60 | public function offsetSet(mixed $index, $subject): void 61 | { 62 | if (!$subject instanceof Process) { 63 | throw new \InvalidArgumentException( 64 | 'Wrong parameter type given, "\AOE\Crawler\Domain\Model\Process" expected!', 65 | 1_593_714_822 66 | ); 67 | } 68 | 69 | parent::offsetSet($index, $subject); 70 | } 71 | 72 | /** 73 | * Method to append an element to the collection 74 | * @param Process $subject 75 | * @throws InvalidArgumentException 76 | */ 77 | public function append($subject): void 78 | { 79 | if (!$subject instanceof Process) { 80 | throw new \InvalidArgumentException( 81 | 'Wrong parameter type given, "\AOE\Crawler\Domain\Model\Process" expected!', 82 | 1_593_714_821 83 | ); 84 | } 85 | 86 | parent::append($subject); 87 | } 88 | 89 | /** 90 | * returns array of process ids of the current collection 91 | * @return array 92 | */ 93 | public function getProcessIds() 94 | { 95 | $result = []; 96 | foreach ($this->getIterator() as $value) { 97 | $result[] = $value->getProcessId(); 98 | } 99 | return $result; 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /Classes/Domain/Repository/ConfigurationRepository.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use Doctrine\DBAL\ArrayParameterType; 23 | use TYPO3\CMS\Backend\Utility\BackendUtility; 24 | use TYPO3\CMS\Core\Database\ConnectionPool; 25 | use TYPO3\CMS\Core\Database\Query\QueryBuilder; 26 | use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction; 27 | use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction; 28 | use TYPO3\CMS\Core\Utility\GeneralUtility; 29 | use TYPO3\CMS\Extbase\Persistence\Repository; 30 | 31 | /** 32 | * @internal since v9.2.5 33 | */ 34 | class ConfigurationRepository extends Repository 35 | { 36 | final public const TABLE_NAME = 'tx_crawler_configuration'; 37 | 38 | /** 39 | * Traverses up the rootline of a page and fetches all crawler records. 
40 | */ 41 | public function getCrawlerConfigurationRecordsFromRootLine(int $pageId, array $parentIds = []): array 42 | { 43 | if (empty($parentIds)) { 44 | $pageIdsInRootLine = []; 45 | $rootLine = BackendUtility::BEgetRootLine($pageId); 46 | 47 | foreach ($rootLine as $pageInRootLine) { 48 | $pageIdsInRootLine[] = (int) $pageInRootLine['uid']; 49 | } 50 | } else { 51 | $pageIdsInRootLine = $parentIds; 52 | } 53 | 54 | $queryBuilder = $this->createQueryBuilder(); 55 | $queryBuilder 56 | ->getRestrictions()->removeAll() 57 | ->add(GeneralUtility::makeInstance(DeletedRestriction::class)) 58 | ->add(GeneralUtility::makeInstance(HiddenRestriction::class)); 59 | return $queryBuilder 60 | ->select('*') 61 | ->from(self::TABLE_NAME) 62 | ->where( 63 | $queryBuilder->expr()->in( 64 | 'pid', 65 | $queryBuilder->createNamedParameter($pageIdsInRootLine, ArrayParameterType::INTEGER) 66 | ) 67 | ) 68 | ->orderBy('name') 69 | ->executeQuery() 70 | ->fetchAllAssociative(); 71 | } 72 | 73 | protected function createQueryBuilder(): QueryBuilder 74 | { 75 | return GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(self::TABLE_NAME); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /Classes/Event/AfterQueueItemAddedEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | final class AfterQueueItemAddedEvent 26 | { 27 | /** 28 | * @param int|string $queueId 29 | */ 30 | public function __construct( 31 | private $queueId, 32 | private array $fieldArray 33 | ) { 34 | } 35 | 36 | public function getQueueId(): int|string 37 | { 38 | return $this->queueId; 39 | } 40 | 41 | public function getFieldArray(): array 42 | { 43 | return $this->fieldArray; 44 | } 45 | 46 | public function setFieldArray(array $fieldArray): void 47 | { 48 | $this->fieldArray = $fieldArray; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /Classes/Event/AfterUrlAddedToQueueEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | final class AfterUrlAddedToQueueEvent 26 | { 27 | public function __construct( 28 | private readonly string $uid, 29 | private readonly array $fieldArray 30 | ) { 31 | } 32 | 33 | public function getUid(): string 34 | { 35 | return $this->uid; 36 | } 37 | 38 | public function getFieldArray(): array 39 | { 40 | return $this->fieldArray; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Classes/Event/AfterUrlCrawledEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | final class AfterUrlCrawledEvent 26 | { 27 | public function __construct( 28 | private readonly string $url, 29 | private readonly array $result 30 | ) { 31 | } 32 | 33 | public function getUrl(): string 34 | { 35 | return $this->url; 36 | } 37 | 38 | public function getResult(): array 39 | { 40 | return $this->result; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Classes/Event/BeforeQueueItemAddedEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | final class BeforeQueueItemAddedEvent 26 | { 27 | public function __construct( 28 | private readonly int $queueId, 29 | private array $queueRecord 30 | ) { 31 | } 32 | 33 | public function getQueueId(): int 34 | { 35 | return $this->queueId; 36 | } 37 | 38 | public function getQueueRecord(): array 39 | { 40 | return $this->queueRecord; 41 | } 42 | 43 | public function setQueueRecord(array $queueRecord): void 44 | { 45 | $this->queueRecord = $queueRecord; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /Classes/Event/InvokeQueueChangeEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | use AOE\Crawler\Domain\Model\Reason; 23 | 24 | /** 25 | * @internal since v12.0.0 26 | */ 27 | final class InvokeQueueChangeEvent 28 | { 29 | public function __construct( 30 | private readonly Reason $reason 31 | ) { 32 | } 33 | 34 | public function getReasonDetailedText(): string 35 | { 36 | return $this->reason->getDetailText(); 37 | } 38 | 39 | public function getReasonText(): string 40 | { 41 | return $this->reason->getReason(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /Classes/Event/ModifySkipPageEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | final class ModifySkipPageEvent 26 | { 27 | private bool|string $skipped = false; 28 | 29 | public function __construct( 30 | private readonly array $pageRow 31 | ) { 32 | } 33 | 34 | public function isSkipped(): false|string 35 | { 36 | return $this->skipped; 37 | } 38 | 39 | public function setSkipped(false|string $skipped): void 40 | { 41 | $this->skipped = $skipped; 42 | } 43 | 44 | public function getPageRow(): array 45 | { 46 | return $this->pageRow; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /Classes/EventListener/AfterQueueItemAddedEventListener.php: -------------------------------------------------------------------------------- 1 | getConnectionForTable(QueueRepository::TABLE_NAME) 17 | ->update(QueueRepository::TABLE_NAME, $event->getFieldArray(), [ 18 | 'qid' => (int) $event->getQueueId(), 19 | ]); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Classes/EventListener/ShouldUseCachedPageDataIfAvailableEventListener.php: -------------------------------------------------------------------------------- 1 | getRequest()->getAttribute('tx_crawler') === null) { 18 | return; 19 | } 20 | $event->setShouldUseCachedPageData(false); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Classes/Exception/CommandNotFoundException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | class CommandNotFoundException extends \Exception 26 | { 27 | } 28 | -------------------------------------------------------------------------------- /Classes/Exception/CrawlerObjectException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | * @deprecated since 12.0.5 will be removed in v14.x 25 | */ 26 | class CrawlerObjectException extends \Exception 27 | { 28 | } 29 | -------------------------------------------------------------------------------- /Classes/Exception/ExtensionSettingsException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | class ExtensionSettingsException extends \Exception 26 | { 27 | } 28 | -------------------------------------------------------------------------------- /Classes/Exception/NoIndexFoundException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | class NoIndexFoundException extends \Exception 26 | { 27 | } 28 | -------------------------------------------------------------------------------- /Classes/Exception/ProcessException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | class ProcessException extends \Exception 26 | { 27 | } 28 | -------------------------------------------------------------------------------- /Classes/Exception/TimeStampException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | * @deprecated since 12.0.5 will be removed in v14.x 25 | */ 26 | class TimeStampException extends \Exception 27 | { 28 | } 29 | -------------------------------------------------------------------------------- /Classes/Helper/Sleeper/NullSleeper.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /* 23 | * @internal 24 | * @codeCoverageIgnore 25 | */ 26 | final class NullSleeper implements SleeperInterface 27 | { 28 | public function sleep(int $seconds): void 29 | { 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /Classes/Helper/Sleeper/SleeperInterface.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | interface SleeperInterface 26 | { 27 | public function sleep(int $seconds): void; 28 | } 29 | -------------------------------------------------------------------------------- /Classes/Helper/Sleeper/SystemSleeper.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | /* 23 | * @internal 24 | */ 25 | final class SystemSleeper implements SleeperInterface 26 | { 27 | public function sleep(int $seconds): void 28 | { 29 | \sleep($seconds); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /Classes/Hooks/CrawlerHookInterface.php: -------------------------------------------------------------------------------- 1 | 9 | * (c) 2021- Tomas Norre Mikkelsen 10 | * 11 | * This file is part of the TYPO3 Crawler Extension. 12 | * 13 | * It is free software; you can redistribute it and/or modify it under 14 | * the terms of the GNU General Public License, either version 2 15 | * of the License, or any later version. 16 | * 17 | * For the full copyright and license information, please read the 18 | * LICENSE.txt file that was distributed with this source code. 19 | * 20 | * The TYPO3 project - inspiring people to share! 21 | */ 22 | 23 | /** 24 | * @internal since v12.0.0 25 | */ 26 | interface CrawlerHookInterface 27 | { 28 | public function crawler_init(): void; 29 | } 30 | -------------------------------------------------------------------------------- /Classes/Hooks/DataHandlerHook.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Domain\Repository\QueueRepository; 23 | use AOE\Crawler\Service\QueueService; 24 | use TYPO3\CMS\Core\DataHandling\DataHandler; 25 | use TYPO3\CMS\Core\Domain\Repository\PageRepository; 26 | use TYPO3\CMS\Core\Utility\GeneralUtility; 27 | 28 | /** 29 | * @internal since v9.2.5 30 | */ 31 | class DataHandlerHook 32 | { 33 | /** 34 | * @noRector \Rector\DeadCode\Rector\ClassMethod\RemoveUnusedParameterRector 35 | */ 36 | public function addFlushedPagesToCrawlerQueue(array $parameters, DataHandler $dataHandler): void 37 | { 38 | $pageIdsToBeFlushedFromCache = $parameters['pageIdArray']; 39 | if (empty($pageIdsToBeFlushedFromCache)) { 40 | return; 41 | } 42 | foreach ($pageIdsToBeFlushedFromCache as $pageId) { 43 | $pageId = (int) $pageId; 44 | if ($pageId < 1 || empty($this->getPageRepository()->getPage($pageId))) { 45 | continue; 46 | } 47 | if ($this->getQueueRepository()->isPageInQueue($pageId)) { 48 | continue; 49 | } 50 | $this->getQueueService()->addPageToQueue($pageId); 51 | } 52 | } 53 | 54 | public function getQueueRepository(): QueueRepository 55 | { 56 | return GeneralUtility::makeInstance(QueueRepository::class); 57 | } 58 | 59 | public function getQueueService(): QueueService 60 | { 61 | return GeneralUtility::makeInstance(QueueService::class); 62 | } 63 | 64 | public function getPageRepository(): PageRepository 65 | { 66 | return GeneralUtility::makeInstance(PageRepository::class); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /Classes/QueueExecutor.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Controller\CrawlerController; 23 | use AOE\Crawler\Converter\JsonCompatibilityConverter; 24 | use AOE\Crawler\CrawlStrategy\CallbackExecutionStrategy; 25 | use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory; 26 | use AOE\Crawler\CrawlStrategy\CrawlStrategyInterface; 27 | use AOE\Crawler\Event\AfterUrlCrawledEvent; 28 | use TYPO3\CMS\Core\EventDispatcher\EventDispatcher; 29 | use TYPO3\CMS\Core\Http\Uri; 30 | use TYPO3\CMS\Core\SingletonInterface; 31 | use TYPO3\CMS\Core\Utility\GeneralUtility; 32 | 33 | /** 34 | * Fetches a URL based on the selected strategy or via a callback. 35 | * @internal since v9.2.5 36 | */ 37 | class QueueExecutor implements SingletonInterface 38 | { 39 | protected CrawlStrategyInterface $crawlStrategy; 40 | 41 | public function __construct( 42 | CrawlStrategyFactory $crawlStrategyFactory, 43 | private readonly EventDispatcher $eventDispatcher 44 | ) { 45 | $this->crawlStrategy = $crawlStrategyFactory->create(); 46 | } 47 | 48 | /** 49 | * Takes a queue record and fetches the contents of the URL. 50 | * In the future, updating the queue item & additional signal/slot/events should also happen in here. 51 | * 52 | * @return array|bool|mixed|string 53 | */ 54 | public function executeQueueItem(array $queueItem, CrawlerController $crawlerController) 55 | { 56 | $parameters = ''; 57 | if (isset($queueItem['parameters'])) { 58 | // Decode parameters: 59 | /** @var JsonCompatibilityConverter $jsonCompatibleConverter */ 60 | $jsonCompatibleConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class); 61 | $parameters = $jsonCompatibleConverter->convert($queueItem['parameters']); 62 | } 63 | 64 | if (!is_array($parameters) || empty($parameters)) { 65 | return 'ERROR'; 66 | } 67 | if (isset($parameters['_CALLBACKOBJ'])) { 68 | $className = $parameters['_CALLBACKOBJ']; 69 | unset($parameters['_CALLBACKOBJ']); 70 | $result = GeneralUtility::makeInstance(CallbackExecutionStrategy::class) 71 | ->fetchByCallback($className, $parameters, $crawlerController); 72 | $result = [ 73 | 'content' => json_encode($result), 74 | ]; 75 | } else { 76 | // Regular FE request 77 | $crawlerId = $this->generateCrawlerIdFromQueueItem($queueItem); 78 | 79 | $url = new Uri($parameters['url']); 80 | $result = $this->crawlStrategy->fetchUrlContents($url, $crawlerId); 81 | if ($result !== false) { 82 | $result = [ 83 | 'content' => json_encode($result), 84 | ]; 85 | $this->eventDispatcher->dispatch(new AfterUrlCrawledEvent($parameters['url'], $result)); 86 | } 87 | } 88 | return $result; 89 | } 90 | 91 | protected function generateCrawlerIdFromQueueItem(array $queueItem): string 92 | { 93 | return $queueItem['qid'] . ':' . md5( 94 | $queueItem['qid'] . '|' . $queueItem['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'] 95 | ); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /Classes/Service/BackendModuleScriptUrlService.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use Psr\Http\Message\ServerRequestInterface; 23 | use TYPO3\CMS\Backend\Routing\UriBuilder; 24 | use TYPO3\CMS\Core\Utility\GeneralUtility; 25 | 26 | class BackendModuleScriptUrlService 27 | { 28 | public function buildScriptUrl( 29 | ServerRequestInterface $request, 30 | string $elementName, 31 | int $pageUid, 32 | array $queryParameters, 33 | string $queryString = '' 34 | ): string { 35 | $mainParams = [ 36 | 'id' => $pageUid, 37 | ]; 38 | $uriBuilder = GeneralUtility::makeInstance(UriBuilder::class); 39 | $route = $request->getAttribute('route'); 40 | $scriptUrl = (string) $uriBuilder->buildUriFromRoute($route->getOption('_identifier'), $mainParams); 41 | 42 | return $scriptUrl . ($queryString . $this->getAdditionalQueryParams( 43 | $elementName, 44 | $queryParameters 45 | ) . '&' . $elementName . '=${value}'); 46 | } 47 | 48 | /* 49 | * Build query string with affected checkbox/dropdown value removed. 50 | */ 51 | private function getAdditionalQueryParams(string $keyToBeRemoved, array $queryParameters): string 52 | { 53 | $queryString = ''; 54 | unset($queryParameters[$keyToBeRemoved]); 55 | foreach ($queryParameters as $key => $value) { 56 | $queryString .= "&{$key}={$value}"; 57 | } 58 | return $queryString; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /Classes/Service/ProcessInstructionService.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use TYPO3\CMS\Core\Utility\GeneralUtility; 23 | 24 | /** 25 | * @internal since v11.0.3 26 | */ 27 | class ProcessInstructionService 28 | { 29 | public function isAllowed(string $processInstruction, array $incoming): bool 30 | { 31 | if (empty($incoming)) { 32 | return true; 33 | } 34 | 35 | foreach ($incoming as $pi) { 36 | if (GeneralUtility::inList($processInstruction, $pi)) { 37 | return true; 38 | } 39 | } 40 | return false; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Classes/Service/QueueService.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | use AOE\Crawler\Controller\CrawlerController; 23 | use TYPO3\CMS\Core\Domain\Repository\PageRepository; 24 | use TYPO3\CMS\Core\Utility\GeneralUtility; 25 | 26 | /** 27 | * @internal since v9.2.5 28 | */ 29 | class QueueService 30 | { 31 | private ?\AOE\Crawler\Controller\CrawlerController $crawlerController = null; 32 | 33 | public function injectCrawlerController(CrawlerController $crawlerController): void 34 | { 35 | $this->crawlerController = $crawlerController; 36 | $this->crawlerController->setID = GeneralUtility::md5int(microtime()); 37 | } 38 | 39 | public function addPageToQueue(int $pageUid, int $time = 0): void 40 | { 41 | if ($this->crawlerController === null) { 42 | return; 43 | } 44 | 45 | $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($pageUid, true); 46 | $configurations = $this->crawlerController->getUrlsForPageRow($pageData); 47 | // Currently this is only used from the DataHandlerHook, and we don't know of any allowed/disallowed configurations, 48 | // when clearing the cache, therefore we allow all configurations in this case. 49 | // This next lines could be skipped as it will return the incoming configurations, but for visibility and 50 | // later implementation it's kept as it do no harm. 51 | $allowedConfigurations = []; 52 | $configurations = ConfigurationService::removeDisallowedConfigurations($allowedConfigurations, $configurations); 53 | $downloadUrls = []; 54 | $duplicateTrack = []; 55 | 56 | if (is_array($configurations)) { 57 | foreach ($configurations as $configuration) { 58 | //enable inserting of entries 59 | $this->crawlerController->registerQueueEntriesInternallyOnly = false; 60 | $this->crawlerController->urlListFromUrlArray( 61 | $configuration, 62 | $pageData, 63 | $time, 64 | 300, 65 | true, 66 | false, 67 | $duplicateTrack, 68 | $downloadUrls, 69 | array_keys($this->getCrawlerProcInstructions()) 70 | ); 71 | 72 | //reset the queue because the entries have been written to the db 73 | unset($this->crawlerController->queueEntries); 74 | } 75 | } 76 | } 77 | 78 | /** 79 | * Reads the registered processingInstructions of the crawler 80 | */ 81 | private function getCrawlerProcInstructions(): array 82 | { 83 | $crawlerProcInstructions = []; 84 | if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) { 85 | foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $configuration) { 86 | $crawlerProcInstructions[$configuration['key']] = $configuration['value']; 87 | } 88 | } 89 | 90 | return $crawlerProcInstructions; 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /Classes/Service/UserService.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | use TYPO3\CMS\Core\Utility\GeneralUtility; 23 | 24 | /** 25 | * @internal since v9.2.5 26 | */ 27 | class UserService 28 | { 29 | public static function hasGroupAccess(string $groupList, string $accessList): bool 30 | { 31 | if (empty($accessList)) { 32 | return true; 33 | } 34 | foreach (explode(',', $groupList) as $groupUid) { 35 | if (GeneralUtility::inList($accessList, $groupUid)) { 36 | return true; 37 | } 38 | } 39 | return false; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /Classes/Utility/HookUtility.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Hooks\ProcessCleanUpHook; 23 | use Psr\Http\Message\ServerRequestInterface; 24 | use TYPO3\CMS\Core\Http\ApplicationType; 25 | 26 | /** 27 | * @codeCoverageIgnore 28 | * @internal since v9.2.5 29 | */ 30 | class HookUtility 31 | { 32 | /** 33 | * Registers hooks 34 | * 35 | * @param string $extKey 36 | */ 37 | public static function registerHooks($extKey): void 38 | { 39 | // Activating Crawler cli_hooks 40 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF'][$extKey]['cli_hooks'][] = 41 | ProcessCleanUpHook::class; 42 | 43 | // Activating refresh hooks 44 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF'][$extKey]['refresh_hooks'][] = 45 | ProcessCleanUpHook::class; 46 | 47 | // Env-dependent 48 | if (($GLOBALS['TYPO3_REQUEST'] ?? null) instanceof ServerRequestInterface 49 | && ApplicationType::fromRequest($GLOBALS['TYPO3_REQUEST'])->isBackend() 50 | ) { 51 | self::registerBackendHooks(); 52 | } 53 | } 54 | 55 | private static function registerBackendHooks(): void 56 | { 57 | // DataHandler clear page cache pre-processing 58 | $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['clearPageCacheEval'][] = 59 | "AOE\Crawler\Hooks\DataHandlerHook->addFlushedPagesToCrawlerQueue"; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /Classes/Utility/MessageUtility.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use TYPO3\CMS\Core\Messaging\FlashMessage; 23 | use TYPO3\CMS\Core\Messaging\FlashMessageService; 24 | use TYPO3\CMS\Core\Type\ContextualFeedbackSeverity; 25 | use TYPO3\CMS\Core\Utility\GeneralUtility; 26 | 27 | /** 28 | * @internal since v9.2.5 29 | */ 30 | class MessageUtility 31 | { 32 | /** 33 | * Add notice message to the user interface. 
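 * The message is rendered as a TYPO3 flash message with severity NOTICE, e.g. MessageUtility::addNoticeMessage('No entries in the crawler queue.');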
34 | */ 35 | public static function addNoticeMessage(string $message): void 36 | { 37 | self::addMessage($message, ContextualFeedbackSeverity::NOTICE); 38 | } 39 | 40 | /** 41 | * Add error message to the user interface. 42 | */ 43 | public static function addErrorMessage(string $message): void 44 | { 45 | self::addMessage($message, ContextualFeedbackSeverity::ERROR); 46 | } 47 | 48 | /** 49 | * Add error message to the user interface. 50 | */ 51 | public static function addWarningMessage(string $message): void 52 | { 53 | self::addMessage($message, ContextualFeedbackSeverity::WARNING); 54 | } 55 | 56 | /** 57 | * This method is used to add a message to the internal queue 58 | * 59 | * @param string $message the message itself 60 | * @param ContextualFeedbackSeverity $severity message level (0 = success (default), -1 = info, -2 = notice, 1 = warning, 2 = error) 61 | */ 62 | private static function addMessage( 63 | string $message, 64 | ContextualFeedbackSeverity $severity = ContextualFeedbackSeverity::OK 65 | ): void { 66 | $message = GeneralUtility::makeInstance(FlashMessage::class, $message, '', $severity); 67 | 68 | /** @var FlashMessageService $flashMessageService */ 69 | $flashMessageService = GeneralUtility::makeInstance(FlashMessageService::class); 70 | $flashMessageService->getMessageQueueByIdentifier()->addMessage($message); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /Classes/Utility/PhpBinaryUtility.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Configuration\ExtensionConfigurationProvider; 23 | use AOE\Crawler\Exception\CommandNotFoundException; 24 | use AOE\Crawler\Exception\ExtensionSettingsException; 25 | use TYPO3\CMS\Core\Utility\CommandUtility; 26 | use TYPO3\CMS\Core\Utility\GeneralUtility; 27 | 28 | /** 29 | * @internal since v9.2.5 30 | */ 31 | class PhpBinaryUtility 32 | { 33 | public static function getPhpBinary(): string 34 | { 35 | $extensionSettings = GeneralUtility::makeInstance( 36 | ExtensionConfigurationProvider::class 37 | )->getExtensionConfiguration(); 38 | 39 | if (empty($extensionSettings)) { 40 | throw new ExtensionSettingsException('ExtensionSettings are empty', 1_587_066_853); 41 | } 42 | 43 | if (empty($extensionSettings['phpPath'])) { 44 | $phpPath = CommandUtility::getCommand($extensionSettings['phpBinary']); 45 | if ($phpPath === false) { 46 | throw new CommandNotFoundException( 47 | 'The phpBinary: "' . $extensionSettings['phpBinary'] . '" could not be found!', 48 | 1_587_068_215 49 | ); 50 | } 51 | } else { 52 | $phpPath = $extensionSettings['phpPath']; 53 | } 54 | 55 | return $phpPath; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Classes/Utility/TcaUtility.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use TYPO3\CMS\Core\Utility\ExtensionManagementUtility; 23 | 24 | /** 25 | * @internal since v9.2.5 26 | */ 27 | class TcaUtility 28 | { 29 | /** 30 | * Get crawler processing instructions. 31 | * This function is called as a itemsProcFunc in tx_crawler_configuration.processing_instruction_filter 32 | * 33 | * @return array 34 | */ 35 | public function getProcessingInstructions(array $configuration) 36 | { 37 | if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] ?? null)) { 38 | foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $extensionKey => $extensionConfiguration) { 39 | $configuration['items'][] = [ 40 | 'label' => $extensionConfiguration['value'] . ' [' . $extensionConfiguration['key'] . ']', 41 | 'value' => $extensionConfiguration['key'], 42 | 'icon' => $this->getExtensionIcon($extensionKey), 43 | ]; 44 | } 45 | } 46 | 47 | return $configuration; 48 | } 49 | 50 | /** 51 | * Get path to ext_icon.gif from processing instruction key 52 | * 53 | * @param string $extensionKey Like staticfilecache or indexed_search 54 | * @return string 55 | */ 56 | private function getExtensionIcon($extensionKey) 57 | { 58 | return ExtensionManagementUtility::getExtensionIcon(ExtensionManagementUtility::extPath($extensionKey), true); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /Classes/Value/CrawlAction.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use Assert\Assert; 23 | 24 | /** 25 | * @internal since v9.2.5 26 | */ 27 | final class CrawlAction implements \Stringable 28 | { 29 | private readonly string $crawlAction; 30 | 31 | public function __construct(string $crawlAction) 32 | { 33 | Assert::that($crawlAction) 34 | ->inArray(['start', 'log', 'multiprocess']); 35 | 36 | $this->crawlAction = $crawlAction; 37 | } 38 | 39 | public function __toString(): string 40 | { 41 | return $this->crawlAction; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /Classes/Value/QueueFilter.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 
18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use Assert\Assert; 23 | 24 | /** 25 | * @internal since v9.2.5 26 | */ 27 | class QueueFilter implements \Stringable 28 | { 29 | private readonly string $queueFilter; 30 | 31 | public function __construct(string $queueFilter = 'all') 32 | { 33 | Assert::that($queueFilter) 34 | ->inArray(['all', 'pending', 'finished']); 35 | 36 | $this->queueFilter = $queueFilter; 37 | } 38 | 39 | public function __toString(): string 40 | { 41 | return $this->queueFilter; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /Classes/Value/QueueRow.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal 24 | */ 25 | class QueueRow 26 | { 27 | public string $pageTitleHTML = ''; 28 | public string $message = ''; 29 | public string $configurationKey = ''; 30 | public string $parameterConfig = ''; 31 | public string $valuesExpanded = ''; 32 | public string $urls = ''; 33 | public array $options = []; 34 | public string $parameters = ''; 35 | 36 | public function __construct( 37 | public string $pageTitle = '' 38 | ) { 39 | } 40 | 41 | public function setPageTitleHTML(string $pageTitleHTML): void 42 | { 43 | $this->pageTitleHTML = $pageTitleHTML; 44 | } 45 | 46 | public function setMessage(string $message): void 47 | { 48 | $this->message = $message; 49 | } 50 | 51 | public function setConfigurationKey(string $configurationKey): void 52 | { 53 | $this->configurationKey = $configurationKey; 54 | } 55 | 56 | public function setParameterConfig(string $parameterConfig): void 57 | { 58 | $this->parameterConfig = $parameterConfig; 59 | } 60 | 61 | public function setValuesExpanded(string $valuesExpanded): void 62 | { 63 | $this->valuesExpanded = $valuesExpanded; 64 | } 65 | 66 | public function setUrls(string $urls): void 67 | { 68 | $this->urls = $urls; 69 | } 70 | 71 | public function setOptions(array $options): void 72 | { 73 | $this->options = $options; 74 | } 75 | 76 | public function setParameters(string $parameters): void 77 | { 78 | $this->parameters = $parameters; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /Classes/Writer/FileWriter/CsvWriter/CrawlerCsvWriter.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | use TYPO3\CMS\Core\Utility\CsvUtility; 23 | 24 | /** 25 | * @internal since v9.2.5 26 | */ 27 | final class CrawlerCsvWriter implements CsvWriterInterface 28 | { 29 | private const CARRIAGE_RETURN = 13; 30 | private const LINE_FEED = 10; 31 | 32 | public function arrayToCsv(array $records): string 33 | { 34 | $csvLines = []; 35 | reset($records); 36 | 37 | $csvLines[] = $this->getRowHeaders($records); 38 | foreach ($records as $row) { 39 | $csvLines[] = CsvUtility::csvValues($row); 40 | } 41 | 42 | return implode(chr(self::CARRIAGE_RETURN) . chr(self::LINE_FEED), $csvLines); 43 | } 44 | 45 | private function getRowHeaders(array $lines): string 46 | { 47 | $fieldNames = array_keys(current($lines)); 48 | return CsvUtility::csvValues($fieldNames); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /Classes/Writer/FileWriter/CsvWriter/CsvWriterInterface.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | interface CsvWriterInterface 26 | { 27 | public function arrayToCsv(array $records): string; 28 | } 29 | -------------------------------------------------------------------------------- /Configuration/Backend/Modules.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * This file is part of the TYPO3 Crawler Extension. 9 | * 10 | * It is free software; you can redistribute it and/or modify it under 11 | * the terms of the GNU General Public License, either version 2 12 | * of the License, or any later version. 13 | * 14 | * For the full copyright and license information, please read the 15 | * LICENSE.txt file that was distributed with this source code. 16 | * 17 | * The TYPO3 project - inspiring people to share! 18 | */ 19 | 20 | use AOE\Crawler\Controller\Backend\BackendModuleCrawlerLogController; 21 | use AOE\Crawler\Controller\Backend\BackendModuleCrawlerProcessController; 22 | use AOE\Crawler\Controller\Backend\BackendModuleStartCrawlingController; 23 | 24 | return [ 25 | 'web_site_crawler' => [ 26 | 'parent' => 'web', 27 | 'position' => [ 28 | 'after' => 'web_info', 29 | ], 30 | 'access' => 'user', 31 | 'workspaces' => 'live', 32 | 'path' => '/module/page/crawler', 33 | 'labels' => 'LLL:EXT:crawler/Resources/Private/Language/Backend.xlf', 34 | 'extensionName' => 'Crawler', 35 | 'iconIdentifier' => 'tx-crawler-icon', 36 | 'routes' => [ 37 | '_default' => [ 38 | 'target' => BackendModuleCrawlerProcessController::class . '::handleRequest', 39 | ], 40 | ], 41 | ], 42 | 'web_site_crawler_start' => [ 43 | 'parent' => 'web_site_crawler', 44 | 'access' => 'user', 45 | 'path' => '/module/page/crawler/start', 46 | 'iconIdentifier' => 'crawler-start', 47 | 'labels' => [ 48 | 'title' => 'Start', 49 | ], 50 | 'routes' => [ 51 | '_default' => [ 52 | 'target' => BackendModuleStartCrawlingController::class . 
'::handleRequest', 53 | ], 54 | ], 55 | ], 56 | 'web_site_crawler_process' => [ 57 | 'parent' => 'web_site_crawler', 58 | 'access' => 'user', 59 | 'path' => '/module/page/crawler/process', 60 | 'iconIdentifier' => 'crawler-process', 61 | 'labels' => [ 62 | 'title' => 'Process', 63 | ], 64 | 'routes' => [ 65 | '_default' => [ 66 | 'target' => BackendModuleCrawlerProcessController::class . '::handleRequest', 67 | ], 68 | ], 69 | ], 70 | 'web_site_crawler_log' => [ 71 | 'parent' => 'web_site_crawler', 72 | 'access' => 'user', 73 | 'path' => '/module/page/crawler/log', 74 | 'iconIdentifier' => 'crawler-log', 75 | 'labels' => [ 76 | 'title' => 'Log', 77 | ], 78 | 'routes' => [ 79 | '_default' => [ 80 | 'target' => BackendModuleCrawlerLogController::class . '::handleRequest', 81 | ], 82 | ], 83 | ], 84 | ]; 85 | -------------------------------------------------------------------------------- /Configuration/Extbase/Persistence/Classes.php: -------------------------------------------------------------------------------- 1 | [ 7 | 'tableName' => 'tx_crawler_configuration', 8 | ], 9 | AOE\Crawler\Domain\Model\Process::class => [ 10 | 'tableName' => 'tx_crawler_process', 11 | ], 12 | AOE\Crawler\Domain\Model\Queue::class => [ 13 | 'tableName' => 'tx_crawler_queue', 14 | ], 15 | ]; 16 | -------------------------------------------------------------------------------- /Configuration/Icons.php: -------------------------------------------------------------------------------- 1 | [ 9 | 'provider' => SvgIconProvider::class, 10 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_configuration.svg', 11 | ], 12 | 'tx-crawler-start' => [ 13 | 'provider' => SvgIconProvider::class, 14 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_start.svg', 15 | ], 16 | 'tx-crawler-stop' => [ 17 | 'provider' => SvgIconProvider::class, 18 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_stop.svg', 19 | ], 20 | 'tx-crawler-icon' => [ 21 | 'provider' => SvgIconProvider::class, 22 | 'source' => 'EXT:crawler/Resources/Public/Icons/Extension.svg', 23 | ], 24 | ]; 25 | -------------------------------------------------------------------------------- /Configuration/RequestMiddlewares.php: -------------------------------------------------------------------------------- 1 | [ 10 | 'aoe/crawler/authentication' => [ 11 | 'target' => FrontendUserAuthenticator::class, 12 | 'after' => ['typo3/cms-frontend/authentication'], 13 | 'before' => ['typo3/cms-frontend/page-resolver'], 14 | ], 15 | 'aoe/crawler/initialization' => [ 16 | 'target' => CrawlerInitialization::class, 17 | 'before' => ['typo3/cms-frontend/prepare-tsfe-rendering'], 18 | ], 19 | ], 20 | ]; 21 | -------------------------------------------------------------------------------- /Documentation/Configuration/ConfigurationRecords/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _backend-configuration-record: 4 | 5 | ===================== 6 | Configuration records 7 | ===================== 8 | 9 | Formerly configuration was done by using pageTS (see below). This is 10 | still possible (fully backwards compatible) but not recommended. 11 | Instead of writing pageTS simply create a configuration record (table: 12 | ``tx_crawler_configuration``) and put it on the topmost page of the 13 | pagetree you want to affect with this configuration. 14 | 15 | The fields in these records are related to the pageTS keys described 16 | below. 17 | 18 | .. 
_backend-configuration-record-fields: 19 | 20 | Fields and their pageTS equivalents 21 | =================================== 22 | 23 | .. _backend-configuration-record-general: 24 | 25 | General 26 | ------- 27 | 28 | .. figure:: /Images/backend_configurationrecord_general.png 29 | :alt: Backend configuration record: General 30 | 31 | Backend configuration record: General 32 | 33 | Name 34 | Corresponds to the "key" part in the pageTS setup e.g. 35 | :typoscript:`tx_crawler.crawlerCfg.paramSets.myConfigurationKeyName` 36 | 37 | Protocol for crawling 38 | Force HTTP, HTTPS or keep the configured protocol 39 | 40 | Processing instruction filter 41 | List of processing instructions. See also: 42 | :ref:`paramSets.[key].procInstrFilter ` 43 | 44 | Base URL 45 | Set baseUrl (most likely the same as the entry point configured in your 46 | site configuration) 47 | 48 | Pids only 49 | List of Page Ids to limit this configuration to. See also: 50 | :ref:`paramSets.[key].pidsOnly ` 51 | 52 | Exclude pages 53 | Comma separated list of page ids which should not be crawled. 54 | You can do recursive exclusion by adding `uid`+`depth` e.g. 6+3, 55 | this will ensure that all pages including pageUid 6 and 3 levels down 56 | will not be crawled. 57 | 58 | Configuration 59 | Parameter configuration. The values of GET variables are according to a 60 | special syntax. See also: :ref:`paramSets.[key] 61 | ` 62 | 63 | Processing instruction parameters 64 | Options for processing instructions. Will be defined in the respective third 65 | party modules. See also: :ref:`paramSets.[key].procInstrParams 66 | ` 67 | 68 | Crawl with FE user groups 69 | User groups to set for the request. See also: 70 | :ref:`paramSets.[key].userGroups ` and the hint in :ref:`create-crawler-configuration` 71 | 72 | .. _backend-configuration-record-access: 73 | 74 | Access 75 | ------ 76 | 77 | .. figure:: /Images/backend_configurationrecord_access.png 78 | :alt: Backend configuration record: Access 79 | 80 | Backend configuration record: Access 81 | 82 | Hide 83 | If activated the configuration record is not taken into account. 84 | 85 | Restrict access to 86 | Restricts access to this configuration record to selected backend user 87 | groups. Empty means no restriction is set. 88 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _examples: 4 | 5 | ======== 6 | Examples 7 | ======== 8 | 9 | .. toctree:: 10 | :maxdepth: 5 11 | :titlesonly: 12 | :glob: 13 | 14 | News/Index 15 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _example-configuration-news: 4 | 5 | ======== 6 | EXT:news 7 | ======== 8 | 9 | The news extensions is one of the most used extensions in the TYPO3 CMS. This 10 | configuration is made under the assumption with a page tree looking similar to this: 11 | 12 | .. figure:: /Images/ext_news_pagetree.png 13 | :alt: Example Pagetree of EXT:news setup 14 | 15 | Example Pagetree of EXT:news setup 16 | 17 | If you want to have a Crawler Configuration that matches this, you can add 18 | following to the :guilabel:`PageTS` for PageId `56`. 19 | 20 | .. 
literalinclude:: _page.tsconfig 21 | :caption: packages/my_extension/Configuration/Sets/MySet/page.tsconfig 22 | 23 | Now you can add the News detail-view pages to the crawler queue and have them in 24 | the cache and the `indexed_search` index if you are using that. 25 | 26 | .. _example-configuration-news-category: 27 | 28 | Respecting Categories in News 29 | ============================= 30 | 31 | On some installations news is configured in such a way, that news of category A 32 | have their detail view on one page and news of category B have their detail view on 33 | another page. In this case it would still be possible to view news of category A on 34 | the detail page for category B (example.com/detail-page-for-category-B/news-of-category-A). 35 | That means that each news article would be crawled twice - once on the detail page 36 | for category A and once on the detail page for category B. It is possible to use a 37 | PSR-14 event with news to prevent this. 38 | 39 | On both detail pages include this typoscript setup: 40 | 41 | .. literalinclude:: _setup.typoscript 42 | :caption: packages/my_extension/Configuration/Sets/MySet/setup.typoscript 43 | 44 | and register an event listener in your site package. 45 | 46 | .. literalinclude:: _services.yaml 47 | :caption: packages/my_extension/Configuration/Services.yaml 48 | 49 | .. literalinclude:: _NewsDetailEventListener.php 50 | :caption: packages/my_extension/Classes/EventListeners/NewsDetailEventListener.php 51 | 52 | .. warning:: 53 | 54 | Note that this does more than just prevent articles from being indexed twice. It 55 | actually prevents articles from being displayed on a page that is supposed to show 56 | only articles of a certain category! 57 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/_NewsDetailEventListener.php: -------------------------------------------------------------------------------- 1 | getAssignedValues(); 14 | $newsItem = $assignedValues['newsItem']; 15 | $demand = $assignedValues['demand']; 16 | $settings = $assignedValues['settings']; 17 | 18 | if ($newsItem !== null) { 19 | $demandedCategories = $demand->getCategories(); 20 | $itemCategories = $newsItem->getCategories()->toArray(); 21 | $itemCategoryIds = \array_map(function ($category) { 22 | return (string) $category->getUid(); 23 | }, $itemCategories); 24 | 25 | if (count($demandedCategories) > 0 && !$this::itemMatchesCategoryDemand( 26 | $settings['categoryConjunction'], 27 | $itemCategoryIds, 28 | $demandedCategories 29 | )) { 30 | $assignedValues['newsItem'] = null; 31 | $event->setAssignedValues($assignedValues); 32 | } 33 | } 34 | } 35 | 36 | protected static function itemMatchesCategoryDemand( 37 | string $categoryConjunction, 38 | array $itemCategoryIds, 39 | array $demandedCategories 40 | ): bool { 41 | $numOfDemandedCategories = \count($demandedCategories); 42 | $intersection = \array_intersect($itemCategoryIds, $demandedCategories); 43 | $numOfCommonItems = \count($intersection); 44 | 45 | switch ($categoryConjunction) { 46 | case 'AND': 47 | return $numOfCommonItems === $numOfDemandedCategories; 48 | case 'OR': 49 | return $numOfCommonItems > 0; 50 | case 'NOTAND': 51 | return $numOfCommonItems < $numOfDemandedCategories; 52 | case 'NOTOR': 53 | return $numOfCommonItems === 0; 54 | } 55 | return true; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/_page.tsconfig: 
-------------------------------------------------------------------------------- 1 | tx_crawler.crawlerCfg.paramSets { 2 | tx_news = &tx_news_pi1[controller]=News&tx_news_pi1[action]=detail&tx_news_pi1[news]=[_TABLE:tx_news_domain_model_news; _PID:58; _WHERE: hidden = 0] 3 | tx_news { 4 | pidsOnly = 57 5 | } 6 | } 7 | 8 | # _PID:58 is the folder where news records are stored. 9 | # pidsOnly = 57 is the detail-view PageId. 10 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | MyVendor\MyExtension\EventListeners\NewsDetailEventListener: 3 | tags: 4 | - name: event.listener 5 | identifier: 'myNewsDetailListener' 6 | event: GeorgRinger\News\Event\NewsDetailActionEvent 7 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/_setup.typoscript: -------------------------------------------------------------------------------- 1 | plugin.tx_news.settings { 2 | # categories and categoryConjunction are not considered in detail view, so they must be overridden 3 | overrideFlexformSettingsIfEmpty = cropMaxCharacters,dateField,timeRestriction,archiveRestriction,orderBy,orderDirection,backPid,listPid,startingpoint,recursive,list.paginate.itemsPerPage,list.paginate.templatePath,categories,categoryConjunction 4 | # see the news extension for possible values of categoryConjunction 5 | categoryConjunction = AND 6 | categories = 7 | detail.errorHandling = pageNotFoundHandler 8 | } 9 | -------------------------------------------------------------------------------- /Documentation/Configuration/ExtensionManagerConfiguration/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _extension-manager-configuration: 4 | 5 | =============================== 6 | Extension Manager Configuration 7 | =============================== 8 | 9 | A number of options have been added to the extension manager configuration; 10 | these settings improve existing behaviour and enable new crawler features: 11 | 12 | .. figure:: /Images/backend_configuration_settings.png 13 | :alt: Backend configuration: Settings 14 | 15 | Backend configuration: Settings 16 | 17 | .. figure:: /Images/backend_configuration_queue.png 18 | :alt: Backend configuration: Queue 19 | 20 | Backend configuration: Queue 21 | -------------------------------------------------------------------------------- /Documentation/Configuration/HttpAuthentication/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _http-authentication: 4 | 5 | =================== 6 | HTTP Authentication 7 | =================== 8 | 9 | If you want to use HTTP Authentication, you need to configure your base URL 10 | to contain user:pass, for example: 11 | 12 | .. code-block:: text 13 | 14 | https://user:pass@www.mydomain.com/ 15 | -------------------------------------------------------------------------------- /Documentation/Configuration/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _configuration: 4 | 5 | ============= 6 | Configuration 7 | ============= 8 | 9 | .. 
toctree:: 10 | :maxdepth: 5 11 | :titlesonly: 12 | :glob: 13 | 14 | ExtensionManagerConfiguration/Index 15 | ConfigurationRecords/Index 16 | PageTsconfigReference(txCrawlercrawlercfg)/Index 17 | HttpAuthentication/Index 18 | Examples/Index 19 | -------------------------------------------------------------------------------- /Documentation/Configuration/PageTsconfigReference(txCrawlercrawlercfg)/_page.tsconfig: -------------------------------------------------------------------------------- 1 | tx_crawler.crawlerCfg.paramSets.test = &L=[0-3] 2 | tx_crawler.crawlerCfg.paramSets.test { 3 | procInstrFilter = tx_indexedsearch_reindex 4 | pidsOnly = 1,5,13,55 5 | userGroups = 1 6 | force_ssl = 1 7 | } 8 | -------------------------------------------------------------------------------- /Documentation/Configuration/PageTsconfigReference(txCrawlercrawlercfg)/_paramSets_page.tsconfig: -------------------------------------------------------------------------------- 1 | tx_crawler.crawlerCfg.paramSets { 2 | myConfigurationKeyName = &tx_myext[items]=[_TABLE:tt_myext_items;_PID:15;_WHERE: hidden = 0] 3 | myConfigurationKeyName { 4 | pidsOnly = 13 5 | procInstrFilter = tx_indexedsearch_reindex 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default.txt: -------------------------------------------------------------------------------- 1 | 38 entries found for processing. (Use "mode" to decide action): 2 | 3 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/overview 4 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/rich-text 5 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/headers 6 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/bullet-list 7 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser 8 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-and-icon 9 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-in-columns 10 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/list-group 11 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/panel 12 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/table 13 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/quote 14 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/media/audio 15 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/media/text-and-images 16 | ... 
17 | [10-04-20 10:36] https://crawler-devbox.ddev.site/content-examples/and-more/frames 18 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default_mode_exec.txt: -------------------------------------------------------------------------------- 1 | $ bin/typo3 crawler:buildQueue 6 default --depth 2 --mode exec 2 | https://crawler-devbox.ddev.site/content-examples/overview 3 | https://crawler-devbox.ddev.site/content-examples/text/rich-text 4 | https://crawler-devbox.ddev.site/content-examples/text/headers 5 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list 6 | https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser 7 | https://crawler-devbox.ddev.site/content-examples/text/text-and-icon 8 | https://crawler-devbox.ddev.site/content-examples/text/text-in-columns 9 | https://crawler-devbox.ddev.site/content-examples/text/list-group 10 | https://crawler-devbox.ddev.site/content-examples/text/panel 11 | ... 12 | Processing 13 | 14 | https://crawler-devbox.ddev.site/content-examples/overview () => 15 | 16 | OK: 17 | User Groups: 18 | 19 | https://crawler-devbox.ddev.site/content-examples/text/rich-text () => 20 | 21 | OK: 22 | User Groups: 23 | 24 | https://crawler-devbox.ddev.site/content-examples/text/headers () => 25 | 26 | OK: 27 | User Groups: 28 | 29 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list () => 30 | 31 | OK: 32 | User Groups: 33 | ... 34 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default_mode_url.txt: -------------------------------------------------------------------------------- 1 | $ bin/typo3 crawler:buildQueue 6 default --depth 2 --mode url 2 | https://crawler-devbox.ddev.site/content-examples/overview 3 | https://crawler-devbox.ddev.site/content-examples/text/rich-text 4 | https://crawler-devbox.ddev.site/content-examples/text/headers 5 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list 6 | https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser 7 | https://crawler-devbox.ddev.site/content-examples/text/text-and-icon 8 | https://crawler-devbox.ddev.site/content-examples/text/text-in-columns 9 | https://crawler-devbox.ddev.site/content-examples/text/list-group 10 | https://crawler-devbox.ddev.site/content-examples/text/panel 11 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/ExecutingQueueWithCron-job/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _with-crown: 4 | 5 | ============================= 6 | Executing queue with cron-job 7 | ============================= 8 | 9 | A "cron-job" refers to a script that runs on the server with time 10 | intervals. 11 | 12 | For this to become reality you must ideally have a cron-job set up. 13 | This assumes you are running on Unix architecture of some sort. The 14 | crontab is often edited by :bash:`crontab -e` and you should insert a line 15 | like this: 16 | 17 | .. code-block:: plaintext 18 | 19 | * * * * * vendor/bin/typo3 crawler:buildQueue > /dev/null 20 | 21 | This will run the script every minute. You should try to run the 22 | script on the command line first to make sure it runs without any 23 | errors. 
If it doesn't output anything it was successful. 24 | 25 | You will need to have a user called `_cli_` and you must have PHP installed 26 | as a CGI script as well in :path:`/usr/bin/`. 27 | 28 | The user `_cli_` is created by the framework on demand if it does not exist 29 | at the first command line call. 30 | 31 | Make sure that the user `_cli_` has admin-rights. 32 | 33 | In the :guilabel:`CLI status` menu of the :guilabel:`Site Crawler` info module 34 | you can see the status: 35 | 36 | .. figure:: /Images/backend_processlist.png 37 | :alt: Status page in the backend 38 | 39 | Status page in the backend 40 | 41 | This is how it looks just after you ran the script. (You can also see 42 | the full path to the script in the bottom - this is the path to the 43 | script as you should use it on the command line / in the crontab) 44 | 45 | If the cron-script stalls there is a default delay of 1 hour before a 46 | new process will announce the old one dead and run a new one. If a 47 | cron-script takes more than 1 minute and thereby overlaps the next 48 | process, the next process will NOT start if it sees that the "lock- 49 | file" exists (unless that hour has passed). 50 | 51 | The reason why it works like this is to make sure that overlapping 52 | calls to the crawler CLI script will not run parallel processes. So 53 | the second call will just exit if it finds in the status file that the 54 | process is already running. But of course a crashed script will fail 55 | to set the status to "end" and hence this situation can occur. 56 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _executing-the-queue-label: 4 | 5 | =================== 6 | Executing the queue 7 | =================== 8 | 9 | The idea of the queue is that a large number of tasks can be submitted 10 | to the queue and performed over longer time. This could be interesting 11 | for several reasons; 12 | 13 | - To spread server load over time. 14 | 15 | - To time the requests for nightly processing. 16 | 17 | - And simply to avoid `max_execution_time` of PHP to limit processing 18 | to 30 seconds! 19 | 20 | 21 | .. toctree:: 22 | :maxdepth: 5 23 | :titlesonly: 24 | :glob: 25 | 26 | RunningViaCommandController/Index 27 | ExecutingQueueWithCron-job/Index 28 | RunViaBackend/Index 29 | BuildingAndExecutingQueueRightAway(fromCli)/Index 30 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/RunViaBackend/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _run-backend: 4 | 5 | =============== 6 | Run via backend 7 | =============== 8 | 9 | To process the queue you must either set up a cron-job on your server 10 | or use the backend to process the queue: 11 | 12 | .. figure:: /Images/backend_processlist_add_process.png 13 | :alt: Process the queue via backend 14 | 15 | Process the queue via backend 16 | 17 | You can also (re-)crawl single URLs manually from within the :guilabel:`Crawler 18 | log` view in the info module: 19 | 20 | .. 
figure:: /Images/backend_crawlerlog_recrawl.png 21 | :alt: Crawl single URLs via backend 22 | 23 | Crawl single URLs via backend 24 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/RunningViaCommandController/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _command-controller: 4 | 5 | ========================== 6 | Run via command controller 7 | ========================== 8 | 9 | .. _command-controller-buildqueue: 10 | 11 | Create queue 12 | ------------ 13 | 14 | .. code-block:: bash 15 | :caption: replace vendor/bin/typo3 with your own cli runner 16 | 17 | $ vendor/bin/typo3 crawler:buildQueue [--depth ] [--number ] [--mode ] 18 | 19 | .. _command-controller-processqueue: 20 | 21 | Run queue 22 | --------- 23 | 24 | .. code-block:: bash 25 | :caption: replace vendor/bin/typo3 with your own cli runner 26 | 27 | $ vendor/bin/typo3 crawler:processQueue [--amount ] [--sleeptime ] [--sleepafter ] 28 | 29 | .. _command-controller-flushqueue: 30 | 31 | Flush queue 32 | ----------- 33 | 34 | .. code-block:: bash 35 | :caption: replace vendor/bin/typo3 with your own cli runner 36 | 37 | $ vendor/bin/typo3 crawler:flushQueue 38 | -------------------------------------------------------------------------------- /Documentation/Features/AutomaticAddPagesToQueue/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _add-to-queue: 4 | 5 | ==================================== 6 | Automatically add pages to the queue 7 | ==================================== 8 | 9 | .. versionadded:: 9.1.0 10 | 11 | .. _add-to-queue-edit: 12 | 13 | Edit Pages 14 | ---------- 15 | 16 | With this feature, pages are automatically added to the crawler queue 17 | when you edit content on them. If the change is made within a workspace, 18 | the page is not added to the queue until it is published. 19 | 20 | This gives you the advantage that you do not need to keep track 21 | of which pages you have edited; they are handled automatically by the next crawler 22 | process task, see :ref:`executing-the-queue-label`. This ensures that 23 | your cache or e.g. search index is always up to date and that end-users see 24 | the most current content as soon as possible. 25 | 26 | .. _add-to-queue-cache: 27 | 28 | Clear Single Page Cache 29 | ----------------------- 30 | 31 | As editing a page and clearing the page cache use the same DataHandler hooks, 32 | we get an additional feature for free: when you clear the page cache for a specific 33 | page, that page is also added automatically to the crawler queue. Again, this is 34 | processed during the next crawler process. 35 | 36 | .. figure:: /Images/backend_clear_cache.png 37 | :alt: Clearing the page cache 38 | 39 | Clearing the page cache 40 | 41 | .. 
figure:: /Images/backend_clear_cache_queue.png 42 | :alt: Page is added to the crawler queue 43 | 44 | Page is added to the crawler queue 45 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_AfterQueueItemAddedEventListener.php: -------------------------------------------------------------------------------- 1 | $afterUrl()); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_AfterUrlCrawledEventListener_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | AOE\Crawler\EventListener\AfterUrlCrawledEventListener: 3 | tags: 4 | - name: event.listener 5 | identifier: 'ext-extension-key/AfterUrlCrawledEventListener' 6 | event: AOE\Crawler\Event\AfterUrlCrawledEvent 7 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_BeforeQueueItemAddedEventListener.php: -------------------------------------------------------------------------------- 1 | getReasonText(); 14 | // You can implement different logic based on reason, GUI or CLI 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_InvokeQueueChangeEvent_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | AOE\Crawler\EventListener\InvokeQueueChangeEvent: 3 | tags: 4 | - name: event.listener 5 | identifier: 'ext-extension-key/InvokeQueueChangeEventListener' 6 | event: AOE\Crawler\Event\InvokeQueueChangeEvent 7 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_ModifySkipPageEventListener.php: -------------------------------------------------------------------------------- 1 | getPageRow()['uid'] === 42) { 14 | $modifySkipPageEvent->setSkipped('Page with uid "42" is excluded by ModifySkipPageEvent'); 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_ModifySkipPageEventListener_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | AOE\Crawler\EventListener\ModifySkipPageEventListener: 3 | tags: 4 | - name: event.listener 5 | identifier: 'ext-extension-key/ModifySkipPageEventListener' 6 | event: AOE\Crawler\Event\ModifySkipPageEvent 7 | -------------------------------------------------------------------------------- /Documentation/Features/Hooks/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _hooks: 4 | 5 | ===== 6 | Hooks 7 | ===== 8 | 9 | Register the following hooks in :file:`ext_localconf.php` of your extension. 10 | 11 | .. _hooks-excludeDoktype: 12 | 13 | excludeDoktype Hook 14 | =================== 15 | 16 | By adding doktype ids to following array you can exclude them from 17 | being crawled: 18 | 19 | .. code-block:: php 20 | :caption: packages/my_extension/ext_localconf.php 21 | 22 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'][] = 23 | 24 | .. _hooks-pageVeto: 25 | 26 | pageVeto Hook 27 | ============= 28 | 29 | .. deprecated:: 11.0.0 30 | Will be removed in 13.0, please migrate to the PSR-14 Event :ref:`psr14-modify-skip-page-event`! 
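As a rough migration sketch (the class name, file path and condition below are placeholders, not part of the crawler API; the event methods mirror the ModifySkipPageEvent listener example in the Events documentation), the veto logic of such a hook could be moved into a PSR-14 listener:

.. code-block:: php
   :caption: Hypothetical listener, e.g. packages/my_extension/Classes/EventListeners/PageVetoListener.php

   <?php

   declare(strict_types=1);

   namespace MyVendor\MyExtension\EventListeners;

   use AOE\Crawler\Event\ModifySkipPageEvent;

   // Sketch of a PSR-14 replacement for a pageVeto hook:
   // a veto is expressed by calling setSkipped() with a reason.
   final class PageVetoListener
   {
       public function __invoke(ModifySkipPageEvent $event): void
       {
           // Illustrative condition only - replace with your own veto logic.
           if (($event->getPageRow()['uid'] ?? 0) === 42) {
               $event->setSkipped('Page with uid "42" is excluded by PageVetoListener');
           }
       }
   }

The listener is then registered in :file:`Services.yaml` with ``event: AOE\Crawler\Event\ModifySkipPageEvent``, analogous to the ModifySkipPageEvent example in the Events documentation.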
31 | 32 | You can also decide in an individual user function whether a page should 33 | be crawled or not. Register your function here: 34 | 35 | .. code-block:: php 36 | :caption: packages/my_extension/ext_localconf.php 37 | 38 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'][] = MyVendor\MyExtension\Hooks\Crawler\PageVeto::class . '->excludePage'; 39 | 40 | .. literalinclude:: _PageVeto.php 41 | :caption: packages/my_extension/Classes/Hooks/Crawler/PageVeto.php 42 | -------------------------------------------------------------------------------- /Documentation/Features/Hooks/_PageVeto.php: -------------------------------------------------------------------------------- 1 | applicationData['tx_crawler']['success']['tx_staticpub'] = true; 33 | 34 | -------------------------------------------------------------------------------- /Documentation/Features/PriorityCrawling/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _priority-crawling: 4 | 5 | ================= 6 | Priority Crawling 7 | ================= 8 | 9 | .. versionadded:: 9.1.0 10 | 11 | Some websites have quite a large number of pages. Some pages are logically more 12 | important than others, e.g. the start, support or product pages. 13 | These important pages are also the pages where we want the best caching 14 | and performance, as they will most likely be the pages with the most changes and 15 | the most traffic. 16 | 17 | With TYPO3 10 LTS, `sysext/seo` introduced, among other things, the 18 | `sitemap_priority` field, which is used to generate an SEO-optimised sitemap.xml 19 | in which page priorities are listed as well. A page's priority will most likely be higher the 20 | more important the page is for you and the end-user. 21 | 22 | This logic is something that the crawler can benefit from as well. On a 23 | website with, let us say, 10,000 pages, the pages will have different importance. 24 | Therefore we have changed the functionality of the crawler 25 | to take the value of this field, ranging from 0.0 to 1.0, into consideration when 26 | processing the crawler queue. This means that a page with a high priority 27 | in your sitemap will also be crawled first when a new crawler process is 28 | started. 29 | 30 | This ensures that we always crawl the pages that have the highest importance to 31 | you and your end-user, based on your sitemap priority. We chose to 32 | reuse this field so that editors do not have to do more or less the same work twice. 33 | 34 | If you don't want to use this functionality, that's fine. You can just ignore the 35 | options that `sysext/seo` gives you; all pages will then get the default priority of 36 | 0.5 and will not influence the processing order, as every page has the 37 | same priority. 38 | 39 | The existing :guilabel:`SEO` tab is used to set priorities when editing 40 | pages. 41 | 42 | .. image:: /Images/backend_crawler_seo_v10.png 43 | 44 | ..
figure:: /Images/backend_crawler_seo_priority_v10.png 45 | :alt: The SEO tab will contain the sitemap_priority field 46 | 47 | The SEO tab will contain the sitemap_priority field 48 | -------------------------------------------------------------------------------- /Documentation/Images/backend_addfromcontextmenu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_addfromcontextmenu.png -------------------------------------------------------------------------------- /Documentation/Images/backend_clear_cache.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_clear_cache.png -------------------------------------------------------------------------------- /Documentation/Images/backend_clear_cache_queue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_clear_cache_queue.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configuration_deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configuration_deployment.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configuration_queue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configuration_queue.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configuration_settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configuration_settings.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configurationrecord_access.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configurationrecord_access.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configurationrecord_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configurationrecord_general.png -------------------------------------------------------------------------------- /Documentation/Images/backend_crawler_seo_priority_v10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_crawler_seo_priority_v10.png 
-------------------------------------------------------------------------------- /Documentation/Images/backend_crawler_seo_v10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_crawler_seo_v10.png -------------------------------------------------------------------------------- /Documentation/Images/backend_crawlerlog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_crawlerlog.png -------------------------------------------------------------------------------- /Documentation/Images/backend_crawlerlog_recrawl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_crawlerlog_recrawl.png -------------------------------------------------------------------------------- /Documentation/Images/backend_info_php_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_info_php_error.png -------------------------------------------------------------------------------- /Documentation/Images/backend_pendingurls.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_pendingurls.png -------------------------------------------------------------------------------- /Documentation/Images/backend_php_path_configuration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_php_path_configuration.png -------------------------------------------------------------------------------- /Documentation/Images/backend_processlist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_processlist.png -------------------------------------------------------------------------------- /Documentation/Images/backend_processlist_add_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_processlist_add_process.png -------------------------------------------------------------------------------- /Documentation/Images/backend_recrawl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_recrawl.png -------------------------------------------------------------------------------- /Documentation/Images/backend_scheduler_overview.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_scheduler_overview.png -------------------------------------------------------------------------------- /Documentation/Images/backend_scheduler_processqueue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_scheduler_processqueue.png -------------------------------------------------------------------------------- /Documentation/Images/backend_scheduler_record.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_scheduler_record.png -------------------------------------------------------------------------------- /Documentation/Images/backend_startcrawling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_startcrawling.png -------------------------------------------------------------------------------- /Documentation/Images/backend_startnewprocess.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_startnewprocess.png -------------------------------------------------------------------------------- /Documentation/Images/cli_addtoque.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/cli_addtoque.png -------------------------------------------------------------------------------- /Documentation/Images/cli_processque.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/cli_processque.png -------------------------------------------------------------------------------- /Documentation/Images/crawler_settings_processLimit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/crawler_settings_processLimit.png -------------------------------------------------------------------------------- /Documentation/Images/ext_news_pagetree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/ext_news_pagetree.png -------------------------------------------------------------------------------- /Documentation/Includes.rst.txt: -------------------------------------------------------------------------------- 1 | .. You can put central messages to display on all pages here 2 | -------------------------------------------------------------------------------- /Documentation/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. 
_start: 4 | 5 | ====================== 6 | Site Crawler Extension 7 | ====================== 8 | 9 | :Extension key: 10 | crawler 11 | 12 | :Package name: 13 | tomasnorre/crawler 14 | 15 | :Version: 16 | |release| 17 | 18 | :Language: 19 | en 20 | 21 | :Author: 22 | Tomas Norre Mikkelsen 23 | 24 | :Copyright: 25 | 2005-2021 AOE GmbH, since 2021 Tomas Norre Mikkelsen 26 | 27 | :License: 28 | This document is published under the `Open Content License 29 | `_. 30 | 31 | :Rendered: 32 | |today| 33 | 34 | ---- 35 | 36 | Libraries and scripts for crawling the TYPO3 page tree. Used for re-caching, re-indexing, publishing applications etc. 37 | 38 | ---- 39 | 40 | **Table of Contents:** 41 | 42 | .. toctree:: 43 | :maxdepth: 2 44 | :titlesonly: 45 | 46 | Introduction/Index 47 | Configuration/Index 48 | ExecutingTheQueue/Index 49 | Scheduler/Index 50 | UseCases/Index 51 | Features/Index 52 | Troubleshooting/Index 53 | Links/Links 54 | 55 | .. toctree:: 56 | :hidden: 57 | 58 | Sitemap 59 | -------------------------------------------------------------------------------- /Documentation/Introduction/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _introduction: 4 | 5 | ============ 6 | Introduction 7 | ============ 8 | 9 | .. _introduction-what: 10 | 11 | What does it do? 12 | ================ 13 | 14 | The TYPO3 Crawler is an extension that provides tooling, from both 15 | the TYPO3 backend and the CLI, that helps you with your cache and e.g. your 16 | search index. 17 | 18 | The Crawler implements several PSR-14 events that you can "hook" into 19 | if you have certain requirements for your site. 20 | 21 | See more :ref:`psr14-modify-skip-page-event`. 22 | 23 | It features an API that other extensions can plug into. An example of this 24 | is "indexed\_search", which uses the crawler to index content defined by 25 | its Indexing Configurations. Other extensions supporting it are 26 | "staticpub" (publishing to static pages) or "cachemgm" (allows 27 | recaching of pages). 28 | 29 | The requesting of URLs is specially designed to request TYPO3 frontends 30 | with special processing instructions. The requests send a 31 | TYPO3-specific header in the GET request which identifies a special action. 32 | For instance the action requested could be to publish the URL to a 33 | static file or it could be to index its content - or re-cache the 34 | page. These processing instructions are also defined by third-party 35 | extensions (and indexed search is one of them). In this way a 36 | processing instruction can instruct the frontend to perform an action 37 | (like indexing, publishing etc.) which cannot be done with a request 38 | from outside. 39 | 40 | .. _introduction-screenshots: 41 | 42 | Screenshots 43 | =========== 44 | 45 | The extension provides a backend module which displays the queue and log and 46 | allows execution and status checks of the "cronscript" from the backend for 47 | testing purposes. 48 | 49 | .. figure:: /Images/backend_processlist.png 50 | 51 | CLI status display 52 | 53 | CLI = Command Line Interface = shell script = cron script 54 | 55 | .. figure:: /Images/backend_crawlerlog.png 56 | 57 | Crawler queue (before processing) / log (after processing) 58 | 59 | .. figure:: /Images/backend_pendingurls.png 60 | 61 | Interface for submitting a batch of URLs to be crawled 62 | 63 | The parameter combinations are programmable through Page TSconfig or 64 | configuration records.
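The PSR-14 events mentioned in the introduction are consumed with ordinary TYPO3 event listeners; concrete listeners and their `Services.yaml` registrations appear later under the Features/Events files. As a purely illustrative skeleton (the class name and namespace are hypothetical, and the event's accessors are deliberately not assumed here), a listener for `AfterUrlCrawledEvent` could look like this:

```php
<?php

declare(strict_types=1);

namespace MyVendor\MyExtension\EventListener;

use AOE\Crawler\Event\AfterUrlCrawledEvent;

// Hypothetical skeleton; register it as an event.listener for
// AOE\Crawler\Event\AfterUrlCrawledEvent in your extension's Services.yaml.
final class AfterUrlCrawledEventListener
{
    public function __invoke(AfterUrlCrawledEvent $event): void
    {
        // React to a crawled URL here, e.g. purge an external cache or CDN.
        // Consult the event class for the data it exposes; no accessors are
        // assumed in this sketch.
    }
}
```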
65 | -------------------------------------------------------------------------------- /Documentation/Links/Links.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _links: 4 | 5 | ===== 6 | Links 7 | ===== 8 | 9 | :TER: 10 | https://extensions.typo3.org/extension/crawler/ 11 | 12 | :Bug Tracker: 13 | https://github.com/tomasnorre/crawler/issues 14 | 15 | :Git Repository: 16 | https://github.com/tomasnorre/crawler.git 17 | -------------------------------------------------------------------------------- /Documentation/Scheduler/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _scheduler: 4 | 5 | ========= 6 | Scheduler 7 | ========= 8 | 9 | 10 | .. toctree:: 11 | :maxdepth: 5 12 | :titlesonly: 13 | :glob: 14 | 15 | 16 | As seen in :ref:`executing-the-queue-label` you can execute the queue in 17 | multiple ways, but it's no fun doing that manually all the time. 18 | 19 | With the Crawler you have the possibility to add scheduler tasks to be executed 20 | at a given time. The Crawler commands are implemented with the Symfony Console, 21 | and therefore they can be configured with the Core-supported 22 | `Execute console commands (scheduler)` task. 23 | 24 | How to set up crawler scheduler tasks: 25 | 26 | 1. Add a new Scheduler Task 27 | 2. Select the class :guilabel:`Execute console commands` 28 | 3. Select :guilabel:`Frequency` for the execution 29 | 4. Go to section :guilabel:`Schedulable Command. Save and reopen to define 30 | command arguments` at the bottom. 31 | 5. Select e.g. :guilabel:`crawler:buildQueue` (press save) 32 | 6. Select the options you want to execute the queue with; it's important to 33 | check the checkboxes and not only fill in the values. 34 | 35 | Now you can save and close, and your scheduler tasks will be running as 36 | configured. 37 | 38 | The configured task will look like this: 39 | 40 | .. figure:: /Images/backend_scheduler_record.png 41 | :alt: Task configuration for building the queue 42 | 43 | Task configuration for building the queue 44 | 45 | After save and close, you can see which command is executed; it uses 46 | the same parameters you can use when running from the CLI, 47 | see :ref:`executing-the-queue-cli-label` 48 | 49 | .. figure:: /Images/backend_scheduler_overview.png 50 | :alt: Task in the scheduled tasks overview 51 | 52 | Task in the scheduled tasks overview 53 | -------------------------------------------------------------------------------- /Documentation/Sitemap.rst: -------------------------------------------------------------------------------- 1 | :template: sitemap.html 2 | 3 | .. include:: /Includes.rst.txt 4 | 5 | .. _sitemap: 6 | 7 | ======= 8 | Sitemap 9 | ======= 10 | 11 | .. The sitemap.html template will insert here the page tree automatically. 12 | -------------------------------------------------------------------------------- /Documentation/Troubleshooting/_htaccess.txt: -------------------------------------------------------------------------------- 1 | 2 | # Rules to set ApplicationContext based on hostname 3 | RewriteCond %{HTTP_HOST} ^(.*)\.my\-site\.localhost$ 4 | RewriteRule .? - [E=TYPO3_CONTEXT:Development] 5 | RewriteCond %{HTTP_HOST} ^(.*)\.mysite\.info$ 6 | RewriteRule .? - [E=TYPO3_CONTEXT:Production/Staging] 7 | RewriteCond %{HTTP_HOST} ^(.*)\.my\-site\.info$ 8 | RewriteRule .?
- [E=TYPO3_CONTEXT:Production] 9 | 10 | -------------------------------------------------------------------------------- /Documentation/UseCases/CacheWarmup/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _use-case-cache-warm-up: 4 | 5 | ============= 6 | Cache warm up 7 | ============= 8 | 9 | Having a website that is fast for the end-user is essential. It is therefore 10 | beneficial to have a warm cache even before the first user hits the newly 11 | deployed website. So how can one achieve this? 12 | 13 | The crawler has some command line tools (hereafter CLI tools) that can be used 14 | during deployments. The CLI tools are implemented with `symfony/console`, 15 | which has been standard in TYPO3 for a while. 16 | 17 | There are three commands that can be of benefit to you during deployments. 18 | 19 | * :bash:`vendor/bin/typo3 crawler:flushQueue` 20 | * :bash:`vendor/bin/typo3 crawler:buildQueue` 21 | * :bash:`vendor/bin/typo3 crawler:processQueue` 22 | 23 | You can see which parameters they take in :ref:`command-controller`. 24 | This example provides a suggestion on how you can set it up, and you can 25 | adjust it with additional parameters if you like. 26 | 27 | .. rst-class:: bignums-xxl 28 | 29 | .. _create-crawler-configuration: 30 | #. Create crawler configuration 31 | 32 | First we need a `crawler configuration`; these are stored in the database. You 33 | can add one via the backend, see :ref:`backend-configuration-record`. 34 | 35 | It's suggested to select the most important pages of the website and add 36 | them to a crawler configuration called e.g. `deployment`: 37 | 38 | .. figure:: /Images/backend_configuration_deployment.png 39 | :alt: Crawler configuration record 40 | 41 | Crawler configuration record 42 | 43 | .. hint:: 44 | Let's say your website has frontend users with one or multiple user 45 | groups. In this case you need to create multiple crawler 46 | configurations: For every possible combination of user groups that a 47 | user can have, you need to create an individual crawler configuration. 48 | 49 | All those crawler configurations need to be added to the 50 | `crawler:processQueue` command to be considered. If you miss this, 51 | some users get a warmed-up cache, but those with a combination of 52 | user groups that was not taken into account in a crawler configuration 53 | will get an uncached page. 54 | 55 | #. Build the queue 56 | 57 | With this configuration, only the pages added to it will be crawled. So 58 | how will we execute this from the CLI during deployment? I don't know which 59 | deployment tool you use, but that is not important as long as you can execute 60 | shell commands. What would you need to execute? 61 | 62 | .. literalinclude:: _commands.bash 63 | :language: bash 64 | 65 | #. Process the queue 66 | 67 | The last step adds the pages to the queue, and you need a scheduler 68 | task set up to have them processed. Go to the :guilabel:`Scheduler` module and 69 | follow these steps: 70 | 71 | 1. Add a new Scheduler Task 72 | 2. Select the class :guilabel:`Execute console commands` 73 | 3. Select :guilabel:`Frequency` for the execution 74 | 4. Go to section :guilabel:`Schedulable Command. Save and reopen to define 75 | command arguments` at the bottom. 76 | 5. Select :guilabel:`crawler:processQueue` (press save) 77 | 6.
Select the options you want to execute the queue with; it's important to 78 | check the checkboxes and not only fill in the values. 79 | 80 | .. figure:: /Images/backend_scheduler_processqueue.png 81 | :alt: Options of the task 82 | 83 | Options of the task 84 | 85 | 86 | With these steps you will have a website that is faster on the first visit after 87 | a deployment, and the rest of the website is crawled automatically shortly 88 | after. 89 | 90 | `#HappyCrawling` 91 | -------------------------------------------------------------------------------- /Documentation/UseCases/CacheWarmup/_commands.bash: -------------------------------------------------------------------------------- 1 | # Done to make sure the crawler queue is empty, so that we will only crawl important pages. 2 | $ vendor/bin/typo3 crawler:flushQueue all 3 | 4 | # Now we want to fill the crawler queue. 5 | # This will start on page uid 1 with the deployment configuration and depth 99. 6 | # --mode exec crawls the pages instantly so we don't need a secondary process for that. 7 | $ vendor/bin/typo3 crawler:buildQueue 1 deployment --depth 99 --mode exec 8 | 9 | # Add the rest of the pages to the crawler queue and have them processed with the scheduler. 10 | # --mode queue is the default, but it is added for visibility; 11 | # we assume that you have a crawler configuration called default. 12 | $ vendor/bin/typo3 crawler:buildQueue 1 default --depth 99 --mode queue 13 | -------------------------------------------------------------------------------- /Documentation/UseCases/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _use-cases: 4 | 5 | ========= 6 | Use cases 7 | ========= 8 | 9 | This section shows different use cases for the crawler and what value 10 | installing it can bring. The crawler has evolved over the years to 11 | cover multiple use cases. If you have one that is not listed here, feel free 12 | to open a PR or an issue on `https://github.com/tomasnorre/crawler 13 | `_. 14 | 15 | .. toctree:: 16 | :maxdepth: 5 17 | :titlesonly: 18 | :glob: 19 | 20 | CacheWarmup/Index 21 | IndexedSearch/Index 22 | 23 | -------------------------------------------------------------------------------- /Documentation/UseCases/IndexedSearch/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _use-case-indexed-search: 4 | 5 | ============== 6 | Indexed Search 7 | ============== 8 | 9 | The TYPO3 Crawler is quite often used for generating the index of Indexed Search. 10 | 11 | Unfortunately we don't have any good documentation included on this, but you can help in two ways. 12 | 13 | 1. You can help write the documentation 14 | 2. You can chip into the money pool to help sponsor those writing the documentation. 15 | 16 | You can see the issue here: https://github.com/tomasnorre/crawler/issues/558 17 | or chip into the money pool here: https://www.paypal.com/paypalme/tomasnorre/10 18 | 19 | `#HappyCrawling` 20 | 21 | .. _use-case-indexed-search-setup: 22 | 23 | Set up Indexed Search 24 | ===================== 25 | 26 | With the latest improvements of the TYPO3 Crawler and Indexed Search, it has become 27 | easier to set up Indexed Search to work with the TYPO3 Crawler. 28 | 29 | You need a few things to have this working. 30 | 31 | 1. Create a :ref:`backend-configuration-record` 32 | 2.
Setup add a Indexed Search Configuration (See: https://docs.typo3.org/c/typo3/cms-indexed-search/main/en-us/IndexingConfigurations/Configurations/Index.html) 33 | 34 | If you want to index e.g. PDF files please ensure that you have the 35 | respective tools installed on your server. For PDFs that would be `pdftotext` and 36 | `pdfinfo`. 37 | -------------------------------------------------------------------------------- /Documentation/guides.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 11 | 17 | 18 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help 2 | help: ## Displays this list of targets with descriptions 3 | @echo "The following commands are available:\n" 4 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[32m%-30s\033[0m %s\n", $$1, $$2}' 5 | 6 | 7 | .PHONY: docs 8 | docs: ## Generate projects docs (from "Documentation" directory) 9 | mkdir -p Documentation-GENERATED-temp 10 | docker run --rm --pull always -v "$(shell pwd)":/project -t ghcr.io/typo3-documentation/render-guides:latest --config=Documentation 11 | 12 | 13 | .PHONY: test-docs 14 | test-docs: ## Test the documentation rendering 15 | mkdir -p Documentation-GENERATED-temp 16 | docker run --rm --pull always -v "$(shell pwd)":/project -t ghcr.io/typo3-documentation/render-guides:latest --config=Documentation --no-progress --minimal-test 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TYPO3 Crawler 2 | [![Latest Stable Version](https://poser.pugx.org/tomasnorre/crawler/v/stable)](https://packagist.org/packages/tomasnorre/crawler) 3 | [![Total Downloads](https://poser.pugx.org/tomasnorre/crawler/downloads)](https://packagist.org/packages/tomasnorre/crawler) 4 | [![License](https://poser.pugx.org/tomasnorre/crawler/license)](https://packagist.org/packages/tomasnorre/crawler) 5 | ![Tests](https://github.com/tomasnorre/crawler/workflows/Tests/badge.svg) 6 | [![Code Coverage](https://scrutinizer-ci.com/g/tomasnorre/crawler/badges/coverage.png?b=main)](https://scrutinizer-ci.com/g/tomasnorre/crawler/?branch=main) 7 | [![Coverage Status](https://coveralls.io/repos/github/tomasnorre/crawler/badge.svg)](https://coveralls.io/github/tomasnorre/crawler) 8 | [![Mutation testing badge](https://img.shields.io/endpoint?style=flat&url=https%3A%2F%2Fbadge-api.stryker-mutator.io%2Fgithub.com%2Ftomasnorre%2Fcrawler%2Fmain)](https://dashboard.stryker-mutator.io/reports/github.com/tomasnorre/crawler/main) 9 | ![Psalm coverage](https://shepherd.dev/github/tomasnorre/crawler/coverage.svg) 10 | [![Average time to resolve an issue](http://isitmaintained.com/badge/resolution/tomasnorre/crawler.svg)](http://isitmaintained.com/project/tomasnorre/crawler "Average time to resolve an issue") 11 | [![Percentage of issues still open](http://isitmaintained.com/badge/open/tomasnorre/crawler.svg)](http://isitmaintained.com/project/tomasnorre/crawler "Percentage of issues still open") 12 | 13 | Libraries and scripts for crawling the TYPO3 page tree. Used for re-caching, re-indexing, publishing applications etc. 
14 | 15 | 16 | You can include the crawler in your TYPO3 project with composer or from the [TYPO3 Extension Repository](https://extensions.typo3.org/extension/crawler) 17 | 18 | ```shell script 19 | composer require tomasnorre/crawler 20 | ``` 21 | 22 | **Crawler processes** 23 | 24 | ![backend_processlist](https://user-images.githubusercontent.com/1212481/142763110-936be57c-1e9e-4d62-afbe-4134b139fd56.png) 25 | 26 | ## Versions and Support 27 | 28 | | Release | TYPO3 | PHP | Fixes will contain 29 | |---------|-----------|---------|---| 30 | | 12.x.y | 12.4-13.3 | 8.1-8.4 |Features, Bugfixes, Security Updates, Since 12.0.6 TYPO3 13.4, Since 12.0.7 PHP 8.4 31 | | 11.x.y | 10.4-11.5 | 7.4-8.1 |Security Updates, Since 11.0.3 PHP 8.1 32 | | 10.x.y | 9.5-11.0 | 7.2-7.4 |Security Updates 33 | | 9.x.y | 9.5-11.0 | 7.2-7.4 |As this version has same requirements as 10.x.y, there will be no further releases of this version, please update instead. 34 | | 8.x.y | | | Releases do not exist 35 | | 7.x.y | | | Releases do not exist 36 | | 6.x.y | 7.6-8.7 | 5.6-7.3 | Security Updates 37 | 38 | ### Documentation 39 | Please read the [documentation](https://docs.typo3.org/p/tomasnorre/crawler/master/en-us/) 40 | 41 | To render the documentation locally, please use the official TYPO3 Documentation rendering Docker Tool. 42 | 43 | 44 | ### Contributions 45 | 46 | Please see [CONTRIBUTING.md](https://github.com/tomasnorre/crawler/blob/main/CONTRIBUTING.md) 47 | 48 | ### Honorable Previous Maintainers 49 | 50 | * Kasper Skaarhoj 51 | * Daniel Poetzinger 52 | * Fabrizio Branca 53 | * Tolleiv Nietsch 54 | * Timo Schmidt 55 | * Michael Klapper 56 | * Stefan Rotsch 57 | -------------------------------------------------------------------------------- /Resources/Private/Language/af.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ar.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ca.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/cs.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/da.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Side Id som crawleren vil bruge for at indlæse TSFE (påkrævet) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | Når en side crawles direkte fra TYPO3 Backend. fx. ved at bruge "læs" funktionaliteten i "Crawler Log" modulet, bruges den valgte siden til at initialisere frontend renderingen. Adgang til den valgte side <strong>MÅ IKKE</strong> være begrænset, i så fald vil crawlingen fejle. 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /Resources/Private/Language/de.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Seiten-ID, die der Crawler zur Initialisierung des TSFE verwendet (erforderlich) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | Beim Crawlen einer Seite direkt im TYPO3-Backend, z.B. unter Verwendung der "read"-Funktionalität des Moduls "Crawler-Protokoll" wird die ausgewählte Seiten-ID zur Initialisierung der Frontend-Darstellung verwendet. 14 | Zugriff auf die ausgewählte Seite <strong>DARF NICHT</strong> eingeschränkt sein; das Crawling wird sonst fehlschlagen. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/el.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/es.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/fi.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/fr.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/he.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/hu.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/it.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ja.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ko.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | 9 | 10 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 11 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Resources/Private/Language/nl.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/no.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/pl.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/pt.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ro.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
[XLIFF markup is not preserved in this export. This file and the localized ru, sr, sv, tr, uk, vi and zh variants of locallang_csh_tx_crawler_configuration.xlf in /Resources/Private/Language/ all carry the same untranslated English source strings:
  label:       Page Id the crawler will use for initializing the TSFE (required)
  description: When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.]
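These context-sensitive-help strings are ordinary TYPO3 localization labels, so they can be resolved through the regular LLL mechanism. A minimal sketch of how such a label could be fetched in a backend context, assuming an initialized LanguageService in $GLOBALS['LANG']; the trans-unit id "pid.alttitle" is a placeholder chosen for illustration, since the real ids are not visible in this export:

<?php
// Sketch: resolve one of the CSH labels shipped with the crawler extension.
// "pid.alttitle" is an assumed trans-unit id, used here only as an example.
$label = $GLOBALS['LANG']->sL(
    'LLL:EXT:crawler/Resources/Private/Language/locallang_csh_tx_crawler_configuration.xlf:pid.alttitle'
);
// $label would then hold "Page Id the crawler will use for initializing the TSFE (required)",
// or a translated variant once a translation exists.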

-------------------------------------------------------------------------------- /Resources/Private/Layouts/BackendModule.html: --------------------------------------------------------------------------------
[Fluid layout markup is not preserved in this export; only empty, numbered lines remain.]
-------------------------------------------------------------------------------- /Resources/Private/Php/Libraries/composer.json: --------------------------------------------------------------------------------
1 | {
2 |     "config": {
3 |         "classmap-authoritative": true,
4 |         "prepend-autoloader": false
5 |     },
6 |     "require": {
7 |         "beberlei/assert": "^3.3",
8 |         "guzzlehttp/guzzle": "^6.4.1 || ^7.2",
9 |         "psr/http-message": "^2.0",
10 |         "psr/log": "^1.0 || ^2.0 || ^3.0",
11 |         "symfony/console": "^6.4 || ^7.0"
12 |     }
13 | }
14 |
-------------------------------------------------------------------------------- /Resources/Public/Css/backend_crawler.css: --------------------------------------------------------------------------------
1 | table.crawlerLogActions {
2 |     border-spacing: 0px 10px;
3 |     border-collapse: initial;
4 | }
5 |
6 | table.crawlerLogActions tr.firstRow td {
7 |     padding: 0px 10px 0px 0px;
8 | }
9 |
-------------------------------------------------------------------------------- /Resources/Public/Icons/ (Extension.svg, bullet_green.svg, bullet_orange.svg, bullet_red.svg, crawler_configuration.svg, crawler_start.svg, crawler_stop.svg): --------------------------------------------------------------------------------
[SVG markup is not preserved in this export; only the text labels "ext_icon_crawler" (Extension.svg) and "ext_icon_crawler_transparent" (crawler_configuration.svg) remain.]
-------------------------------------------------------------------------------- /SECURITY.md: --------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Supported Versions
4 |
5 | | Release | TYPO3     | PHP     | Fixes will contain |
6 | |---------|-----------|---------|--------------------|
7 | | 12.x.y  | 12.2      | 8.1     | Features, Bugfixes, Security Updates |
8 | | 11.x.y  | 10.4-11.5 | 7.4-8.1 | Bugfixes, Security Updates; PHP 8.1 since 11.0.3 |
9 | | 10.x.y  | 9.5-11.0  | 7.2-7.4 | Security Updates |
10 | | 9.x.y   | 9.5-11.0  | 7.2-7.4 | As this version has the same requirements as 10.x.y, there will be no further releases of this version; please update instead. |
11 | | 8.x.y   |           |         | Releases do not exist |
12 | | 7.x.y   |           |         | Releases do not exist |
13 | | 6.x.y   | 7.6-8.7   | 5.6-7.3 | Security Updates |
14 |
15 |
16 | ## Reporting a Vulnerability
17 |
18 | In case you find a security issue, please send an email to [tomasnorre@gmail.com](mailto:tomasnorre@gmail.com).
19 |
-------------------------------------------------------------------------------- /cli/bootstrap.php: --------------------------------------------------------------------------------
[The PHP content of this file, and at least one following file boundary, are not preserved in this export. The settings below appear to be lines 41-77 of the extension configuration template (presumably ext_conf_template.txt), whose earlier lines are likewise missing.]
41 | processMaxRunTime=300
42 |
43 | #########
44 | ## Cleanup
45 | #########
46 |
47 | # cat=Cleanup; type=boolean; label=Clean up old queue entries: If checked, older queue entries will be deleted when adding new crawler configurations from the CLI.
48 | cleanUpOldQueueEntries=1
49 |
50 | # cat=Cleanup; type=int [1- 99]; label=Processed Age: If "Clean up old queue entries" is checked, processed entries older than X days are deleted.
51 | cleanUpProcessedAge=2
52 |
53 | # cat=Cleanup; type=int [1- 99]; label=Scheduled Age: If "Clean up old queue entries" is checked, scheduled entries older than X days are deleted.
54 | cleanUpScheduledAge=7
55 |
56 | # cat=Cleanup; type=int [1-365]; label=Delete processed items: Delete processed items from the queue after n days (0 keeps the entries forever - the database may grow very large over time!)
57 | purgeQueueDays=14
58 |
59 | #########
60 | ## System
61 | #########
62 |
63 | # cat=System; type=string; label=Name of the PHP binary (e.g. PHP72-LATEST-CLI), default is php
64 | phpBinary=php
65 |
66 | # cat=System; type=string; label=PHP Path: Local path to the PHP binary (e.g. "/usr/bin/php"). Use this ONLY when the resolved PHP binary is not the correct one; you can check it under Info -> Site Crawling -> Crawling Process -> CLI-Path.
67 | phpPath=
68 |
69 | #########
70 | ## Debug
71 | #########
72 |
73 | # cat=Debug; type=boolean; label=Debug: Print multiprocess processing information - shows whether a process was actually executed and which status it has
74 | processDebug=0
75 |
76 | # cat=Debug; type=boolean; label=Make multiprocess processing verbose while running
77 | processVerbose=0
78 |
-------------------------------------------------------------------------------- /ext_emconf.php: --------------------------------------------------------------------------------
1 | <?php
2 | $EM_CONF[$_EXTKEY] = [
3 |     'title' => 'Site Crawler',
4 |     'description' => 'Libraries and scripts for crawling the TYPO3 page tree.',
5 |     'category' => 'module',
6 |     'state' => 'stable',
7 |     'uploadfolder' => 0,
8 |     'createDirs' => '',
9 |     'clearCacheOnLoad' => 0,
10 |     'author' => 'Tomas Norre Mikkelsen',
11 |     'author_email' => 'tomasnorre@gmail.com',
12 |     'author_company' => '',
13 |     'version' => '12.0.8',
14 |     'constraints' => [
15 |         'depends' => [
16 |             'php' => '8.1.0-8.99.99',
17 |             'typo3' => '12.4.0-13.4.99',
18 |         ],
19 |         'conflicts' => [],
20 |         'suggests' => [],
21 |     ]
22 | ];
-------------------------------------------------------------------------------- /ext_localconf.php: --------------------------------------------------------------------------------
1 | [The opening of this file - PHP tag, imports and the start of the if condition - is not preserved in this export; it resumes mid-condition:]
  |     ...isPackageActive('indexed_search')) {
13 |     // Register with "indexed_search" extension
14 |     $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']['indexed_search'] = [
15 |         'key' => 'tx_indexedsearch_reindex',
16 |         'value' => 'Re-indexing'
17 |     ];
18 | }
19 |
20 |
21 |
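The indexed_search block in ext_localconf.php above doubles as the general pattern for announcing processing instructions to the crawler: each entry under $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] registers one instruction together with the label shown in the backend. A minimal sketch of how a third-party extension could hook in the same way; the extension key "my_indexer" and the instruction key "tx_myindexer_reindex" are placeholders, not part of this repository:

<?php
// ext_localconf.php of a hypothetical extension "my_indexer".
// Mirrors the indexed_search registration above.
defined('TYPO3') or die();

$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']['my_indexer'] = [
    'key' => 'tx_myindexer_reindex',   // placeholder instruction key
    'value' => 'Re-index my records',  // placeholder label shown in the backend
];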
-------------------------------------------------------------------------------- /ext_tables.sql: --------------------------------------------------------------------------------
1 | #
2 | # Table structure for table 'tx_crawler_queue'
3 | #
4 | CREATE TABLE tx_crawler_queue
5 | (
6 |     qid int(11) DEFAULT '0' NOT NULL auto_increment,
7 |     page_id int(11) DEFAULT '0' NOT NULL,
8 |     parameters text NOT NULL,
9 |     parameters_hash varchar(50) DEFAULT '' NOT NULL,
10 |     configuration_hash varchar(50) DEFAULT '' NOT NULL,
11 |     scheduled int(11) DEFAULT '0' NOT NULL,
12 |     exec_time int(11) DEFAULT '0' NOT NULL,
13 |     set_id int(11) DEFAULT '0' NOT NULL,
14 |     result_data longtext NOT NULL,
15 |     process_scheduled int(11) DEFAULT '0' NOT NULL,
16 |     process_id varchar(50) DEFAULT '' NOT NULL,
17 |     process_id_completed varchar(50) DEFAULT '' NOT NULL,
18 |     configuration varchar(250) DEFAULT '' NOT NULL,
19 |
20 |     PRIMARY KEY (qid),
21 |     KEY page_id (page_id),
22 |     KEY set_id (set_id),
23 |     KEY exec_time (exec_time),
24 |     KEY scheduled (scheduled),
25 |     KEY process_id (process_id),
26 |     KEY parameters_hash (parameters_hash),
27 |     KEY configuration_hash (configuration_hash),
28 |     KEY cleanup (exec_time,scheduled)
29 | ) ENGINE=InnoDB;
30 |
31 | #
32 | # Table structure for table 'tx_crawler_process'
33 | #
34 | CREATE TABLE tx_crawler_process
35 | (
36 |     process_id varchar(50) DEFAULT '' NOT NULL,
37 |     active smallint(6) DEFAULT '0',
38 |     ttl int(11) DEFAULT '0' NOT NULL,
39 |     assigned_items_count int(11) DEFAULT '0' NOT NULL,
40 |     deleted tinyint(4) unsigned DEFAULT '0' NOT NULL,
41 |     system_process_id int(11) DEFAULT '0' NOT NULL,
42 |
43 |     KEY update_key (active,deleted),
44 |     KEY process_id (process_id)
45 | ) ENGINE=InnoDB;
46 |
47 | #
48 | # Table structure for table 'tx_crawler_configuration'
49 | #
50 | CREATE TABLE tx_crawler_configuration
51 | (
52 |     name tinytext NOT NULL,
53 |     force_ssl tinyint(4) DEFAULT '0' NOT NULL,
54 |     processing_instruction_filter varchar(200) DEFAULT '' NOT NULL,
55 |     processing_instruction_parameters_ts varchar(200) DEFAULT '' NOT NULL,
56 |     configuration text NOT NULL,
57 |     base_url tinytext NOT NULL,
58 |     pidsonly blob,
59 |     begroups varchar(100) DEFAULT '0' NOT NULL,
60 |     fegroups varchar(100) DEFAULT '0' NOT NULL,
61 |     exclude text NOT NULL
62 |
63 | ) ENGINE=InnoDB;
64 |
65 | #
66 | # Table structure for table 'pages'
67 | # This is added to reuse the information from typo3/cms-seo.
68 | # As we don't have a dependency on typo3/cms-seo, it is added here to ensure that the
69 | # database queries don't break.
70 | #
71 | CREATE TABLE pages
72 | (
73 |     sitemap_priority decimal(2, 1) DEFAULT '0.5' NOT NULL
74 | );
75 |
--------------------------------------------------------------------------------
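Given the tx_crawler_queue schema above, the queue can be inspected with TYPO3's query builder. A minimal sketch that counts entries which have not been processed yet; treating exec_time = 0 as "pending" is an assumption based on the column layout, not an API guaranteed by the extension:

<?php
use TYPO3\CMS\Core\Database\Connection;
use TYPO3\CMS\Core\Database\ConnectionPool;
use TYPO3\CMS\Core\Utility\GeneralUtility;

// Count queue entries without an execution timestamp (assumed to mean "pending").
$queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
    ->getQueryBuilderForTable('tx_crawler_queue');

$pending = $queryBuilder
    ->count('qid')
    ->from('tx_crawler_queue')
    ->where(
        $queryBuilder->expr()->eq(
            'exec_time',
            $queryBuilder->createNamedParameter(0, Connection::PARAM_INT)
        )
    )
    ->executeQuery()
    ->fetchOne();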