├── .coveralls.yml ├── Documentation ├── Includes.rst.txt ├── Images │ ├── cli_addtoque.png │ ├── cli_processque.png │ ├── backend_recrawl.png │ ├── ext_news_pagetree.png │ ├── backend_clear_cache.png │ ├── backend_crawlerlog.png │ ├── backend_pendingurls.png │ ├── backend_processlist.png │ ├── backend_info_php_error.png │ ├── backend_startcrawling.png │ ├── backend_crawler_seo_v10.png │ ├── backend_scheduler_record.png │ ├── backend_startnewprocess.png │ ├── backend_addfromcontextmenu.png │ ├── backend_clear_cache_queue.png │ ├── backend_configuration_queue.png │ ├── backend_crawlerlog_recrawl.png │ ├── backend_scheduler_overview.png │ ├── crawler_settings_processLimit.png │ ├── backend_configuration_deployment.png │ ├── backend_configuration_settings.png │ ├── backend_crawler_seo_priority_v10.png │ ├── backend_php_path_configuration.png │ ├── backend_processlist_add_process.png │ ├── backend_scheduler_processqueue.png │ ├── backend_configurationrecord_access.png │ └── backend_configurationrecord_general.png ├── Sitemap.rst ├── Configuration │ ├── Examples │ │ ├── Index.rst │ │ └── News │ │ │ ├── _services.yaml │ │ │ ├── _page.tsconfig │ │ │ ├── _setup.typoscript │ │ │ ├── _NewsDetailEventListener.php │ │ │ └── Index.rst │ ├── PageTsconfigReference(txCrawlercrawlercfg) │ │ ├── _page.tsconfig │ │ └── _paramSets_page.tsconfig │ ├── HttpAuthentication │ │ └── Index.rst │ ├── Index.rst │ ├── ExtensionManagerConfiguration │ │ └── Index.rst │ └── ConfigurationRecords │ │ └── Index.rst ├── Features │ ├── Events │ │ ├── _InvokeQueueChangeEvent_services.yaml │ │ ├── _AfterUrlCrawledEventListener_services.yaml │ │ ├── _ModifySkipPageEventListener_services.yaml │ │ ├── _BeforeQueueItemAddedEventListener_services.yaml │ │ ├── _AfterQueueItemAddedEventListener_services.yaml │ │ ├── _AfterUrlAddedToQueueEventListener_services.yaml │ │ ├── _AfterUrlCrawledEventListener.php │ │ ├── _AfterQueueItemAddedEventListener.php │ │ ├── _AfterUrlAddedToQueueEventListener.php │ │ ├── 
_BeforeQueueItemAddedEventListener.php │ │ ├── _InvokeQueueChangeEventListener.php │ │ └── _ModifySkipPageEventListener.php │ ├── Index.rst │ ├── Hooks │ │ └── Index.rst │ ├── MultiprocessSupport │ │ └── Index.rst │ ├── PollableProcessingInstructions │ │ └── Index.rst │ ├── AutomaticAddPagesToQueue │ │ └── Index.rst │ └── PriorityCrawling │ │ └── Index.rst ├── Links │ └── Links.rst ├── Troubleshooting │ └── _htaccess.txt ├── UseCases │ ├── Index.rst │ └── CacheWarmup │ │ └── _commands.bash ├── ExecutingTheQueue │ ├── RunViaBackend │ │ └── Index.rst │ ├── BuildingAndExecutingQueueRightAway(fromCli) │ │ ├── _output_buildQueue_6_default_mode_url.txt │ │ ├── _output_buildQueue_6_default_mode_exec.txt │ │ └── _output_buildQueue_6_default.txt │ ├── Index.rst │ ├── RunningViaCommandController │ │ └── Index.rst │ └── ExecutingQueueWithCron-job │ │ └── Index.rst ├── guides.xml ├── Index.rst ├── Scheduler │ └── Index.rst └── Introduction │ └── Index.rst ├── cli └── conf.php ├── Resources ├── Public │ ├── Css │ │ └── backend_crawler.css │ ├── Icons │ │ ├── bullet_green.svg │ │ ├── bullet_red.svg │ │ ├── bullet_orange.svg │ │ ├── crawler_stop.svg │ │ ├── crawler_start.svg │ │ ├── Extension.svg │ │ └── crawler_configuration.svg │ └── JavaScript │ │ └── ProcessStatus.js └── Private │ ├── Php │ └── Libraries │ │ └── composer.json │ ├── Layouts │ └── BackendModule.html │ └── Language │ ├── locallang_csh_tx_crawler_configuration.xlf │ ├── da.locallang_csh_tx_crawler_configuration.xlf │ ├── af.locallang_csh_tx_crawler_configuration.xlf │ ├── ar.locallang_csh_tx_crawler_configuration.xlf │ ├── ca.locallang_csh_tx_crawler_configuration.xlf │ ├── cs.locallang_csh_tx_crawler_configuration.xlf │ ├── el.locallang_csh_tx_crawler_configuration.xlf │ ├── fi.locallang_csh_tx_crawler_configuration.xlf │ ├── fr.locallang_csh_tx_crawler_configuration.xlf │ ├── he.locallang_csh_tx_crawler_configuration.xlf │ ├── hu.locallang_csh_tx_crawler_configuration.xlf │ ├── 
it.locallang_csh_tx_crawler_configuration.xlf │ ├── ja.locallang_csh_tx_crawler_configuration.xlf │ ├── ko.locallang_csh_tx_crawler_configuration.xlf │ ├── nl.locallang_csh_tx_crawler_configuration.xlf │ ├── no.locallang_csh_tx_crawler_configuration.xlf │ ├── pl.locallang_csh_tx_crawler_configuration.xlf │ ├── ro.locallang_csh_tx_crawler_configuration.xlf │ ├── ru.locallang_csh_tx_crawler_configuration.xlf │ ├── sr.locallang_csh_tx_crawler_configuration.xlf │ ├── tr.locallang_csh_tx_crawler_configuration.xlf │ ├── uk.locallang_csh_tx_crawler_configuration.xlf │ ├── vi.locallang_csh_tx_crawler_configuration.xlf │ ├── es.locallang_csh_tx_crawler_configuration.xlf │ ├── pt.locallang_csh_tx_crawler_configuration.xlf │ ├── sv.locallang_csh_tx_crawler_configuration.xlf │ ├── zh.locallang_csh_tx_crawler_configuration.xlf │ └── de.locallang_csh_tx_crawler_configuration.xlf ├── .run └── Xdebug.run.xml ├── Configuration ├── Backend │ ├── JavaScriptModules.php │ ├── AjaxRoutes.php │ └── Modules.php ├── Extbase │ └── Persistence │ │ └── Classes.php ├── RequestMiddlewares.php └── Icons.php ├── Classes ├── Controller │ ├── Backend │ │ ├── BackendModuleControllerInterface.php │ │ └── Helper │ │ │ ├── UrlBuilder.php │ │ │ ├── RequestHelper.php │ │ │ └── ResultHandler.php │ └── Ajax │ │ └── ProcessStatusController.php ├── EventListener │ ├── ShouldUseCachedPageDataIfAvailableEventListener.php │ └── AfterQueueItemAddedEventListener.php ├── Exception │ ├── ProcessException.php │ ├── NoIndexFoundException.php │ ├── CommandNotFoundException.php │ ├── ExtensionSettingsException.php │ ├── TimeStampException.php │ └── CrawlerObjectException.php ├── Helper │ └── Sleeper │ │ ├── SleeperInterface.php │ │ ├── NullSleeper.php │ │ └── SystemSleeper.php ├── Writer │ └── FileWriter │ │ └── CsvWriter │ │ ├── CsvWriterInterface.php │ │ └── CrawlerCsvWriter.php ├── Hooks │ ├── CrawlerHookInterface.php │ ├── ProcessCleanUpHook.php │ └── DataHandlerHook.php ├── Process │ ├── 
ProcessManagerInterface.php │ ├── ProcessManagerFactory.php │ ├── WindowsProcessManager.php │ ├── UnixProcessManager.php │ └── Cleaner │ │ ├── OldProcessCleaner.php │ │ └── OrphanProcessCleaner.php ├── Event │ ├── AfterUrlCrawledEvent.php │ ├── AfterUrlAddedToQueueEvent.php │ ├── InvokeQueueChangeEvent.php │ ├── ModifySkipPageEvent.php │ ├── BeforeQueueItemAddedEvent.php │ └── AfterQueueItemAddedEvent.php ├── Service │ ├── UserService.php │ ├── ProcessInstructionService.php │ ├── BackendModuleScriptUrlService.php │ ├── PageService.php │ └── QueueService.php ├── Value │ ├── CrawlAction.php │ ├── QueueFilter.php │ └── QueueRow.php ├── CrawlStrategy │ ├── CrawlStrategyFactory.php │ ├── CallbackExecutionStrategy.php │ └── CrawlStrategyInterface.php ├── Utility │ ├── HookUtility.php │ ├── PhpBinaryUtility.php │ ├── TcaUtility.php │ └── MessageUtility.php ├── Configuration │ └── ExtensionConfigurationProvider.php ├── Crawler.php ├── Domain │ ├── Repository │ │ └── ConfigurationRepository.php │ └── Model │ │ └── ProcessCollection.php └── ContextMenu │ └── ItemProvider.php ├── CONTRIBUTERS.md ├── ext_emconf.php ├── ext_localconf.php ├── Makefile ├── SECURITY.md ├── CONTRIBUTING.md ├── composer-dependency-analyser.php ├── README.md ├── ext_tables.sql └── ext_conf_template.txt /.coveralls.yml: -------------------------------------------------------------------------------- 1 | coverage_clover: "*-coverage.clover" 2 | -------------------------------------------------------------------------------- /Documentation/Includes.rst.txt: -------------------------------------------------------------------------------- 1 | .. 
You can put central messages to display on all pages here 2 | -------------------------------------------------------------------------------- /Documentation/Images/cli_addtoque.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/cli_addtoque.png -------------------------------------------------------------------------------- /Documentation/Images/cli_processque.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/cli_processque.png -------------------------------------------------------------------------------- /Documentation/Images/backend_recrawl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_recrawl.png -------------------------------------------------------------------------------- /Documentation/Images/ext_news_pagetree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/ext_news_pagetree.png -------------------------------------------------------------------------------- /Documentation/Images/backend_clear_cache.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_clear_cache.png -------------------------------------------------------------------------------- /Documentation/Images/backend_crawlerlog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_crawlerlog.png -------------------------------------------------------------------------------- 
/Documentation/Images/backend_pendingurls.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_pendingurls.png -------------------------------------------------------------------------------- /Documentation/Images/backend_processlist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_processlist.png -------------------------------------------------------------------------------- /Documentation/Images/backend_info_php_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_info_php_error.png -------------------------------------------------------------------------------- /Documentation/Images/backend_startcrawling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_startcrawling.png -------------------------------------------------------------------------------- /Documentation/Images/backend_crawler_seo_v10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_crawler_seo_v10.png -------------------------------------------------------------------------------- /Documentation/Images/backend_scheduler_record.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_scheduler_record.png -------------------------------------------------------------------------------- /Documentation/Images/backend_startnewprocess.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_startnewprocess.png -------------------------------------------------------------------------------- /Documentation/Images/backend_addfromcontextmenu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_addfromcontextmenu.png -------------------------------------------------------------------------------- /Documentation/Images/backend_clear_cache_queue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_clear_cache_queue.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configuration_queue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_configuration_queue.png -------------------------------------------------------------------------------- /Documentation/Images/backend_crawlerlog_recrawl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_crawlerlog_recrawl.png -------------------------------------------------------------------------------- /Documentation/Images/backend_scheduler_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_scheduler_overview.png -------------------------------------------------------------------------------- /Documentation/Images/crawler_settings_processLimit.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/crawler_settings_processLimit.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configuration_deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_configuration_deployment.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configuration_settings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_configuration_settings.png -------------------------------------------------------------------------------- /Documentation/Images/backend_crawler_seo_priority_v10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_crawler_seo_priority_v10.png -------------------------------------------------------------------------------- /Documentation/Images/backend_php_path_configuration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_php_path_configuration.png -------------------------------------------------------------------------------- /Documentation/Images/backend_processlist_add_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_processlist_add_process.png -------------------------------------------------------------------------------- 
/Documentation/Images/backend_scheduler_processqueue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_scheduler_processqueue.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configurationrecord_access.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_configurationrecord_access.png -------------------------------------------------------------------------------- /Documentation/Images/backend_configurationrecord_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tomasnorre/crawler/HEAD/Documentation/Images/backend_configurationrecord_general.png -------------------------------------------------------------------------------- /cli/conf.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | MyVendor\MyExtension\EventListeners\NewsDetailEventListener: 3 | tags: 4 | - name: event.listener 5 | identifier: 'myNewsDetailListener' 6 | event: GeorgRinger\News\Event\NewsDetailActionEvent 7 | -------------------------------------------------------------------------------- /Configuration/Backend/JavaScriptModules.php: -------------------------------------------------------------------------------- 1 | ['backend'], 5 | 'imports' => [ 6 | '@tomasnorre/crawler/' => 7 | [ 8 | 'path' => 'EXT:crawler/Resources/Public/JavaScript/', 9 | ], 10 | ], 11 | ]; 12 | 
-------------------------------------------------------------------------------- /Documentation/Configuration/PageTsconfigReference(txCrawlercrawlercfg)/_page.tsconfig: -------------------------------------------------------------------------------- 1 | tx_crawler.crawlerCfg.paramSets.test = &L=[0-3] 2 | tx_crawler.crawlerCfg.paramSets.test { 3 | procInstrFilter = tx_indexedsearch_reindex 4 | pidsOnly = 1,5,13,55 5 | userGroups = 1 6 | force_ssl = 1 7 | } 8 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_InvokeQueueChangeEvent_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | AOE\Crawler\EventListener\InvokeQueueChangeEvent: 3 | tags: 4 | - name: event.listener 5 | identifier: 'ext-extension-key/InvokeQueueChangeEventListener' 6 | event: AOE\Crawler\Event\InvokeQueueChangeEvent 7 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_AfterUrlCrawledEventListener_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | AOE\Crawler\EventListener\AfterUrlCrawledEventListener: 3 | tags: 4 | - name: event.listener 5 | identifier: 'ext-extension-key/AfterUrlCrawledEventListener' 6 | event: AOE\Crawler\Event\AfterUrlCrawledEvent 7 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_ModifySkipPageEventListener_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | AOE\Crawler\EventListener\ModifySkipPageEventListener: 3 | tags: 4 | - name: event.listener 5 | identifier: 'ext-extension-key/ModifySkipPageEventListener' 6 | event: AOE\Crawler\Event\ModifySkipPageEvent 7 | -------------------------------------------------------------------------------- 
/Documentation/Features/Events/_BeforeQueueItemAddedEventListener_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | MyVendor\MyExtension\BeforeQueueItemAddedEventListener: 3 | tags: 4 | - name: event.listener 5 | identifier: 'ext-extension-key/BeforeQueueItemAddedEventListener' 6 | event: AOE\Crawler\Event\BeforeQueueItemAddedEvent 7 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_AfterQueueItemAddedEventListener_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | MyVendor\MyExtension\EventListener\AfterQueueItemAddedEventListener: 3 | tags: 4 | - name: event.listener 5 | identifier: 'ext-extension-key/AfterQueueItemAddedEventListener' 6 | event: AOE\Crawler\Event\AfterQueueItemAddedEvent 7 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_AfterUrlAddedToQueueEventListener_services.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | MyVendor\MyExtension\EventListener\AfterUrlAddedToQueueEventListener: 3 | tags: 4 | - name: event.listener 5 | identifier: 'ext-extension-key/AfterUrlAddedToQueueEventListener' 6 | event: AOE\Crawler\Event\AfterUrlAddedToQueueEvent 7 | -------------------------------------------------------------------------------- /Documentation/Links/Links.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. 
_links: 4 | 5 | ===== 6 | Links 7 | ===== 8 | 9 | :TER: 10 | https://extensions.typo3.org/extension/crawler/ 11 | 12 | :Bug Tracker: 13 | https://github.com/tomasnorre/crawler/issues 14 | 15 | :Git Repository: 16 | https://github.com/tomasnorre/crawler.git 17 | -------------------------------------------------------------------------------- /Documentation/Configuration/PageTsconfigReference(txCrawlercrawlercfg)/_paramSets_page.tsconfig: -------------------------------------------------------------------------------- 1 | tx_crawler.crawlerCfg.paramSets { 2 | myConfigurationKeyName = &tx_myext[items]=[_TABLE:tt_myext_items;_PID:15;_WHERE: hidden = 0] 3 | myConfigurationKeyName { 4 | pidsOnly = 13 5 | procInstrFilter = tx_indexedsearch_reindex 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /Configuration/Backend/AjaxRoutes.php: -------------------------------------------------------------------------------- 1 | [ 7 | 'path' => '/crawler/process/status', 8 | 'target' => ProcessStatusController::class . '::getProcessStatus', 9 | 'inheritAccessFromModule' => 'web_site_crawler_process', 10 | ], 11 | ]; 12 | -------------------------------------------------------------------------------- /Documentation/Configuration/HttpAuthentication/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _http-authentication: 4 | 5 | =================== 6 | HTTP Authentication 7 | =================== 8 | 9 | If you want to use HTTP Authentication you need to configure your base url 10 | to contain user:pass 11 | 12 | .. 
code-block:: text 13 | 14 | https://user:pass@www.mydomain.com/ 15 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/_page.tsconfig: -------------------------------------------------------------------------------- 1 | tx_crawler.crawlerCfg.paramSets { 2 | tx_news = &tx_news_pi1[controller]=News&tx_news_pi1[action]=detail&tx_news_pi1[news]=[_TABLE:tx_news_domain_model_news; _PID:58; _WHERE: hidden = 0] 3 | tx_news { 4 | pidsOnly = 57 5 | } 6 | } 7 | 8 | # _PID:58 is the Folder where news records are stored. 9 | # pidsOnly = 57 is the detail-view PageId. 10 | -------------------------------------------------------------------------------- /Documentation/Features/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _features: 4 | 5 | ======== 6 | Features 7 | ======== 8 | 9 | .. toctree:: 10 | :maxdepth: 5 11 | :titlesonly: 12 | :glob: 13 | 14 | AutomaticAddPagesToQueue/Index 15 | PollableProcessingInstructions/Index 16 | MultiprocessSupport/Index 17 | Hooks/Index 18 | Events/Index 19 | PriorityCrawling/Index 20 | -------------------------------------------------------------------------------- /Documentation/Configuration/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _configuration: 4 | 5 | ============= 6 | Configuration 7 | ============= 8 | 9 | .. 
toctree:: 10 | :maxdepth: 5 11 | :titlesonly: 12 | :glob: 13 | 14 | ExtensionManagerConfiguration/Index 15 | ConfigurationRecords/Index 16 | PageTsconfigReference(txCrawlercrawlercfg)/Index 17 | HttpAuthentication/Index 18 | Examples/Index 19 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_AfterUrlCrawledEventListener.php: -------------------------------------------------------------------------------- 1 | $afterUrl()); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Classes/Controller/Backend/BackendModuleControllerInterface.php: -------------------------------------------------------------------------------- 1 | [ 7 | 'tableName' => 'tx_crawler_configuration', 8 | ], 9 | AOE\Crawler\Domain\Model\Process::class => [ 10 | 'tableName' => 'tx_crawler_process', 11 | ], 12 | AOE\Crawler\Domain\Model\Queue::class => [ 13 | 'tableName' => 'tx_crawler_queue', 14 | ], 15 | ]; 16 | -------------------------------------------------------------------------------- /Documentation/Troubleshooting/_htaccess.txt: -------------------------------------------------------------------------------- 1 | 2 | # Rules to set ApplicationContext based on hostname 3 | RewriteCond %{HTTP_HOST} ^(.*)\.my\-site\.localhost$ 4 | RewriteRule .? - [E=TYPO3_CONTEXT:Development] 5 | RewriteCond %{HTTP_HOST} ^(.*)\.mysite\.info$ 6 | RewriteRule .? - [E=TYPO3_CONTEXT:Production/Staging] 7 | RewriteCond %{HTTP_HOST} ^(.*)\.my\-site\.info$ 8 | RewriteRule .? 
- [E=TYPO3_CONTEXT:Production] 9 | 10 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_AfterQueueItemAddedEventListener.php: -------------------------------------------------------------------------------- 1 | getReasonText(); 14 | // You can implement different logic based on reason, GUI or CLI 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /CONTRIBUTERS.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | List of contributors to the Crawler TYPO3 V9 Compatibility. 4 | 5 | Adding your name to the list is optional, and so is adding your email address next to it. 6 | Thanks for helping out. 7 | 8 | PS: Please add in alphabetical order. 9 | 10 | * Benni Mack 11 | * Sebastian Mazza 12 | * Chris Müller 13 | * Tizian Schmidlin 14 | * Tobias Stahn 15 | * Tomas Norre Mikkelsen 16 | -------------------------------------------------------------------------------- /Documentation/Features/Events/_ModifySkipPageEventListener.php: -------------------------------------------------------------------------------- 1 | getPageRow()['uid'] === 42) { 14 | $modifySkipPageEvent->setSkipped('Page with uid "42" is excluded by ModifySkipPageEvent'); 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Resources/Private/Php/Libraries/composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "config": { 3 | "classmap-authoritative": true, 4 | "prepend-autoloader": false 5 | }, 6 | "require": { 7 | "beberlei/assert": "^3.3", 8 | "doctrine/dbal": "^3.10 || ^4.3", 9 | "guzzlehttp/guzzle": "^6.4.1 || ^7.2", 10 | "psr/http-message": "^2.0", 11 | "psr/http-server-handler": "^1.0", 12 | "psr/http-server-middleware": "^1.0", 13 | "psr/log": "^1.0 || ^2.0 || ^3.0", 14 | "symfony/console": "^7.2", 15 | 
"symfony/service-contracts": "^3.6" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Resources/Public/Icons/bullet_green.svg: -------------------------------------------------------------------------------- 1 | 3 | 6 | 7 | -------------------------------------------------------------------------------- /Resources/Public/Icons/bullet_red.svg: -------------------------------------------------------------------------------- 1 | 3 | 6 | 7 | -------------------------------------------------------------------------------- /Resources/Public/Icons/bullet_orange.svg: -------------------------------------------------------------------------------- 1 | 3 | 6 | 7 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/_setup.typoscript: -------------------------------------------------------------------------------- 1 | plugin.tx_news.settings { 2 | # categories and categoryconjunction are not considered in detail view, so they must be overridden 3 | overrideFlexformSettingsIfEmpty = cropMaxCharacters,dateField,timeRestriction,archiveRestriction,orderBy,orderDirection,backPid,listPid,startingpoint,recursive,list.paginate.itemsPerPage,list.paginate.templatePath,categories,categoryConjunction 4 | # see the news extension for possible values of categoryConjunction 5 | categoryConjunction = AND 6 | categories = 7 | detail.errorHandling = pageNotFoundHandler 8 | } 9 | -------------------------------------------------------------------------------- /Documentation/UseCases/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _use-cases: 4 | 5 | ========= 6 | Use cases 7 | ========= 8 | 9 | This section is made to show different use cases for the crawler, and what value 10 | it can bring by installing it. The crawler has transformed over the years to 11 | have multiple use cases. 
If you have one that is not listed here, feel free 12 | to make a PR or issue on `https://github.com/tomasnorre/crawler 13 | <https://github.com/tomasnorre/crawler>`_. 14 | 15 | .. toctree:: 16 | :maxdepth: 5 17 | :titlesonly: 18 | :glob: 19 | 20 | CacheWarmup/Index 21 | IndexedSearch/Index 22 | 23 | -------------------------------------------------------------------------------- /Configuration/RequestMiddlewares.php: -------------------------------------------------------------------------------- 1 | [ 10 | 'aoe/crawler/authentication' => [ 11 | 'target' => FrontendUserAuthenticator::class, 12 | 'after' => ['typo3/cms-frontend/authentication'], 13 | 'before' => ['typo3/cms-frontend/page-resolver'], 14 | ], 15 | 'aoe/crawler/initialization' => [ 16 | 'target' => CrawlerInitialization::class, 17 | 'before' => ['typo3/cms-frontend/prepare-tsfe-rendering'], 18 | ], 19 | ], 20 | ]; 21 | -------------------------------------------------------------------------------- /Documentation/Configuration/ExtensionManagerConfiguration/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _extension-manager-configuration: 4 | 5 | =============================== 6 | Extension Manager Configuration 7 | =============================== 8 | 9 | A lot of options were added to the extension manager configuration, 10 | that allow settings to improve and enable new crawler features: 11 | 12 | .. figure:: /Images/backend_configuration_settings.png 13 | :alt: Backend configuration: Settings 14 | 15 | Backend configuration: Settings 16 | 17 | .. figure:: /Images/backend_configuration_queue.png 18 | :alt: Backend configuration: Queue 19 | 20 | Backend configuration: Queue 21 | -------------------------------------------------------------------------------- /Resources/Private/Layouts/BackendModule.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |

5 | 6 |

7 |
8 | 9 | 10 | 11 | 12 | 13 | 14 |

15 | 16 |

17 | 18 |
19 |
20 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/RunViaBackend/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _run-backend: 4 | 5 | =============== 6 | Run via backend 7 | =============== 8 | 9 | To process the queue you must either set up a cron-job on your server 10 | or use the backend to process the queue: 11 | 12 | .. figure:: /Images/backend_processlist_add_process.png 13 | :alt: Process the queue via backend 14 | 15 | Process the queue via backend 16 | 17 | You can also (re-)crawl single URLs manually from within the :guilabel:`Crawler 18 | log` view in the info module: 19 | 20 | .. figure:: /Images/backend_crawlerlog_recrawl.png 21 | :alt: Crawl single URLs via backend 22 | 23 | Crawl single URLs via backend 24 | -------------------------------------------------------------------------------- /Classes/EventListener/ShouldUseCachedPageDataIfAvailableEventListener.php: -------------------------------------------------------------------------------- 1 | getRequest()->getAttribute('tx_crawler') === null) { 18 | return; 19 | } 20 | $event->setShouldUseCachedPageData(false); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Classes/Exception/ProcessException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | class ProcessException extends \Exception 26 | { 27 | } 28 | -------------------------------------------------------------------------------- /Documentation/Features/Hooks/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _hooks: 4 | 5 | ===== 6 | Hooks 7 | ===== 8 | 9 | Register the following hooks in :file:`ext_localconf.php` of your extension. 10 | 11 | .. _hooks-excludeDoktype: 12 | 13 | excludeDoktype Hook 14 | =================== 15 | 16 | By adding doktype ids to the following array you can exclude them from 17 | being crawled: 18 | 19 | .. code-block:: php 20 | :caption: packages/my_extension/ext_localconf.php 21 | 22 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'][] = 23 | 24 | pageVeto Hook 25 | ============= 26 | 27 | .. deprecated:: 11.0.0 28 | Removed in 13.0, please migrate to the PSR-14 Event :ref:`psr14-modify-skip-page-event`! 29 | -------------------------------------------------------------------------------- /ext_emconf.php: -------------------------------------------------------------------------------- 1 | 'Site Crawler', 4 | 'description' => 'TYPO3 Crawler crawls the TYPO3 page tree.
Used for cache warmup, indexing, publishing applications etc.', 5 | 'category' => 'module', 6 | 'state' => 'stable', 7 | 'uploadfolder' => 0, 8 | 'createDirs' => '', 9 | 'clearCacheOnLoad' => 0, 10 | 'author' => 'Tomas Norre Mikkelsen', 11 | 'author_email' => 'tomasnorre@gmail.com', 12 | 'author_company' => '', 13 | 'version' => '12.0.10', 14 | 'constraints' => [ 15 | 'depends' => [ 16 | 'php' => '8.1.0-8.99.99', 17 | 'typo3' => '12.4.0-13.4.99', 18 | ], 19 | 'conflicts' => [], 20 | 'suggests' => [], 21 | ] 22 | ]; 23 | -------------------------------------------------------------------------------- /Classes/Exception/NoIndexFoundException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | class NoIndexFoundException extends \Exception 26 | { 27 | } 28 | -------------------------------------------------------------------------------- /Classes/EventListener/AfterQueueItemAddedEventListener.php: -------------------------------------------------------------------------------- 1 | getConnectionForTable(QueueRepository::TABLE_NAME) 17 | ->update(QueueRepository::TABLE_NAME, $event->getFieldArray(), [ 18 | 'qid' => (int) $event->getQueueId(), 19 | ]); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Classes/Exception/CommandNotFoundException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | class CommandNotFoundException extends \Exception 26 | { 27 | } 28 | -------------------------------------------------------------------------------- /Classes/Exception/ExtensionSettingsException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 
18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | class ExtensionSettingsException extends \Exception 26 | { 27 | } 28 | -------------------------------------------------------------------------------- /Documentation/UseCases/CacheWarmup/_commands.bash: -------------------------------------------------------------------------------- 1 | # Done to make sure the crawler queue is empty, so that we will only crawl important pages. 2 | $ vendor/bin/typo3 crawler:flushQueue all 3 | 4 | # Now we want to fill the crawler queue, 5 | # This will start on page uid 1 with the deployment configuration and depth 99, 6 | # --mode exec crawls the pages instantly so we don't need a secondary process for that. 7 | $ vendor/bin/typo3 crawler:buildQueue 1 deployment --depth 99 --mode exec 8 | 9 | # Add the rest of the pages to crawler queue and have them processed with the scheduler 10 | # --mode queue is default, but it is added for visibility, 11 | # we assume that you have a crawler configuration called default 12 | $ vendor/bin/typo3 crawler:buildQueue 1 default --depth 99 --mode queue 13 | -------------------------------------------------------------------------------- /Resources/Public/Icons/crawler_stop.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default_mode_url.txt: -------------------------------------------------------------------------------- 1 | $ bin/typo3 crawler:buildQueue 6 default --depth 2 --mode url 2 | https://crawler-devbox.ddev.site/content-examples/overview 3 | https://crawler-devbox.ddev.site/content-examples/text/rich-text 4 | https://crawler-devbox.ddev.site/content-examples/text/headers 5 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list 6 |
https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser 7 | https://crawler-devbox.ddev.site/content-examples/text/text-and-icon 8 | https://crawler-devbox.ddev.site/content-examples/text/text-in-columns 9 | https://crawler-devbox.ddev.site/content-examples/text/list-group 10 | https://crawler-devbox.ddev.site/content-examples/text/panel 11 | -------------------------------------------------------------------------------- /ext_localconf.php: -------------------------------------------------------------------------------- 1 | isPackageActive('indexed_search')) { 13 | // Register with "indexed_search" extension 14 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']['indexed_search'] = [ 15 | 'key' => 'tx_indexedsearch_reindex', 16 | 'value' => 'Re-indexing' 17 | ]; 18 | } 19 | 20 | 21 | -------------------------------------------------------------------------------- /Classes/Exception/TimeStampException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | * @deprecated since 12.0.5 will be removed in v14.x 25 | */ 26 | class TimeStampException extends \Exception 27 | { 28 | } 29 | -------------------------------------------------------------------------------- /Classes/Exception/CrawlerObjectException.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | * @deprecated since 12.0.5 will be removed in v14.x 25 | */ 26 | class CrawlerObjectException extends \Exception 27 | { 28 | } 29 | -------------------------------------------------------------------------------- /Classes/Helper/Sleeper/SleeperInterface.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | interface SleeperInterface 26 | { 27 | public function sleep(int $seconds): void; 28 | } 29 | -------------------------------------------------------------------------------- /Configuration/Icons.php: -------------------------------------------------------------------------------- 1 | [ 9 | 'provider' => SvgIconProvider::class, 10 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_configuration.svg', 11 | ], 12 | 'tx-crawler-start' => [ 13 | 'provider' => SvgIconProvider::class, 14 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_start.svg', 15 | ], 16 | 'tx-crawler-stop' => [ 17 | 'provider' => SvgIconProvider::class, 18 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_stop.svg', 19 | ], 20 | 'tx-crawler-icon' => [ 21 | 'provider' => SvgIconProvider::class, 22 | 'source' => 'EXT:crawler/Resources/Public/Icons/Extension.svg', 23 | ], 24 | ]; 25 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _executing-the-queue-label: 4 | 5 | =================== 6 | Executing the queue 7 | =================== 8 | 9 | The idea of the queue is that a large number of tasks can be submitted 10 | to the queue and performed over longer time. This could be interesting 11 | for several reasons; 12 | 13 | - To spread server load over time. 14 | 15 | - To time the requests for nightly processing. 16 | 17 | - And simply to avoid `max_execution_time` of PHP to limit processing 18 | to 30 seconds! 19 | 20 | 21 | .. 
toctree:: 22 | :maxdepth: 5 23 | :titlesonly: 24 | :glob: 25 | 26 | RunningViaCommandController/Index 27 | ExecutingQueueWithCron-job/Index 28 | RunViaBackend/Index 29 | BuildingAndExecutingQueueRightAway(fromCli)/Index 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help 2 | help: ## Displays this list of targets with descriptions 3 | @echo "The following commands are available:\n" 4 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[32m%-30s\033[0m %s\n", $$1, $$2}' 5 | 6 | 7 | .PHONY: docs 8 | docs: ## Generate projects docs (from "Documentation" directory) 9 | mkdir -p Documentation-GENERATED-temp 10 | docker run --rm --pull always -v "$(shell pwd)":/project -t ghcr.io/typo3-documentation/render-guides:latest --config=Documentation 11 | 12 | 13 | .PHONY: test-docs 14 | test-docs: ## Test the documentation rendering 15 | mkdir -p Documentation-GENERATED-temp 16 | docker run --rm --pull always -v "$(shell pwd)":/project -t ghcr.io/typo3-documentation/render-guides:latest --config=Documentation --no-progress --minimal-test 17 | -------------------------------------------------------------------------------- /Classes/Writer/FileWriter/CsvWriter/CsvWriterInterface.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | interface CsvWriterInterface 26 | { 27 | public function arrayToCsv(array $records): string; 28 | } 29 | -------------------------------------------------------------------------------- /Classes/Hooks/CrawlerHookInterface.php: -------------------------------------------------------------------------------- 1 | 9 | * (c) 2021- Tomas Norre Mikkelsen 10 | * 11 | * This file is part of the TYPO3 Crawler Extension. 12 | * 13 | * It is free software; you can redistribute it and/or modify it under 14 | * the terms of the GNU General Public License, either version 2 15 | * of the License, or any later version. 16 | * 17 | * For the full copyright and license information, please read the 18 | * LICENSE.txt file that was distributed with this source code. 19 | * 20 | * The TYPO3 project - inspiring people to share! 21 | */ 22 | 23 | /** 24 | * @internal since v12.0.0 25 | */ 26 | interface CrawlerHookInterface 27 | { 28 | public function crawler_init(): void; 29 | } 30 | -------------------------------------------------------------------------------- /Classes/Helper/Sleeper/NullSleeper.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | /** 23 | * @internal 24 | * @codeCoverageIgnore 25 | */ 26 | final class NullSleeper implements SleeperInterface 27 | { 28 | #[\Override] 29 | public function sleep(int $seconds): void 30 | { 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /Classes/Helper/Sleeper/SystemSleeper.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal 24 | */ 25 | final class SystemSleeper implements SleeperInterface 26 | { 27 | #[\Override] 28 | public function sleep(int $seconds): void 29 | { 30 | \sleep($seconds); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /Resources/Public/Icons/crawler_start.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Documentation/guides.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 11 | 17 | 18 | -------------------------------------------------------------------------------- /Documentation/Features/MultiprocessSupport/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | ..
_multi-process: 4 | 5 | ===================== 6 | Multi process support 7 | ===================== 8 | 9 | If you want to optimize the crawling process for speed (instead of low 10 | server stress), maybe because the machine is a dedicated staging 11 | machine, you should experiment with the new multi process features. 12 | 13 | In the extension settings you can set how many processes are allowed to 14 | run at the same time, how many queue entries a process should grab and 15 | how long a process is allowed to run. Then run one (or even more) 16 | crawling processes per minute. You'll be able to speed up the crawler quite a lot. 17 | 18 | But choose your settings carefully as it puts load on the server. 19 | 20 | .. figure:: /Images/crawler_settings_processLimit.png 21 | :alt: Backend configuration: Processing 22 | 23 | Backend configuration: Processing 24 | -------------------------------------------------------------------------------- /Classes/Process/ProcessManagerInterface.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share!
20 | */ 21 | 22 | /** 23 | * @codeCoverageIgnore 24 | * @internal since v12.0.10 25 | */ 26 | interface ProcessManagerInterface 27 | { 28 | public function processExists(int $pid): bool; 29 | 30 | public function killProcess(int $pid): void; 31 | 32 | public function findDispatcherProcesses(): array; 33 | } 34 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | | Release | TYPO3 | PHP | Fixes will contain 6 | |---------|-----------|---------|---| 7 | | 12.x.y | 12.4-13.3 | 8.1-8.4 |Features, Bugfixes, Security Updates, Since 12.0.6 TYPO3 13.4, Since 12.0.7 PHP 8.4 8 | | 11.x.y | 10.4-11.5 | 7.4-8.1 |Security Updates, Since 11.0.3 PHP 8.1 9 | | 10.x.y | 9.5-11.0 | 7.2-7.4 |Security Updates 10 | | 9.x.y | 9.5-11.0 | 7.2-7.4 |As this version has same requirements as 10.x.y, there will be no further releases of this version, please update instead. 11 | | 8.x.y | | | Releases do not exist 12 | | 7.x.y | | | Releases do not exist 13 | | 6.x.y | 7.6-8.7 | 5.6-7.3 | Security Updates 14 | 15 | ## Reporting a Vulnerability 16 | 17 | In case you find a security issue, please write an email to: [tomasnorre@gmail.com](mailto:tomasnorre@gmail.com) or reach out to the [TYPO3 Security Team](https://typo3.org/community/teams/security) 18 | -------------------------------------------------------------------------------- /Classes/Process/ProcessManagerFactory.php: -------------------------------------------------------------------------------- 1 | 7 | * 8 | * This file is part of the TYPO3 Crawler Extension. 9 | * 10 | * It is free software; you can redistribute it and/or modify it under 11 | * the terms of the GNU General Public License, either version 2 12 | * of the License, or any later version.
13 | * 14 | * For the full copyright and license information, please read the 15 | * LICENSE.txt file that was distributed with this source code. 16 | * 17 | * The TYPO3 project - inspiring people to share! 18 | */ 19 | 20 | use TYPO3\CMS\Core\Core\Environment; 21 | 22 | /** 23 | * @internal since v12.0.10 24 | */ 25 | class ProcessManagerFactory 26 | { 27 | public static function create(): ProcessManagerInterface 28 | { 29 | if (Environment::isWindows()) { 30 | return new WindowsProcessManager(); 31 | } 32 | 33 | return new UnixProcessManager(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Classes/Event/AfterUrlCrawledEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | final class AfterUrlCrawledEvent 26 | { 27 | public function __construct( 28 | private readonly string $url, 29 | private readonly array $result 30 | ) { 31 | } 32 | 33 | public function getUrl(): string 34 | { 35 | return $this->url; 36 | } 37 | 38 | public function getResult(): array 39 | { 40 | return $this->result; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Classes/Event/AfterUrlAddedToQueueEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v12.0.0 24 | */ 25 | final class AfterUrlAddedToQueueEvent 26 | { 27 | public function __construct( 28 | private readonly string $uid, 29 | private readonly array $fieldArray 30 | ) { 31 | } 32 | 33 | public function getUid(): string 34 | { 35 | return $this->uid; 36 | } 37 | 38 | public function getFieldArray(): array 39 | { 40 | return $this->fieldArray; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/RunningViaCommandController/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _command-controller: 4 | 5 | ========================== 6 | Run via command controller 7 | ========================== 8 | 9 | .. _command-controller-buildqueue: 10 | 11 | Create queue 12 | ------------ 13 | 14 | .. code-block:: bash 15 | :caption: replace vendor/bin/typo3 with your own cli runner 16 | 17 | $ vendor/bin/typo3 crawler:buildQueue [--depth ] [--number ] [--mode ] 18 | 19 | .. _command-controller-processqueue: 20 | 21 | Run queue 22 | --------- 23 | 24 | .. code-block:: bash 25 | :caption: replace vendor/bin/typo3 with your own cli runner 26 | 27 | $ vendor/bin/typo3 crawler:processQueue [--amount ] [--sleeptime ] [--sleepafter ] 28 | 29 | .. _command-controller-flushqueue: 30 | 31 | Flush queue 32 | ----------- 33 | 34 | .. 
code-block:: bash 35 | :caption: replace vendor/bin/typo3 with your own cli runner 36 | 37 | $ vendor/bin/typo3 crawler:flushQueue 38 | -------------------------------------------------------------------------------- /Documentation/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _start: 4 | 5 | ====================== 6 | Site Crawler Extension 7 | ====================== 8 | 9 | :Extension key: 10 | crawler 11 | 12 | :Package name: 13 | tomasnorre/crawler 14 | 15 | :Version: 16 | |release| 17 | 18 | :Language: 19 | en 20 | 21 | :Author: 22 | Tomas Norre Mikkelsen 23 | 24 | :Copyright: 25 | 2005-2021 AOE GmbH, since 2021 Tomas Norre Mikkelsen 26 | 27 | :License: 28 | This document is published under the `Open Content License 29 | `_. 30 | 31 | :Rendered: 32 | |today| 33 | 34 | ---- 35 | 36 | Libraries and scripts for crawling the TYPO3 page tree. Used for re-caching, re-indexing, publishing applications etc. 37 | 38 | ---- 39 | 40 | **Table of Contents:** 41 | 42 | .. toctree:: 43 | :maxdepth: 2 44 | :titlesonly: 45 | 46 | Introduction/Index 47 | Configuration/Index 48 | ExecutingTheQueue/Index 49 | Scheduler/Index 50 | UseCases/Index 51 | Features/Index 52 | Troubleshooting/Index 53 | Links/Links 54 | 55 | .. toctree:: 56 | :hidden: 57 | 58 | Sitemap 59 | -------------------------------------------------------------------------------- /Classes/Event/InvokeQueueChangeEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 
18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Domain\Model\Reason; 23 | 24 | /** 25 | * @internal since v12.0.0 26 | */ 27 | final class InvokeQueueChangeEvent 28 | { 29 | public function __construct( 30 | private readonly Reason $reason 31 | ) { 32 | } 33 | 34 | public function getReasonDetailedText(): string 35 | { 36 | return $this->reason->getDetailText(); 37 | } 38 | 39 | public function getReasonText(): string 40 | { 41 | return $this->reason->getReason(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /Classes/Service/UserService.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use TYPO3\CMS\Core\Utility\GeneralUtility; 23 | 24 | /** 25 | * @internal since v9.2.5 26 | */ 27 | class UserService 28 | { 29 | public static function hasGroupAccess(string $groupList, string $accessList): bool 30 | { 31 | if (empty($accessList)) { 32 | return true; 33 | } 34 | foreach (explode(',', $groupList) as $groupUid) { 35 | if (GeneralUtility::inList($accessList, $groupUid)) { 36 | return true; 37 | } 38 | } 39 | return false; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /Classes/Value/CrawlAction.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use Assert\Assert; 23 | 24 | /** 25 | * @internal since v9.2.5 26 | */ 27 | final class CrawlAction implements \Stringable 28 | { 29 | private readonly string $crawlAction; 30 | 31 | public function __construct(string $crawlAction) 32 | { 33 | Assert::that($crawlAction) 34 | ->inArray(['start', 'log', 'multiprocess']); 35 | 36 | $this->crawlAction = $crawlAction; 37 | } 38 | 39 | #[\Override] 40 | public function __toString(): string 41 | { 42 | return $this->crawlAction; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /Classes/Value/QueueFilter.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | use Assert\Assert; 23 | 24 | /** 25 | * @internal since v9.2.5 26 | */ 27 | class QueueFilter implements \Stringable 28 | { 29 | private readonly string $queueFilter; 30 | 31 | public function __construct(string $queueFilter = 'all') 32 | { 33 | Assert::that($queueFilter) 34 | ->inArray(['all', 'pending', 'finished']); 35 | 36 | $this->queueFilter = $queueFilter; 37 | } 38 | 39 | #[\Override] 40 | public function __toString(): string 41 | { 42 | return $this->queueFilter; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /Resources/Private/Language/locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | 9 | 10 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 11 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Classes/Service/ProcessInstructionService.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use TYPO3\CMS\Core\Utility\GeneralUtility; 23 | 24 | /** 25 | * @internal since v11.0.3 26 | */ 27 | class ProcessInstructionService 28 | { 29 | public function isAllowed(string $processInstruction, array $incoming): bool 30 | { 31 | if (empty($incoming)) { 32 | return true; 33 | } 34 | 35 | foreach ($incoming as $pi) { 36 | if (GeneralUtility::inList($processInstruction, $pi)) { 37 | return true; 38 | } 39 | } 40 | return false; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Classes/Event/ModifySkipPageEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension.
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 |
/**
 * Event carrying a page row together with a mutable "skipped" marker.
 *
 * The marker starts as false; listeners may replace it with a string
 * (presumably the reason for skipping the page - confirm against the dispatcher).
 *
 * @internal since v12.0.0
 */
final class ModifySkipPageEvent
{
    private readonly array $pageRow;

    private false|string $skipped = false;

    /**
     * @param array $pageRow page record handed to the listeners, never modified by this class
     */
    public function __construct(array $pageRow)
    {
        $this->pageRow = $pageRow;
    }

    /**
     * Returns the page row given at construction.
     */
    public function getPageRow(): array
    {
        return $this->pageRow;
    }

    /**
     * Returns the current marker: false, or the string set via setSkipped().
     */
    public function isSkipped(): false|string
    {
        return $this->skipped;
    }

    /**
     * Replaces the marker with false or a string.
     */
    public function setSkipped(false|string $skipped): void
    {
        $this->skipped = $skipped;
    }
}
49 | -------------------------------------------------------------------------------- /Classes/Event/BeforeQueueItemAddedEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share!
20 | */ 21 |
/**
 * Event exposing a queue id and a replaceable queue record array.
 *
 * @internal since v12.0.0
 */
final class BeforeQueueItemAddedEvent
{
    private readonly int $queueId;

    private array $queueRecord;

    /**
     * @param int $queueId id stored unchanged for the lifetime of the event
     * @param array $queueRecord initial record; can be swapped through setQueueRecord()
     */
    public function __construct(int $queueId, array $queueRecord)
    {
        $this->queueId = $queueId;
        $this->queueRecord = $queueRecord;
    }

    /**
     * Returns the queue id given at construction.
     */
    public function getQueueId(): int
    {
        return $this->queueId;
    }

    /**
     * Returns the record as it currently stands.
     */
    public function getQueueRecord(): array
    {
        return $this->queueRecord;
    }

    /**
     * Replaces the whole record.
     */
    public function setQueueRecord(array $queueRecord): void
    {
        $this->queueRecord = $queueRecord;
    }
}
48 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default_mode_exec.txt: -------------------------------------------------------------------------------- 1 | $ bin/typo3 crawler:buildQueue 6 default --depth 2 --mode exec 2 | https://crawler-devbox.ddev.site/content-examples/overview 3 | https://crawler-devbox.ddev.site/content-examples/text/rich-text 4 | https://crawler-devbox.ddev.site/content-examples/text/headers 5 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list 6 | https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser 7 | https://crawler-devbox.ddev.site/content-examples/text/text-and-icon 8 | https://crawler-devbox.ddev.site/content-examples/text/text-in-columns 9 | https://crawler-devbox.ddev.site/content-examples/text/list-group 10 | https://crawler-devbox.ddev.site/content-examples/text/panel 11 | ...
12 | Processing 13 | 14 | https://crawler-devbox.ddev.site/content-examples/overview () => 15 | 16 | OK: 17 | User Groups: 18 | 19 | https://crawler-devbox.ddev.site/content-examples/text/rich-text () => 20 | 21 | OK: 22 | User Groups: 23 | 24 | https://crawler-devbox.ddev.site/content-examples/text/headers () => 25 | 26 | OK: 27 | User Groups: 28 | 29 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list () => 30 | 31 | OK: 32 | User Groups: 33 | ... 34 | -------------------------------------------------------------------------------- /Classes/Hooks/ProcessCleanUpHook.php: -------------------------------------------------------------------------------- 1 | 9 | * (c) 2021- Tomas Norre Mikkelsen 10 | * 11 | * This file is part of the TYPO3 Crawler Extension. 12 | * 13 | * It is free software; you can redistribute it and/or modify it under 14 | * the terms of the GNU General Public License, either version 2 15 | * of the License, or any later version. 16 | * 17 | * For the full copyright and license information, please read the 18 | * LICENSE.txt file that was distributed with this source code. 19 | * 20 | * The TYPO3 project - inspiring people to share! 
21 | */ 22 | 23 | use AOE\Crawler\Process\Cleaner\OldProcessCleaner; 24 | use AOE\Crawler\Process\Cleaner\OrphanProcessCleaner; 25 |
/**
 * Crawler hook that triggers both process cleaners when the crawler initialises.
 *
 * @internal since v9.2.5
 */
class ProcessCleanUpHook implements CrawlerHookInterface
{
    private readonly OrphanProcessCleaner $orphanCleaner;

    private readonly OldProcessCleaner $oldCleaner;

    public function __construct(OrphanProcessCleaner $orphanCleaner, OldProcessCleaner $oldCleaner)
    {
        $this->orphanCleaner = $orphanCleaner;
        $this->oldCleaner = $oldCleaner;
    }

    /**
     * Calls clean() on the orphan cleaner first and on the old-process cleaner second.
     */
    #[\Override]
    public function crawler_init(): void
    {
        // Order matters for parity with the previous implementation: orphans, then old processes.
        foreach ([$this->orphanCleaner, $this->oldCleaner] as $cleaner) {
            $cleaner->clean();
        }
    }
}
44 | -------------------------------------------------------------------------------- /Classes/Event/AfterQueueItemAddedEvent.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share!
20 | */ 21 |
/**
 * Event exposing a queue id and a replaceable field array.
 *
 * @internal since v12.0.0
 */
final class AfterQueueItemAddedEvent
{
    /**
     * Deliberately left without a native type, as in the constructor signature.
     *
     * @var int|string
     */
    private $queueId;

    private array $fieldArray;

    /**
     * @param int|string $queueId
     * @param array $fieldArray initial fields; can be swapped through setFieldArray()
     */
    public function __construct($queueId, array $fieldArray)
    {
        $this->queueId = $queueId;
        $this->fieldArray = $fieldArray;
    }

    /**
     * Returns the queue id given at construction.
     */
    public function getQueueId(): int|string
    {
        return $this->queueId;
    }

    /**
     * Returns the field array as it currently stands.
     */
    public function getFieldArray(): array
    {
        return $this->fieldArray;
    }

    /**
     * Replaces the whole field array.
     */
    public function setFieldArray(array $fieldArray): void
    {
        $this->fieldArray = $fieldArray;
    }
}
51 | -------------------------------------------------------------------------------- /Classes/CrawlStrategy/CrawlStrategyFactory.php: -------------------------------------------------------------------------------- 1 | configurationProvider = $configurationProvider ?? GeneralUtility::makeInstance( 20 | ExtensionConfigurationProvider::class 21 | ); 22 | } 23 | 24 | public function create(): CrawlStrategyInterface 25 | { 26 | $extensionSettings = $this->configurationProvider->getExtensionConfiguration(); 27 | if ($extensionSettings['makeDirectRequests'] ?? false) { 28 | /** @var CrawlStrategyInterface $instance */ 29 | $instance = GeneralUtility::makeInstance(SubProcessExecutionStrategy::class, $this->configurationProvider); 30 | } else { 31 | $instance = GeneralUtility::makeInstance(GuzzleExecutionStrategy::class, $this->configurationProvider); 32 | } 33 | 34 | return $instance; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default.txt: -------------------------------------------------------------------------------- 1 | 38 entries found for processing.
(Use "mode" to decide action): 2 | 3 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/overview 4 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/rich-text 5 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/headers 6 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/bullet-list 7 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser 8 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-and-icon 9 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-in-columns 10 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/list-group 11 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/panel 12 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/table 13 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/quote 14 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/media/audio 15 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/media/text-and-images 16 | ... 17 | [10-04-20 10:36] https://crawler-devbox.ddev.site/content-examples/and-more/frames 18 | -------------------------------------------------------------------------------- /Classes/CrawlStrategy/CallbackExecutionStrategy.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
20 | */ 21 | 22 | use AOE\Crawler\Controller\CrawlerController; 23 | use TYPO3\CMS\Core\Utility\GeneralUtility; 24 |
/**
 * Used for hooks (e.g. crawling external files): delegates the work to a callback object.
 *
 * @internal since v12.0.0
 */
class CallbackExecutionStrategy
{
    /**
     * Instantiates $callbackClassName via GeneralUtility::makeInstance() and returns
     * whatever its crawler_execute() method returns.
     *
     * No interface is enforced on the callback class yet; it only has to offer
     * a crawler_execute($parameters, $crawlerController) method.
     *
     * @param class-string $callbackClassName class to instantiate
     * @param array $parameters passed through unchanged as first argument
     * @param CrawlerController $crawlerController passed through unchanged as second argument
     */
    public function fetchByCallback(string $callbackClassName, array $parameters, CrawlerController $crawlerController)
    {
        return GeneralUtility::makeInstance($callbackClassName)
            ->crawler_execute($parameters, $crawlerController);
    }
}
43 | -------------------------------------------------------------------------------- /Documentation/Features/PollableProcessingInstructions/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _pollable-processing: 4 | 5 | ================================ 6 | Pollable processing instructions 7 | ================================ 8 | 9 | Some processing instructions are never executed on the "client side" 10 | (the TYPO3 frontend that is called by the crawler). This happens for 11 | example if you try to staticpub a page containing non-cacheable 12 | elements. The bad thing about this is that staticpub doesn't have 13 | any chance to tell that something went wrong and why. That's why we 14 | introduced the "pollable processing instructions" feature. You can 15 | define in the :file:`ext_localconf.php` file of your extension that this 16 | extension should be "pollable" by adding the following line: 17 | 18 | ..
code-block:: php 19 | :caption: packages/my_extension/ext_localconf.php 20 | 21 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'][] = 'tx_staticpub'; 22 | 23 | In this case the crawler expects the extension to actively report that 24 | everything was ok, assuming that something went wrong (and displaying 25 | this in the log) if no "success message" was found. 26 | 27 | In your extension then simply write your "ok" status by calling this: 28 | 29 | .. code-block:: php 30 | :caption: packages/my_extension/ext_localconf.php 31 | 32 | $GLOBALS['TSFE']->applicationData['tx_crawler']['success']['tx_staticpub'] = true; 33 | 34 | -------------------------------------------------------------------------------- /Classes/Process/WindowsProcessManager.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 |
/**
 * Process lookup and termination for Windows hosts, implemented by shelling
 * out to the `tasklist` and `taskkill` command line tools.
 *
 * @codeCoverageIgnore
 * @internal since v12.0.10
 */
class WindowsProcessManager implements ProcessManagerInterface
{
    /**
     * Tells whether a process with exactly this PID is listed and its task
     * list entry mentions "php".
     */
    #[\Override]
    public function processExists(int $pid): bool
    {
        $taskLines = [];
        // Let tasklist filter on the exact PID. Piping the complete list through
        // `find "<pid>"` was a substring match, so PID 12 also hit PID 1234 or a
        // memory column containing "12".
        exec('tasklist /NH /FO CSV /FI "PID eq ' . $pid . '"', $taskLines);

        foreach ($taskLines as $taskLine) {
            if (stripos($taskLine, 'php') !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Terminates the process with the given PID.
     */
    #[\Override]
    public function killProcess(int $pid): void
    {
        // /F forces termination, mirroring the SIGKILL used on Unix; without it
        // taskkill only requests a close, which windowless console processes ignore.
        exec('taskkill /F /PID ' . $pid);
    }

    /**
     * Returns the task list lines that contain "typo3 crawler:processQueue".
     *
     * @return array output lines of the shell pipeline, empty when nothing matched
     */
    #[\Override]
    public function findDispatcherProcesses(): array
    {
        $returnArray = [];
        // Windows `find` only accepts a search string in double quotes; the former
        // single-quoted form made find abort with a parameter format error.
        // NOTE(review): default tasklist output shows image names, not command
        // lines, so this filter may still never match - confirm on a Windows host.
        exec('tasklist | find "typo3 crawler:processQueue"', $returnArray);

        return $returnArray;
    }
}
49 | -------------------------------------------------------------------------------- /Classes/CrawlStrategy/CrawlStrategyInterface.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use Psr\Http\Message\UriInterface; 23 | 24 | /** 25 | * @internal since v12.0.0 26 | */ 27 | interface CrawlStrategyInterface 28 | { 29 | /** 30 | * Fetch the given URL and return its textual response 31 | * 32 | * @return array|false "false" on errors without explanation. 33 | * Array may contain the following optional keys: 34 | * - errorlog: array of string error messages 35 | * - content: HTML content (string) 36 | * - running: bool 37 | * - parameters: array 38 | * - log: array of strings 39 | * - vars: array 40 | */ 41 | public function fetchUrlContents(UriInterface $url, string $crawlerId); 42 | } 43 | -------------------------------------------------------------------------------- /Classes/Utility/HookUtility.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension.
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use AOE\Crawler\Hooks\ProcessCleanUpHook; 23 |
/**
 * Static helper that writes the extension's hook registrations into
 * $GLOBALS['TYPO3_CONF_VARS'].
 *
 * @codeCoverageIgnore
 * @internal since v9.2.5
 */
class HookUtility
{
    /**
     * Appends ProcessCleanUpHook to the 'cli_hooks' and 'refresh_hooks' lists of
     * the given extension key and registers the DataHandler callback under
     * 'clearPageCacheEval'.
     *
     * @param string $extKey key below $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']
     */
    public static function registerHooks($extKey): void
    {
        // Crawler cli hooks first, refresh hooks second.
        foreach (['cli_hooks', 'refresh_hooks'] as $hookGroup) {
            $GLOBALS['TYPO3_CONF_VARS']['EXTCONF'][$extKey][$hookGroup][] = ProcessCleanUpHook::class;
        }

        $dataHandlerCallback = "AOE\Crawler\Hooks\DataHandlerHook->addFlushedPagesToCrawlerQueue";
        $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['clearPageCacheEval'][] =
            $dataHandlerCallback;
    }
}
49 | -------------------------------------------------------------------------------- /Resources/Public/Icons/Extension.svg: -------------------------------------------------------------------------------- 1 | ext_icon_crawler -------------------------------------------------------------------------------- /Resources/Public/Icons/crawler_configuration.svg: -------------------------------------------------------------------------------- 1 | ext_icon_crawler_transparent -------------------------------------------------------------------------------- /Classes/Writer/FileWriter/CsvWriter/CrawlerCsvWriter.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension.
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use TYPO3\CMS\Core\Utility\CsvUtility; 23 |
/**
 * Turns a list of records into CSV text: one header line built from the keys
 * of the first record, followed by one line per record, joined with CR LF.
 *
 * @internal since v9.2.5
 */
final class CrawlerCsvWriter implements CsvWriterInterface
{
    private const CARRIAGE_RETURN = 13;
    private const LINE_FEED = 10;

    /**
     * @param array $records list of rows; each row is handed to CsvUtility::csvValues()
     * @return string CSV text, or an empty string when there are no records
     */
    #[\Override]
    public function arrayToCsv(array $records): string
    {
        // Without a first row there are no column names to derive; previously this
        // path ended in a TypeError from array_keys(false).
        if ($records === []) {
            return '';
        }

        $csvLines = [$this->getRowHeaders($records)];
        foreach ($records as $row) {
            $csvLines[] = CsvUtility::csvValues($row);
        }

        return implode(chr(self::CARRIAGE_RETURN) . chr(self::LINE_FEED), $csvLines);
    }

    /**
     * Builds the header line from the keys of the first row.
     *
     * @param array $lines non-empty list of rows
     */
    private function getRowHeaders(array $lines): string
    {
        // reset() fetches the first row regardless of where the caller left the
        // array's internal pointer.
        $firstRow = reset($lines);
        $fieldNames = array_map(strval(...), array_keys($firstRow));

        return CsvUtility::csvValues($fieldNames);
    }
}
52 | -------------------------------------------------------------------------------- /Documentation/Features/AutomaticAddPagesToQueue/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _add-to-queue: 4 | 5 | ============================ 6 | Automatic add pages to Queue 7 | ============================ 8 | 9 | .. versionadded:: 9.1.0 10 | 11 | .. _add-to-queue-edit: 12 | 13 | Edit Pages 14 | ---------- 15 | 16 | With this feature, you will automatically add pages to the crawler queue 17 | when you are editing content on the page, unless it's within a workspace, then 18 | it will not be added to the queue before it's published.
19 | 20 | This functionality gives you the advantage that you do not need to keep track 21 | of which pages you have edited; they will automatically be handled by the next crawler 22 | process task, see :ref:`executing-the-queue-label`. This ensures that 23 | your cache or e.g. Search Index is always up to date and the end-users will see 24 | the most current content as soon as possible. 25 | 26 | .. _add-to-queue-cache: 27 | 28 | Clear Page Single Cache 29 | ----------------------- 30 | 31 | As the edit and clear page cache functions use the same dataHandler hooks, 32 | we have an additional feature for free. When you clear the page cache for a specific 33 | page then it will also be added automatically to the crawler queue. Again this will 34 | be processed during the next crawler process. 35 | 36 | .. figure:: /Images/backend_clear_cache.png 37 | :alt: Clearing the page cache 38 | 39 | Clearing the page cache 40 | 41 | .. figure:: /Images/backend_clear_cache_queue.png 42 | :alt: Page is added to the crawler queue 43 | 44 | Page is added to the crawler queue 45 | -------------------------------------------------------------------------------- /Resources/Private/Language/da.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Side Id som crawleren vil bruge for at indlæse TSFE (påkrævet) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | Når en side crawles direkte fra TYPO3 Backend. fx. ved at bruge "læs" funktionaliteten i "Crawler Log" modulet, bruges den valgte siden til at initialisere frontend renderingen. Adgang til den valgte side <strong>MÅ IKKE</strong> være begrænset, i så fald vil crawlingen fejle. 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /Classes/Process/UnixProcessManager.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 |
/**
 * Process lookup and termination for Unix-like hosts, based on the /proc
 * directory, posix_kill() and the `ps` command.
 *
 * @codeCoverageIgnore
 * @internal since v12.0.10
 */
class UnixProcessManager implements ProcessManagerInterface
{
    /**
     * Signal number handed to posix_kill() by killProcess().
     */
    private const KILL_SIGNAL = 9;

    /**
     * Reports whether a /proc/<pid> entry exists.
     */
    #[\Override]
    public function processExists(int $pid): bool
    {
        $procEntry = '/proc/' . $pid;

        return file_exists($procEntry);
    }

    /**
     * Sends signal 9 to the given process; the result of posix_kill() is not inspected.
     */
    #[\Override]
    public function killProcess(int $pid): void
    {
        posix_kill($pid, self::KILL_SIGNAL);
    }

    /**
     * Returns the `ps aux` lines that contain "typo3 crawler:processQueue".
     *
     * Raises an E_USER_WARNING and returns an empty array when `which ps`
     * yields nothing.
     *
     * @return array output lines of the shell pipeline
     */
    #[\Override]
    public function findDispatcherProcesses(): array
    {
        $psLines = [];

        if (!exec('which ps')) {
            trigger_error(
                'Crawler is unable to locate the ps command to clean up orphaned crawler processes.',
                E_USER_WARNING
            );

            return $psLines;
        }

        // ps command is defined
        exec("ps aux | grep 'typo3 crawler:processQueue'", $psLines);

        return $psLines;
    }
}
57 | -------------------------------------------------------------------------------- /Resources/Private/Language/af.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ar.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ca.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/cs.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/el.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/fi.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/fr.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/he.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/hu.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/it.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ja.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ko.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/nl.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/no.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/pl.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ro.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/ru.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/sr.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/tr.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/uk.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/vi.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/es.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/pt.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/sv.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Resources/Private/Language/zh.locallang_csh_tx_crawler_configuration.xlf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 | Page Id the crawler will use for initializing the TSFE (required) 8 | Page Id the crawler will use for initializing the TSFE (required) 9 | 10 | 11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering. 14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Classes/Configuration/ExtensionConfigurationProvider.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
use Psr\Log\LoggerAwareInterface;
use Psr\Log\LoggerAwareTrait;
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationExtensionNotConfiguredException;
use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationPathDoesNotExistException;
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Reads the configuration stored for the "crawler" extension.
 *
 * @internal since v9.2.5
 */
class ExtensionConfigurationProvider implements LoggerAwareInterface
{
    use LoggerAwareTrait;

    /**
     * Fetch the complete configuration of the "crawler" extension.
     *
     * If the extension is not configured, or the requested configuration
     * path does not exist, the exception message is written to the logger
     * (when one has been set) and an empty array is returned instead.
     *
     * @return array the extension configuration, or [] on the failures above
     */
    public function getExtensionConfiguration(): array
    {
        $settings = [];

        try {
            $configurationReader = GeneralUtility::makeInstance(ExtensionConfiguration::class);
            $settings = $configurationReader->get('crawler');
        } catch (ExtensionConfigurationExtensionNotConfiguredException|ExtensionConfigurationPathDoesNotExistException $exception) {
            // The logger is optional (LoggerAwareTrait), hence the null-safe call.
            $this->logger?->error($exception->getMessage());
        }

        return $settings;
    }
}
use TYPO3\CMS\Core\Core\Environment;
use TYPO3\CMS\Core\SingletonInterface;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Tracks the "disabled" state of the crawler through a flag file:
 * the crawler counts as disabled exactly while that file exists.
 *
 * @internal since v9.2.5
 */
final class Crawler implements SingletonInterface
{
    private readonly string $processFilename;

    /**
     * @param string|null $processFilename path of the flag file; when empty,
     *                                     "<var path>/lock/tx_crawler.proc" is used
     */
    public function __construct(?string $processFilename = null)
    {
        $this->processFilename = empty($processFilename)
            ? Environment::getVarPath() . '/lock/tx_crawler.proc'
            : $processFilename;

        // Instantiating the class removes an existing flag file, i.e. enables the crawler.
        $this->setDisabled(false);

        // Make sure the directory that will hold the flag file exists.
        GeneralUtility::mkdir_deep(\dirname($this->processFilename));
    }

    /**
     * Create the flag file (disable) or delete it if present (enable).
     */
    public function setDisabled(bool $disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }

    /**
     * @return bool true while the flag file exists
     */
    public function isDisabled(): bool
    {
        return is_file($this->processFilename);
    }
}
use Psr\Http\Message\UriInterface;
use TYPO3\CMS\Backend\Routing\Exception\RouteNotFoundException;
use TYPO3\CMS\Backend\Routing\UriBuilder;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Helper for building backend module URLs.
 *
 * @internal since v9.2.5
 */
class UrlBuilder
{
    /**
     * Returns the URL to the given backend module route, carrying over the
     * "id" of the current request when one is present.
     *
     * The id is looked up in the parsed body first and in the query
     * parameters second. It is only added when it is truthy, so an id of
     * 0 / "0" is not forwarded. When no global request object is available,
     * the URL is built without an id instead of failing on a null request.
     *
     * @param array $uriParameters optional parameters to add to the URL
     * @param string $module route identifier handed to the backend UriBuilder
     *
     * @throws RouteNotFoundException
     */
    public static function getBackendModuleUrl(
        array $uriParameters = [],
        string $module = 'web_site_crawler'
    ): UriInterface {
        // The global request may be missing; "??" alone would not prevent
        // a method call on null, so read it null-safely.
        $request = $GLOBALS['TYPO3_REQUEST'] ?? null;
        $id = $request?->getParsedBody()['id'] ?? $request?->getQueryParams()['id'] ?? null;
        if ($id) {
            $uriParameters['id'] = $id;
        }

        $uriBuilder = GeneralUtility::makeInstance(UriBuilder::class);
        return $uriBuilder->buildUriFromRoute($module, $uriParameters);
    }
}
42 | 43 | **INFO** 44 | xdebug is disabled by default, to speed up the devbox when xdebug isn't needed. 45 | 46 | This can be activated with `ddev xdebug on`. 47 | 48 | #### Running tests without local development environment 49 | If you don't have `php` and/or `composer` installed on your host machine, 50 | you can run the tests from within the `ddev` docker container. 51 | 52 | To do that, go into the `.devbox` folder and run `ddev ssh`. 53 | From there you need to switch folder into `/public/typo3conf/ext/crawler` 54 | and run `composer` commands from there (see above). 55 | -------------------------------------------------------------------------------- /Documentation/Scheduler/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _scheduler: 4 | 5 | ========= 6 | Scheduler 7 | ========= 8 | 9 | 10 | .. toctree:: 11 | :maxdepth: 5 12 | :titlesonly: 13 | :glob: 14 | 15 | 16 | As seen in :ref:`executing-the-queue-label` you can execute the queue in 17 | multiple ways, but it's no fun doing that manually all the time. 18 | 19 | With the Crawler you have the possibility to add Scheduler Tasks to be executed 20 | at a given time. The Crawler commands are implemented with the Symfony Console, 21 | and therefore they can be configured with the Core supported 22 | `Execute console commands (scheduler)` task. 23 | 24 | So how to set up crawler scheduler tasks: 25 | 26 | 1. Add a new Scheduler Task 27 | 2. Select the class :guilabel:`Execute console commands` 28 | 3. Select :guilabel:`Frequency` for the execution 29 | 4. Go to section :guilabel:`Schedulable Command. Save and reopen to define 30 | command arguments` at the bottom. 31 | 5. Select e.g. :guilabel:`crawler:buildQueue` (press save) 32 | 6. Select the options you want to execute the queue with, it's important to 33 | check the checkboxes and not only fill in the values. 
34 | 35 | Now you can save and close, and your scheduler tasks will be running as 36 | configured. 37 | 38 | The configured task will look like this: 39 | 40 | .. figure:: /Images/backend_scheduler_record.png 41 | :alt: Task configuration for building the queue 42 | 43 | Task configuration for building the queue 44 | 45 | After saving and closing, you can see which command is executed. It takes 46 | the same parameters you can use when running from the CLI, 47 | see :ref:`executing-the-queue-cli-label` 48 | 49 | .. figure:: /Images/backend_scheduler_overview.png 50 | :alt: Task in the scheduled tasks overview 51 | 52 | Task in the scheduled tasks overview 53 | -------------------------------------------------------------------------------- /Classes/Utility/PhpBinaryUtility.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
use AOE\Crawler\Exception\CommandNotFoundException;
use AOE\Crawler\Exception\ExtensionSettingsException;
use TYPO3\CMS\Core\Utility\CommandUtility;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Resolves which PHP binary the crawler should use, based on the
 * extension settings "phpPath" and "phpBinary".
 *
 * @internal since v9.2.5
 */
class PhpBinaryUtility
{
    /**
     * Determine the PHP binary from the extension configuration.
     *
     * A non-empty "phpPath" setting is returned as is. Otherwise the
     * "phpBinary" setting is resolved through CommandUtility::getCommand().
     *
     * @throws ExtensionSettingsException when the extension settings are empty
     * @throws CommandNotFoundException when the configured phpBinary cannot be resolved
     */
    public static function getPhpBinary(): string
    {
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $extensionSettings = $configurationProvider->getExtensionConfiguration();

        if (empty($extensionSettings)) {
            throw new ExtensionSettingsException('ExtensionSettings are empty', 1_587_066_853);
        }

        // An explicitly configured path takes precedence over the binary lookup.
        if (!empty($extensionSettings['phpPath'])) {
            return $extensionSettings['phpPath'];
        }

        $resolvedCommand = CommandUtility::getCommand($extensionSettings['phpBinary']);
        if ($resolvedCommand === false) {
            throw new CommandNotFoundException(
                'The phpBinary: "' . $extensionSettings['phpBinary'] . '" could not be found!',
                1_587_068_215
            );
        }

        return $resolvedCommand;
    }
}
'/src/OptionalCommand.php', [ErrorType::SHADOW_DEPENDENCY]) 20 | //->ignoreErrorsOnExtension('ext-intl', [ErrorType::SHADOW_DEPENDENCY]) 21 | //->ignoreErrorsOnExtensionAndPath('ext-sqlite3', __DIR__ . '/tests', [ErrorType::SHADOW_DEPENDENCY]) 22 | 23 | //// Ignoring unknown symbols 24 | //->ignoreUnknownClasses(['Memcached']) 25 | //->ignoreUnknownClassesRegex('~^DDTrace~') 26 | //->ignoreUnknownFunctions(['opcache_invalidate']) 27 | //->ignoreUnknownFunctionsRegex('~^opcache_~') 28 | 29 | //// Adjust analysis 30 | //->enableAnalysisOfUnusedDevDependencies() // dev packages are often used only in CI, so this is not enabled by default 31 | //->disableReportingUnmatchedIgnores() // do not report ignores that never matched any error 32 | //->disableExtensionsAnalysis() // do not analyse ext-* dependencies 33 | 34 | //// Use symbols from yaml/xml/neon files 35 | // - designed for DIC config files (see below) 36 | // - beware that those are not validated and do not even trigger unknown class error 37 | //->addForceUsedSymbols($classesExtractedFromNeonJsonYamlXmlEtc); 38 | -------------------------------------------------------------------------------- /Classes/Process/Cleaner/OldProcessCleaner.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
use AOE\Crawler\Domain\Repository\ProcessRepository;
use AOE\Crawler\Domain\Repository\QueueRepository;
use AOE\Crawler\Process\ProcessManagerInterface;

/**
 * Cleans up crawler processes that the process repository reports as
 * active for more than one hour.
 *
 * @internal since v12.0.10
 */
class OldProcessCleaner
{
    public function __construct(
        private readonly ProcessRepository $processRepository,
        private readonly QueueRepository $queueRepository,
        private readonly ProcessManagerInterface $processManager
    ) {
    }

    /**
     * Kill and unregister every stale process.
     *
     * For each record with a system process id greater than 1 the system
     * process is killed if it still exists, the process record is removed
     * and the process id is unset on the queue. Records with a system
     * process id of 1 or lower are left untouched.
     *
     * @throws \UnexpectedValueException when the repository does not return an array
     */
    public function clean(): void
    {
        $staleProcesses = $this->processRepository->getActiveProcessesOlderThanOneHour();

        if (!is_array($staleProcesses)) {
            throw new \UnexpectedValueException('Expected array, got ' . gettype($staleProcesses));
        }

        foreach ($staleProcesses as $processRow) {
            $pid = (int) $processRow['system_process_id'];
            $crawlerProcessId = $processRow['process_id'];

            if ($pid > 1) {
                if ($this->processManager->processExists($pid)) {
                    $this->processManager->killProcess($pid);
                }

                $this->processRepository->removeByProcessId($crawlerProcessId);
                $this->queueRepository->unsetQueueProcessId($crawlerProcessId);
            }
        }
    }
}
use Psr\Http\Message\ServerRequestInterface;
use TYPO3\CMS\Backend\Routing\UriBuilder;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Builds the URL templates used by form elements of the backend module.
 */
class BackendModuleScriptUrlService
{
    /**
     * Build the URL of the current backend route for a checkbox/dropdown element.
     *
     * The result consists of the route URL (with "id" set to $pageUid),
     * followed by $queryString, all $queryParameters except the one named
     * $elementName, and finally "&<elementName>=${value}".
     *
     * NOTE(review): "${value}" is emitted literally; presumably it is a
     * placeholder that is substituted client-side - confirm against the caller.
     *
     * @param ServerRequestInterface $request request whose "route" attribute identifies the module route
     * @param string $elementName name of the form element the URL is built for
     * @param int $pageUid page uid passed as "id"
     * @param array $queryParameters additional parameters to carry over
     * @param string $queryString raw query string fragment appended unchanged
     */
    public function buildScriptUrl(
        ServerRequestInterface $request,
        string $elementName,
        int $pageUid,
        array $queryParameters,
        string $queryString = ''
    ): string {
        $mainParams = [
            'id' => $pageUid,
        ];
        $uriBuilder = GeneralUtility::makeInstance(UriBuilder::class);
        $route = $request->getAttribute('route');
        $scriptUrl = (string) $uriBuilder->buildUriFromRoute($route->getOption('_identifier'), $mainParams);

        return $scriptUrl
            . $queryString
            . $this->getAdditionalQueryParams($elementName, $queryParameters)
            . '&' . $elementName . '=${value}';
    }

    /**
     * Build query string with affected checkbox/dropdown value removed.
     *
     * Every remaining parameter is rendered as "&key=value". Names and values
     * are URL-encoded, so characters such as "&", "=", "#" or spaces cannot
     * break the resulting URL, and nested arrays are expanded to
     * "key[sub]=value" pairs instead of being cast to the string "Array".
     *
     * @param string $keyToBeRemoved parameter that must not appear in the result
     * @param array $queryParameters parameters to serialize
     * @return string query string fragment starting with "&", or "" when nothing remains
     */
    private function getAdditionalQueryParams(string $keyToBeRemoved, array $queryParameters): string
    {
        unset($queryParameters[$keyToBeRemoved]);

        // Arguments: no name prefix, empty start string, keep blank values,
        // rawurlencode parameter names as well as values.
        return GeneralUtility::implodeArrayForUrl('', $queryParameters, '', false, true);
    }
}
20 | */ 21 | 22 | use TYPO3\CMS\Core\Package\PackageManager; 23 | use TYPO3\CMS\Core\Utility\GeneralUtility; 24 | 25 | /** 26 | * @internal since v9.2.5 27 | */ 28 | class TcaUtility 29 | { 30 | /** 31 | * Get crawler processing instructions. 32 | * This function is called as a itemsProcFunc in tx_crawler_configuration.processing_instruction_filter 33 | * 34 | * @return array 35 | */ 36 | public function getProcessingInstructions(array $configuration) 37 | { 38 | if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] ?? null)) { 39 | foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $extensionKey => $extensionConfiguration) { 40 | $configuration['items'][] = [ 41 | 'label' => $extensionConfiguration['value'] . ' [' . $extensionConfiguration['key'] . ']', 42 | 'value' => $extensionConfiguration['key'], 43 | 'icon' => $this->getExtensionIcon($extensionKey), 44 | ]; 45 | } 46 | } 47 | 48 | return $configuration; 49 | } 50 | 51 | private function getExtensionIcon(string $extensionKey): string 52 | { 53 | $packageManager = GeneralUtility::makeInstance(PackageManager::class); 54 | $package = $packageManager->getPackage($extensionKey); 55 | if ($package->getPackageIcon()) { 56 | return $package->getPackagePath() . 
$package->getPackageIcon(); 57 | } 58 | return ''; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/_NewsDetailEventListener.php: -------------------------------------------------------------------------------- 1 | getAssignedValues(); 14 | $newsItem = $assignedValues['newsItem']; 15 | $demand = $assignedValues['demand']; 16 | $settings = $assignedValues['settings']; 17 | 18 | if ($newsItem !== null) { 19 | $demandedCategories = $demand->getCategories(); 20 | $itemCategories = $newsItem->getCategories()->toArray(); 21 | $itemCategoryIds = \array_map(function ($category) { 22 | return (string) $category->getUid(); 23 | }, $itemCategories); 24 | 25 | if (count($demandedCategories) > 0 && !$this::itemMatchesCategoryDemand( 26 | $settings['categoryConjunction'], 27 | $itemCategoryIds, 28 | $demandedCategories 29 | )) { 30 | $assignedValues['newsItem'] = null; 31 | $event->setAssignedValues($assignedValues); 32 | } 33 | } 34 | } 35 | 36 | protected static function itemMatchesCategoryDemand( 37 | string $categoryConjunction, 38 | array $itemCategoryIds, 39 | array $demandedCategories 40 | ): bool { 41 | $numOfDemandedCategories = \count($demandedCategories); 42 | $intersection = \array_intersect($itemCategoryIds, $demandedCategories); 43 | $numOfCommonItems = \count($intersection); 44 | 45 | switch ($categoryConjunction) { 46 | case 'AND': 47 | return $numOfCommonItems === $numOfDemandedCategories; 48 | case 'OR': 49 | return $numOfCommonItems > 0; 50 | case 'NOTAND': 51 | return $numOfCommonItems < $numOfDemandedCategories; 52 | case 'NOTOR': 53 | return $numOfCommonItems === 0; 54 | } 55 | return true; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Classes/Value/QueueRow.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 
/**
 * Mutable data holder for one row of queue output.
 *
 * All members are public; the setters only assign the given value.
 *
 * @internal
 */
class QueueRow
{
    /** Page title as markup (set via setPageTitleHTML()). */
    public string $pageTitleHTML = '';

    /** Free-text message attached to the row. */
    public string $message = '';

    public string $configurationKey = '';

    public string $parameterConfig = '';

    public string $valuesExpanded = '';

    public string $urls = '';

    /** @var array arbitrary option values; structure is defined by the caller */
    public array $options = [];

    public string $parameters = '';

    /**
     * @param string $pageTitle plain page title, exposed as public property
     */
    public function __construct(
        public string $pageTitle = ''
    ) {
    }

    /** Stores the page title markup. */
    public function setPageTitleHTML(string $pageTitleHTML): void
    {
        $this->pageTitleHTML = $pageTitleHTML;
    }

    /** Stores the row message. */
    public function setMessage(string $message): void
    {
        $this->message = $message;
    }

    /** Stores the configuration key. */
    public function setConfigurationKey(string $configurationKey): void
    {
        $this->configurationKey = $configurationKey;
    }

    /** Stores the parameter configuration string. */
    public function setParameterConfig(string $parameterConfig): void
    {
        $this->parameterConfig = $parameterConfig;
    }

    /** Stores the expanded values string. */
    public function setValuesExpanded(string $valuesExpanded): void
    {
        $this->valuesExpanded = $valuesExpanded;
    }

    /** Stores the URLs string. */
    public function setUrls(string $urls): void
    {
        $this->urls = $urls;
    }

    /** Replaces the options array. */
    public function setOptions(array $options): void
    {
        $this->options = $options;
    }

    /** Stores the parameters string. */
    public function setParameters(string $parameters): void
    {
        $this->parameters = $parameters;
    }
}
-------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _priority-crawling: 4 | 5 | ================= 6 | Priority Crawling 7 | ================= 8 | 9 | .. versionadded:: 9.1.0 10 | 11 | Some websites have quite a large number of pages. Some pages are logically more 12 | important than others e.g. the start-, support-, product-, you name it-pages. 13 | These important pages are also the pages where we want to have the best caching 14 | and performance, as they will most likely be the pages with the most changes and 15 | the most traffic. 16 | 17 | With TYPO3 10 LTS, `sysext/seo` introduced, among other things, the 18 | `sitemap_priority`, which is used to generate an SEO optimised sitemap.xml 19 | where page priorities are listed as well. Their priorities will most likely be higher the 20 | more important the page is for you and the end-user. 21 | 22 | This logic is something that we can benefit from in the Crawler as well. A 23 | Website with let us say 10.000 pages, will have different importance depending on 24 | the page you are at. Therefore we have changed the functionality of the crawler, 25 | to take the value of this field, ranging from 0.0 to 1.0, into consideration when 26 | processing the crawler queue. This means that if you have a page with high priority 27 | for your sitemap, it will also be crawled first when a new crawler process is 28 | added. 29 | 30 | This ensures that we will always crawl the pages that have the highest importance to 31 | you and your end-user based on your sitemap priority. We chose to 32 | reuse this field, to not have editors doing work that is more or less similar twice. 33 | 34 | If you don't want to use this functionality, it's ok. You can just ignore the 35 | options that the `sysext/seo` gives you and all pages will by default get a priority of 36 | 0.5, and therefore do not influence the processing order as every page will have the 37 | same priority.
38 | 39 | The existing :guilabel:`SEO` tab will be used to set priorities when editing 40 | pages. 41 | 42 | .. image:: /Images/backend_crawler_seo_v10.png 43 | 44 | .. figure:: /Images/backend_crawler_seo_priority_v10.png 45 | :alt: The SEO tab will contain the sitemap_priority field 46 | 47 | The SEO tab will contain the sitemap_priority field 48 | -------------------------------------------------------------------------------- /Classes/Hooks/DataHandlerHook.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 
use AOE\Crawler\Domain\Repository\QueueRepository;
use AOE\Crawler\Service\QueueService;
use TYPO3\CMS\Core\DataHandling\DataHandler;
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * DataHandler hook that queues pages for crawling when their cache is flushed.
 *
 * @internal since v9.2.5
 */
class DataHandlerHook
{
    /**
     * Adds every page id listed in $parameters['pageIdArray'] to the crawler
     * queue, skipping ids below 1, pages the page repository cannot resolve,
     * and pages that are already queued.
     *
     * A missing or non-iterable 'pageIdArray' entry is treated as "nothing to do"
     * instead of raising an undefined-key / invalid-foreach warning.
     *
     * @param array $parameters hook parameters; only 'pageIdArray' is read
     * @param DataHandler $dataHandler unused, required by the hook signature
     *
     * @noRector \Rector\DeadCode\Rector\ClassMethod\RemoveUnusedParameterRector
     */
    public function addFlushedPagesToCrawlerQueue(array $parameters, DataHandler $dataHandler): void
    {
        $pageIdsToBeFlushedFromCache = $parameters['pageIdArray'] ?? [];
        if (empty($pageIdsToBeFlushedFromCache) || !is_iterable($pageIdsToBeFlushedFromCache)) {
            return;
        }
        foreach ($pageIdsToBeFlushedFromCache as $pageId) {
            $pageId = (int) $pageId;
            if ($pageId < 1 || empty($this->getPageRepository()->getPage($pageId))) {
                continue;
            }
            if ($this->getQueueRepository()->isPageInQueue($pageId)) {
                continue;
            }
            $this->getQueueService()->addPageToQueue($pageId);
        }
    }

    /**
     * Returns the queue repository via GeneralUtility::makeInstance().
     */
    public function getQueueRepository(): QueueRepository
    {
        return GeneralUtility::makeInstance(QueueRepository::class);
    }

    /**
     * Returns the queue service via GeneralUtility::makeInstance().
     */
    public function getQueueService(): QueueService
    {
        return GeneralUtility::makeInstance(QueueService::class);
    }

    /**
     * Returns the page repository via GeneralUtility::makeInstance().
     */
    public function getPageRepository(): PageRepository
    {
        return GeneralUtility::makeInstance(PageRepository::class);
    }
}
13 | * 14 | * For the full copyright and license information, please read the 15 | * LICENSE.txt file that was distributed with this source code. 16 | * 17 | * The TYPO3 project - inspiring people to share! 18 | */ 19 | 20 | namespace AOE\Crawler\Controller\Ajax; 21 | 22 | use AOE\Crawler\Domain\Repository\ProcessRepository; 23 | use Psr\Http\Message\ResponseInterface; 24 | use Psr\Http\Message\ServerRequestInterface; 25 | use TYPO3\CMS\Core\Http\Response; 26 | 27 | /** 28 | * @internal since v12.0.10 29 | */ 30 | class ProcessStatusController 31 | { 32 | public function __construct( 33 | private readonly ProcessRepository $processRepository, 34 | ) { 35 | } 36 | 37 | public function getProcessStatus(ServerRequestInterface $request): ResponseInterface 38 | { 39 | $body = $request->getBody()->getContents(); 40 | $data = json_decode($body, true); 41 | $id = $data['id'] ?? null; 42 | 43 | $response = new Response(); 44 | 45 | if ($id === null) { 46 | return $response->withStatus(400, 'No process ID provided'); 47 | } 48 | 49 | $process = $this->processRepository->findByProcessId($id); 50 | if ($process === null) { 51 | return $response->withStatus(404, 'Process with ID: ' . $id . ' not found'); 52 | } 53 | 54 | $content = json_encode( 55 | [ 56 | 'status' => $process->getProgress(), 57 | 'processedItems' => $process->getAmountOfItemsProcessed(), 58 | 'runtime' => $process->getRuntime(), 59 | 'processId' => $process->getProcessId(), 60 | ] 61 | ); 62 | if ($content === false) { 63 | throw new \RuntimeException('Failed to encode JSON response', 1760971184); 64 | } 65 | $response->getBody()->write($content); 66 | return $response; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /Documentation/Introduction/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _introduction: 4 | 5 | ============ 6 | Introduction 7 | ============ 8 | 9 | .. 
_introduction-what: 10 | 11 | What does it do? 12 | ================ 13 | 14 | The TYPO3 Crawler is an extension which provides possibilities, from both 15 | the TYPO3 backend and the CLI, that help you with your cache and e.g. 16 | search index. 17 | 18 | The Crawler implements several PSR-14 events, that you can use to "hook" into 19 | if you have certain requirements for your site at the given time. 20 | 21 | See more :ref:`psr14-modify-skip-page-event`. 22 | 23 | It features an API that other extensions can plug into. An example of this 24 | is "indexed\_search" which uses crawler to index content defined by 25 | its Indexing Configurations. Other extensions supporting it are 26 | "staticpub" (publishing to static pages) or "cachemgm" (allows 27 | recaching of pages). 28 | 29 | The requesting of URLs is specially designed to request TYPO3 frontends 30 | with special processing instructions. The request sends a TYPO3 31 | specific header in the GET request which identifies a special action. 32 | For instance the action requested could be to publish the URL to a 33 | static file or it could be to index its content - or re-cache the 34 | page. These processing instructions are also defined by third-party 35 | extensions (and indexed search is one of them). In this way a 36 | processing instruction can instruct the frontend to perform an action 37 | (like indexing, publishing etc.) which cannot be done with a request 38 | from outside. 39 | 40 | .. _introduction-screenshots: 41 | 42 | Screenshots 43 | =========== 44 | 45 | The extension provides a backend module which displays the queue and log and 46 | allows execution and status check of the "cronscript" from the backend for 47 | testing purposes. 48 | 49 | .. figure:: /Images/backend_processlist.png 50 | 51 | CLI status display 52 | 53 | CLI = Command Line Interface = shell script = cron script 54 | 55 | ..
figure:: /Images/backend_crawlerlog.png 56 | 57 | Crawler queue (before processing) / log (after processing) 58 | 59 | .. figure:: /Images/backend_pendingurls.png 60 | 61 | Interface for submitting a batch of URLs to be crawled 62 | 63 | The parameter combinations are programmable through Page TSconfig or 64 | configuration records. 65 | -------------------------------------------------------------------------------- /Documentation/Configuration/Examples/News/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _example-configuration-news: 4 | 5 | ======== 6 | EXT:news 7 | ======== 8 | 9 | The news extension is one of the most used extensions in the TYPO3 CMS. This 10 | configuration is made under the assumption of a page tree looking similar to this: 11 | 12 | .. figure:: /Images/ext_news_pagetree.png 13 | :alt: Example Pagetree of EXT:news setup 14 | 15 | Example Pagetree of EXT:news setup 16 | 17 | If you want to have a Crawler Configuration that matches this, you can add 18 | the following to the :guilabel:`PageTS` for PageId `56`. 19 | 20 | .. literalinclude:: _page.tsconfig 21 | :caption: packages/my_extension/Configuration/Sets/MySet/page.tsconfig 22 | 23 | Now you can add the News detail-view pages to the crawler queue and have them in 24 | the cache and the `indexed_search` index if you are using that. 25 | 26 | .. _example-configuration-news-category: 27 | 28 | Respecting Categories in News 29 | ============================= 30 | 31 | On some installations news is configured in such a way, that news of category A 32 | have their detail view on one page and news of category B have their detail view on 33 | another page. In this case it would still be possible to view news of category A on 34 | the detail page for category B (example.com/detail-page-for-category-B/news-of-category-A).
35 | That means that each news article would be crawled twice - once on the detail page 36 | for category A and once on the detail page for category B. It is possible to use a 37 | PSR-14 event with news to prevent this. 38 | 39 | On both detail pages include this typoscript setup: 40 | 41 | .. literalinclude:: _setup.typoscript 42 | :caption: packages/my_extension/Configuration/Sets/MySet/setup.typoscript 43 | 44 | and register an event listener in your site package. 45 | 46 | .. literalinclude:: _services.yaml 47 | :caption: packages/my_extension/Configuration/Services.yaml 48 | 49 | .. literalinclude:: _NewsDetailEventListener.php 50 | :caption: packages/my_extension/Classes/EventListeners/NewsDetailEventListener.php 51 | 52 | .. warning:: 53 | 54 | Note that this does more than just prevent articles from being indexed twice. It 55 | actually prevents articles from being displayed on a page that is supposed to show 56 | only articles of a certain category! 57 | -------------------------------------------------------------------------------- /Resources/Public/JavaScript/ProcessStatus.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | const ajaxKey = 'crawler_process_status'; 3 | const ajaxUrl = TYPO3.settings?.ajaxUrls?.[ajaxKey]; 4 | 5 | async function fetchStatus(id) { 6 | if (!ajaxUrl) { 7 | console.error('Missing TYPO3 AJAX URL for crawler_process_status'); 8 | return; 9 | } 10 | try { 11 | const resp = await fetch(ajaxUrl, { 12 | method: 'POST', 13 | credentials: 'same-origin', 14 | headers: { 'Content-Type': 'application/json' }, 15 | body: JSON.stringify({id}) 16 | }); 17 | if (!resp.ok) { 18 | throw new Error(`HTTP error ${resp.status}`); 19 | } 20 | const data = await resp.json(); 21 | updateProgress(id, data); 22 | } catch (err) { 23 | console.error('Error fetching status', err); 24 | } 25 | } 26 | 27 | function updateProgress(id, data) { 28 | const bar = document.getElementById(id); 29 | let 
status = `${data.status}%`; 30 | bar.style.width = status; 31 | bar.innerHTML = status; 32 | updateTableCellByClass(id, 'processedItems', `${data.processedItems}`); 33 | updateTableCellByClass(id, 'runtime', `${data.runtime}`); 34 | 35 | if (Number(data.status) >= 100) { 36 | bar.classList.remove('crawlerprocessprogress-bar'); 37 | // Trigger a refresh of the page to show updated status 38 | document.querySelector('a[title="Refresh"]').click(); 39 | } 40 | } 41 | 42 | function updateTableCellByClass(elementId, cellClass, newValue) { 43 | const el = document.getElementById(elementId); 44 | if (!el) return; 45 | 46 | const row = el.closest('tr'); 47 | if (!row) return; 48 | 49 | const cell = row.querySelector(`td.${cellClass}`); 50 | if (cell) { 51 | cell.textContent = newValue; 52 | } 53 | } 54 | 55 | async function getElementsToUpdate() { 56 | const progressBars = document.getElementsByClassName('crawlerprocessprogress-bar'); 57 | const promises = Array.from(progressBars).map(bar => fetchStatus(bar.id)); 58 | await Promise.all(promises); 59 | } 60 | setInterval(getElementsToUpdate, 3000); 61 | })(); 62 | -------------------------------------------------------------------------------- /Documentation/ExecutingTheQueue/ExecutingQueueWithCron-job/Index.rst: -------------------------------------------------------------------------------- 1 | .. include:: /Includes.rst.txt 2 | 3 | .. _with-crown: 4 | 5 | ============================= 6 | Executing queue with cron-job 7 | ============================= 8 | 9 | A "cron-job" refers to a script that runs on the server with time 10 | intervals. 11 | 12 | For this to become reality you must ideally have a cron-job set up. 13 | This assumes you are running on Unix architecture of some sort. The 14 | crontab is often edited by :bash:`crontab -e` and you should insert a line 15 | like this: 16 | 17 | .. 
code-block:: plaintext 18 | 19 | * * * * * vendor/bin/typo3 crawler:buildQueue > /dev/null 20 | 21 | This will run the script every minute. You should try to run the 22 | script on the command line first to make sure it runs without any 23 | errors. If it doesn't output anything it was successful. 24 | 25 | You will need to have a user called `_cli_` and you must have PHP installed 26 | as a CGI script as well in :path:`/usr/bin/`. 27 | 28 | The user `_cli_` is created by the framework on demand if it does not exist 29 | at the first command line call. 30 | 31 | Make sure that the user `_cli_` has admin-rights. 32 | 33 | In the :guilabel:`CLI status` menu of the :guilabel:`Site Crawler` info module 34 | you can see the status: 35 | 36 | .. figure:: /Images/backend_processlist.png 37 | :alt: Status page in the backend 38 | 39 | Status page in the backend 40 | 41 | This is how it looks just after you ran the script. (You can also see 42 | the full path to the script in the bottom - this is the path to the 43 | script as you should use it on the command line / in the crontab) 44 | 45 | If the cron-script stalls there is a default delay of 1 hour before a 46 | new process will announce the old one dead and run a new one. If a 47 | cron-script takes more than 1 minute and thereby overlaps the next 48 | process, the next process will NOT start if it sees that the "lock- 49 | file" exists (unless that hour has passed). 50 | 51 | The reason why it works like this is to make sure that overlapping 52 | calls to the crawler CLI script will not run parallel processes. So 53 | the second call will just exit if it finds in the status file that the 54 | process is already running. But of course a crashed script will fail 55 | to set the status to "end" and hence this situation can occur. 
use TYPO3\CMS\Core\Messaging\FlashMessage;
use TYPO3\CMS\Core\Messaging\FlashMessageService;
use TYPO3\CMS\Core\Type\ContextualFeedbackSeverity;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Static helpers that push flash messages of a given severity into the
 * default flash message queue.
 *
 * @internal since v9.2.5
 */
class MessageUtility
{
    /**
     * Add notice message to the user interface.
     */
    public static function addNoticeMessage(string $message): void
    {
        self::addMessage($message, ContextualFeedbackSeverity::NOTICE);
    }

    /**
     * Add error message to the user interface.
     */
    public static function addErrorMessage(string $message): void
    {
        self::addMessage($message, ContextualFeedbackSeverity::ERROR);
    }

    /**
     * Add warning message to the user interface.
     */
    public static function addWarningMessage(string $message): void
    {
        self::addMessage($message, ContextualFeedbackSeverity::WARNING);
    }

    /**
     * Wraps the text in a FlashMessage (with an empty title) and appends it to
     * the queue returned by FlashMessageService::getMessageQueueByIdentifier().
     *
     * @param string $message the message itself
     * @param ContextualFeedbackSeverity $severity severity of the message, defaults to OK
     */
    private static function addMessage(
        string $message,
        ContextualFeedbackSeverity $severity = ContextualFeedbackSeverity::OK
    ): void {
        $flashMessage = GeneralUtility::makeInstance(FlashMessage::class, $message, '', $severity);

        GeneralUtility::makeInstance(FlashMessageService::class)
            ->getMessageQueueByIdentifier()
            ->addMessage($flashMessage);
    }
}
use AOE\Crawler\Domain\Repository\ProcessRepository;
use AOE\Crawler\Domain\Repository\QueueRepository;
use AOE\Crawler\Process\ProcessManagerInterface;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Removes process records whose system process is no longer among the
 * dispatcher processes reported by the process manager.
 *
 * @internal since v12.0.10
 */
class OrphanProcessCleaner
{
    public function __construct(
        private readonly ProcessRepository $processRepository,
        private readonly QueueRepository $queueRepository,
        private readonly ProcessManagerInterface $processManager
    ) {
    }

    /**
     * Checks every active orphan process row and removes those that have no
     * matching dispatcher process.
     *
     * When no dispatcher process is running at all, every candidate row is
     * removed. Previously the method returned after removing only the first
     * one, leaving the remaining orphans behind until later invocations.
     */
    public function clean(): void
    {
        $results = $this->processRepository->getActiveOrphanProcesses();

        foreach ($results as $result) {
            $systemProcessId = (int) $result['system_process_id'];
            $processId = $result['process_id'];

            // Rows with a system PID of 0 or 1 are never touched.
            if ($systemProcessId <= 1) {
                continue;
            }

            $dispatcherProcesses = $this->processManager->findDispatcherProcesses();
            if (empty($dispatcherProcesses)
                || !$this->isDispatcherProcess($systemProcessId, $dispatcherProcesses)
            ) {
                $this->remove($processId);
            }
        }
    }

    /**
     * Tells whether one of the dispatcher process lines belongs to the given PID.
     *
     * Each line is split on spaces and the second non-empty column is compared
     * with the PID (assumes a "ps"-like listing with the PID in column two -
     * confirm against ProcessManagerInterface::findDispatcherProcesses()).
     *
     * @param int $systemProcessId PID stored in the process record
     * @param iterable $dispatcherProcesses process listing lines
     */
    private function isDispatcherProcess(int $systemProcessId, iterable $dispatcherProcesses): bool
    {
        foreach ($dispatcherProcesses as $process) {
            $parts = GeneralUtility::trimExplode(' ', $process, true);
            if ($systemProcessId === (int) ($parts[1] ?? 0)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Deletes the process record and releases the queue entries assigned to it.
     */
    private function remove(string $processId): void
    {
        $this->processRepository->removeByProcessId($processId);
        $this->queueRepository->unsetQueueProcessId($processId);
    }
}
11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | use Psr\Http\Message\ServerRequestInterface; 23 | 24 | /** 25 | * @internal since 12.0.10 26 | */ 27 | final class RequestHelper 28 | { 29 | public static function getIntFromRequest(ServerRequestInterface $request, string $key, int $default = 0): int 30 | { 31 | $body = $request->getParsedBody(); 32 | $query = $request->getQueryParams(); 33 | 34 | $value = (is_array($body) ? ($body[$key] ?? null) : null) 35 | ?? ($query[$key] ?? null) 36 | ?? $default; 37 | 38 | return (int) $value; 39 | } 40 | 41 | public static function getBoolFromRequest(ServerRequestInterface $request, string $key): bool 42 | { 43 | $body = $request->getParsedBody(); 44 | $query = $request->getQueryParams(); 45 | 46 | $value = (is_array($body) ? ($body[$key] ?? null) : null) 47 | ?? ($query[$key] ?? null); 48 | 49 | return !empty($value); 50 | } 51 | 52 | public static function getStringFromRequest( 53 | ServerRequestInterface $request, 54 | string $key, 55 | string $default = '' 56 | ): string { 57 | $body = $request->getParsedBody(); 58 | $query = $request->getQueryParams(); 59 | 60 | $value = (is_array($body) ? ($body[$key] ?? null) : null) 61 | ?? ($query[$key] ?? null) 62 | ?? $default; 63 | 64 | return is_scalar($value) ? (string) $value : $default; 65 | } 66 | 67 | public static function getArrayFromRequest(ServerRequestInterface $request, string $key): array 68 | { 69 | $body = $request->getParsedBody(); 70 | $query = $request->getQueryParams(); 71 | 72 | $source = is_array($body) ? $body : $query; 73 | $value = $source[$key] ?? $query[$key] ?? 
null; 74 | 75 | if (is_string($value)) { 76 | $value = json_decode($value, true); 77 | } 78 | 79 | return is_array($value) ? $value : []; 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /Classes/Controller/Backend/Helper/ResultHandler.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 11 | * 12 | * It is free software; you can redistribute it and/or modify it under 13 | * the terms of the GNU General Public License, either version 2 14 | * of the License, or any later version. 15 | * 16 | * For the full copyright and license information, please read the 17 | * LICENSE.txt file that was distributed with this source code. 18 | * 19 | * The TYPO3 project - inspiring people to share! 20 | */ 21 | 22 | /** 23 | * @internal since v9.2.5 24 | */ 25 | class ResultHandler 26 | { 27 | /** 28 | * Extract the log information from the current row and retrieve it as formatted string. 
29 | */ 30 | public static function getResultLog(array $resultRow): string 31 | { 32 | $content = ''; 33 | if (array_key_exists('result_data', $resultRow)) { 34 | $requestContent = json_decode((string) $resultRow['result_data'], true) ?: []; 35 | if (is_bool($requestContent) || !array_key_exists('content', $requestContent)) { 36 | return $content; 37 | } 38 | $requestResult = json_decode((string) $requestContent['content'], true); 39 | 40 | if (is_array($requestResult) && array_key_exists('log', $requestResult)) { 41 | $content = implode(chr(10), $requestResult['log']); 42 | } 43 | } 44 | return $content; 45 | } 46 | 47 | public static function getResStatus(array|bool $requestContent): string 48 | { 49 | if (empty($requestContent)) { 50 | return '-'; 51 | } 52 | if (is_bool($requestContent) || !array_key_exists('content', $requestContent)) { 53 | return 'Content index does not exists in requestContent array'; 54 | } 55 | 56 | $requestResult = json_decode((string) $requestContent['content'], true); 57 | if (is_array($requestResult)) { 58 | if (empty($requestResult['errorlog'])) { 59 | return 'OK'; 60 | } 61 | return implode("\n", $requestResult['errorlog']); 62 | } 63 | 64 | return 'Error - no info, sorry!'; 65 | } 66 | 67 | /** 68 | * Find Fe vars 69 | */ 70 | public static function getResFeVars(array $resultData): array 71 | { 72 | if (empty($resultData)) { 73 | return []; 74 | } 75 | $requestResult = json_decode((string) $resultData['content'], true); 76 | if (is_bool($requestResult)) { 77 | return []; 78 | } 79 | return $requestResult['vars'] ?? 
[]; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TYPO3 Crawler 2 | [![Latest Stable Version](https://poser.pugx.org/tomasnorre/crawler/v/stable)](https://packagist.org/packages/tomasnorre/crawler) 3 | [![Total Downloads](https://poser.pugx.org/tomasnorre/crawler/downloads)](https://packagist.org/packages/tomasnorre/crawler) 4 | [![License](https://poser.pugx.org/tomasnorre/crawler/license)](https://packagist.org/packages/tomasnorre/crawler) 5 | ![Tests](https://github.com/tomasnorre/crawler/workflows/Tests/badge.svg) 6 | [![Coverage Status](https://coveralls.io/repos/github/tomasnorre/crawler/badge.svg)](https://coveralls.io/github/tomasnorre/crawler) 7 | [![Mutation testing badge](https://img.shields.io/endpoint?style=flat&url=https%3A%2F%2Fbadge-api.stryker-mutator.io%2Fgithub.com%2Ftomasnorre%2Fcrawler%2Fmain)](https://dashboard.stryker-mutator.io/reports/github.com/tomasnorre/crawler/main) 8 | ![Psalm coverage](https://shepherd.dev/github/tomasnorre/crawler/coverage.svg) 9 | 10 | TYPO3 Crawler crawls the TYPO3 page tree. Used for cache warmup, indexing, publishing applications etc. 
11 | 12 | 13 | You can include the crawler in your TYPO3 project with composer or from the [TYPO3 Extension Repository](https://extensions.typo3.org/extension/crawler) 14 | 15 | ```shell script 16 | composer require tomasnorre/crawler 17 | ``` 18 | 19 | **Crawler processes** 20 | 21 | ![backend_processlist](https://user-images.githubusercontent.com/1212481/142763110-936be57c-1e9e-4d62-afbe-4134b139fd56.png) 22 | 23 | ## Versions and Support 24 | 25 | | Release | TYPO3 | PHP | Fixes will contain 26 | |---------|-----------|---------|---| 27 | | 12.x.y | 12.4-13.4 | 8.1-8.4 |Features, Bugfixes, Security Updates, Since 12.0.6 TYPO3 13.4, Since 12.0.7 PHP 8.4 28 | | 11.x.y | 10.4-11.5 | 7.4-8.1 |Security Updates, Since 11.0.3 PHP 8.1 29 | | 10.x.y | 9.5-11.0 | 7.2-7.4 |Security Updates 30 | | 9.x.y | 9.5-11.0 | 7.2-7.4 |As this version has same requirements as 10.x.y, there will be no further releases of this version, please update instead. 31 | | 8.x.y | | | Releases do not exist 32 | | 7.x.y | | | Releases do not exist 33 | | 6.x.y | 7.6-8.7 | 5.6-7.3 | Security Updates 34 | 35 | ### Documentation 36 | Please read the [documentation](https://docs.typo3.org/p/tomasnorre/crawler/main/en-us/) 37 | 38 | To render the documentation locally, please use the official TYPO3 Documentation rendering Docker Tool. 39 | 40 | 41 | ### Contributions 42 | 43 | Please see [CONTRIBUTING.md](https://github.com/tomasnorre/crawler/blob/main/CONTRIBUTING.md) 44 | 45 | ### Honorable Previous Maintainers 46 | 47 | * Kasper Skaarhoj 48 | * Daniel Poetzinger 49 | * Fabrizio Branca 50 | * Tolleiv Nietsch 51 | * Timo Schmidt 52 | * Michael Klapper 53 | * Stefan Rotsch 54 | -------------------------------------------------------------------------------- /Classes/Domain/Repository/ConfigurationRepository.php: -------------------------------------------------------------------------------- 1 | 9 | * 10 | * This file is part of the TYPO3 Crawler Extension. 
/**
 * Read access to crawler configuration records.
 *
 * @internal since v9.2.5
 */
class ConfigurationRepository extends Repository
{
    final public const TABLE_NAME = 'tx_crawler_configuration';

    /**
     * Collects every crawler configuration record stored on one of the pages
     * of a rootline.
     *
     * @param int $pageId Page whose rootline is resolved when no parent ids are passed
     * @param array $parentIds Page ids to use instead of resolving the rootline of $pageId
     * @return array Matching rows (deleted and hidden records excluded), ordered by name
     */
    public function getCrawlerConfigurationRecordsFromRootLine(int $pageId, array $parentIds = []): array
    {
        $pids = $parentIds;
        if ($pids === []) {
            // No explicit ids given: use the uid of every page in the backend rootline.
            $pids = array_values(array_map(
                static fn ($page): int => (int) $page['uid'],
                BackendUtility::BEgetRootLine($pageId)
            ));
        }

        $queryBuilder = $this->createQueryBuilder();

        // Replace the default restrictions with "deleted" and "hidden" only.
        $restrictions = $queryBuilder->getRestrictions()->removeAll();
        $restrictions
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(HiddenRestriction::class));

        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pids, ArrayParameterType::INTEGER)
        );

        $result = $queryBuilder
            ->select('*')
            ->from(self::TABLE_NAME)
            ->where($pidConstraint)
            ->orderBy('name')
            ->executeQuery();

        return $result->fetchAllAssociative();
    }

    /**
     * Returns a query builder for the crawler configuration table.
     */
    protected function createQueryBuilder(): QueryBuilder
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable(self::TABLE_NAME);
    }
}
18 | */ 19 | 20 | use AOE\Crawler\Controller\Backend\BackendModuleCrawlerLogController; 21 | use AOE\Crawler\Controller\Backend\BackendModuleCrawlerProcessController; 22 | use AOE\Crawler\Controller\Backend\BackendModuleStartCrawlingController; 23 | 24 | return [ 25 | 'web_site_crawler' => [ 26 | 'parent' => 'web', 27 | 'position' => [ 28 | 'after' => 'web_info', 29 | ], 30 | 'access' => 'user', 31 | 'workspaces' => 'live', 32 | 'path' => '/module/page/crawler', 33 | 'labels' => 'LLL:EXT:crawler/Resources/Private/Language/Backend.xlf', 34 | 'extensionName' => 'Crawler', 35 | 'iconIdentifier' => 'tx-crawler-icon', 36 | 'routes' => [ 37 | '_default' => [ 38 | 'target' => BackendModuleCrawlerProcessController::class . '::handleRequest', 39 | ], 40 | ], 41 | ], 42 | 'web_site_crawler_start' => [ 43 | 'parent' => 'web_site_crawler', 44 | 'access' => 'user', 45 | 'path' => '/module/page/crawler/start', 46 | 'iconIdentifier' => 'crawler-start', 47 | 'labels' => [ 48 | 'title' => 'Start', 49 | ], 50 | 'routes' => [ 51 | '_default' => [ 52 | 'target' => BackendModuleStartCrawlingController::class . '::handleRequest', 53 | ], 54 | ], 55 | ], 56 | 'web_site_crawler_process' => [ 57 | 'parent' => 'web_site_crawler', 58 | 'access' => 'user', 59 | 'path' => '/module/page/crawler/process', 60 | 'iconIdentifier' => 'crawler-process', 61 | 'labels' => [ 62 | 'title' => 'Process', 63 | ], 64 | 'routes' => [ 65 | '_default' => [ 66 | 'target' => BackendModuleCrawlerProcessController::class . '::handleRequest', 67 | ], 68 | ], 69 | ], 70 | 'web_site_crawler_log' => [ 71 | 'parent' => 'web_site_crawler', 72 | 'access' => 'user', 73 | 'path' => '/module/page/crawler/log', 74 | 'iconIdentifier' => 'crawler-log', 75 | 'labels' => [ 76 | 'title' => 'Log', 77 | ], 78 | 'routes' => [ 79 | '_default' => [ 80 | 'target' => BackendModuleCrawlerLogController::class . 
/**
 * Decides whether a page has to be left out of crawling.
 *
 * @internal since v9.2.5
 */
class PageService
{
    private readonly EventDispatcher $eventDispatcher;

    /**
     * @param EventDispatcher|null $eventDispatcher Dispatcher to use; resolved via GeneralUtility when omitted
     */
    public function __construct(?EventDispatcher $eventDispatcher = null)
    {
        $this->eventDispatcher = $eventDispatcher ?? GeneralUtility::makeInstance(EventDispatcher::class);
    }

    /**
     * Check if the given page should be crawled
     *
     * The checks run in this order: the hidden flag (unless the extension
     * setting "crawlHiddenPages" is enabled), the built-in list of disallowed
     * doktypes, the "excludeDoktype" lists registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler'], and finally the
     * result of the dispatched ModifySkipPageEvent.
     *
     * @param array $pageRow Page record; "doktype" and, optionally, "hidden" are read
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow): false|string
    {
        $extensionSettings = GeneralUtility::makeInstance(
            ExtensionConfigurationProvider::class
        )->getExtensionConfiguration();

        // if page is hidden
        if (!($extensionSettings['crawlHiddenPages'] ?? false) && ($pageRow['hidden'] ?? false)) {
            return 'Because page is hidden';
        }

        // A record may carry the doktype as a numeric string. Normalise it once so that the
        // strict comparison against the integer DOKTYPE_* constants cannot be bypassed.
        $doktype = (int) $pageRow['doktype'];

        if (in_array($doktype, $this->getDisallowedDokTypes(), true)) {
            return sprintf('Because doktype "%d" is not allowed', $doktype);
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            // inList() performs a string comparison against the comma separated list.
            if (GeneralUtility::inList($doktypeList, (string) $doktype)) {
                return sprintf(
                    'Doktype "%d" was excluded by excludeDoktype configuration key "%s"',
                    $doktype,
                    $key
                );
            }
        }

        // Listeners receive the unmodified page row and have the final say.
        $event = $this->eventDispatcher->dispatch(new ModifySkipPageEvent($pageRow));
        return $event->isSkipped();
    }

    /**
     * Doktypes that are never crawled.
     */
    private function getDisallowedDokTypes(): array
    {
        return [
            PageRepository::DOKTYPE_LINK,
            PageRepository::DOKTYPE_SHORTCUT,
            PageRepository::DOKTYPE_SPACER,
            PageRepository::DOKTYPE_SYSFOLDER,
            PageRepository::DOKTYPE_BE_USER_SECTION,
        ];
    }
}
/**
 * Context menu provider contributing the "crawler" entry for crawler
 * configuration records.
 *
 * @internal since v9.2.5
 */
class ItemProvider extends AbstractProvider
{
    /**
     * @var array
     */
    protected $itemsConfiguration = [
        'crawler' => [
            'type' => 'item',
            'label' => 'LLL:EXT:crawler/Resources/Private/Language/Backend.xlf:contextMenu.label',
            'iconIdentifier' => 'tx-crawler',
            'callbackAction' => 'crawler',
        ],
    ];

    /**
     * The provider is responsible for crawler configuration records only.
     */
    #[\Override]
    public function canHandle(): bool
    {
        return ConfigurationRepository::TABLE_NAME === $this->table;
    }

    /**
     * Has to stay below the priority of the RecordProvider.
     */
    #[\Override]
    public function getPriority(): int
    {
        return 50;
    }

    /**
     * Adds the crawler entry to the given items; keys already present in
     * $items are kept.
     */
    #[\Override]
    public function addItems(array $items): array
    {
        return $items + $this->prepareItems($this->itemsConfiguration);
    }

    /**
     * Builds the data attributes that open the crawler start module for the
     * selected configuration record.
     *
     * @return array Empty when the record cannot be loaded
     */
    #[\Override]
    protected function getAdditionalAttributes(string $itemName): array
    {
        $record = BackendUtility::getRecordWSOL($this->table, (int) $this->identifier);
        if ($record === null) {
            return [];
        }

        // Fall back to a placeholder only when the record has no "name" key at all.
        $configurationName = array_key_exists('name', $record)
            ? $record['name']
            : 'No Name found in configuration';

        $queryString = implode('&', [
            'SET[function]=AOE\Crawler\Backend\BackendModule',
            'SET[crawlaction]=start',
            'configurationSelection[]=' . $configurationName,
        ]);

        return [
            'data-dispatch-action' => 'TYPO3.ModuleMenu.showModule',
            'data-dispatch-args-list' => 'web_site_crawler_start,&' . GeneralUtility::quoteJSvalue('&' . $queryString),
        ];
    }
}
See also: 42 | :ref:`paramSets.[key].procInstrFilter ` 43 | 44 | Base URL 45 | Set baseUrl (most likely the same as the entry point configured in your 46 | site configuration) 47 | 48 | Pids only 49 | List of Page Ids to limit this configuration to. See also: 50 | :ref:`paramSets.[key].pidsOnly ` 51 | 52 | Exclude pages 53 | Comma separated list of page ids which should not be crawled. 54 | You can do recursive exclusion by adding `uid`+`depth` e.g. 6+3, 55 | this will ensure that all pages including pageUid 6 and 3 levels down 56 | will not be crawled. 57 | 58 | Configuration 59 | Parameter configuration. The values of GET variables are according to a 60 | special syntax. See also: :ref:`paramSets.[key] 61 | ` 62 | 63 | Processing instruction parameters 64 | Options for processing instructions. Will be defined in the respective third 65 | party modules. See also: :ref:`paramSets.[key].procInstrParams 66 | ` 67 | 68 | Crawl with FE user groups 69 | User groups to set for the request. See also: 70 | :ref:`paramSets.[key].userGroups ` and the hint in :ref:`create-crawler-configuration` 71 | 72 | .. _backend-configuration-record-access: 73 | 74 | Access 75 | ------ 76 | 77 | .. figure:: /Images/backend_configurationrecord_access.png 78 | :alt: Backend configuration record: Access 79 | 80 | Backend configuration record: Access 81 | 82 | Hide 83 | If activated the configuration record is not taken into account. 84 | 85 | Restrict access to 86 | Restricts access to this configuration record to selected backend user 87 | groups. Empty means no restriction is set. 
#
# Table structure for table 'tx_crawler_queue'
#
# Primary key: qid (auto_increment).
# Single-column indexes exist on page_id, set_id, exec_time, scheduled,
# process_id, parameters_hash and configuration_hash; the composite index
# "cleanup" spans (exec_time, scheduled).
#
CREATE TABLE tx_crawler_queue
(
    qid int(11) DEFAULT '0' NOT NULL auto_increment,
    page_id int(11) DEFAULT '0' NOT NULL,
    parameters text NOT NULL,
    parameters_hash varchar(50) DEFAULT '' NOT NULL,
    configuration_hash varchar(50) DEFAULT '' NOT NULL,
    scheduled int(11) DEFAULT '0' NOT NULL,
    exec_time int(11) DEFAULT '0' NOT NULL,
    set_id int(11) DEFAULT '0' NOT NULL,
    result_data longtext NOT NULL,
    process_scheduled int(11) DEFAULT '0' NOT NULL,
    process_id varchar(50) DEFAULT '' NOT NULL,
    process_id_completed varchar(50) DEFAULT '' NOT NULL,
    configuration varchar(250) DEFAULT '' NOT NULL,

    PRIMARY KEY (qid),
    KEY page_id (page_id),
    KEY set_id (set_id),
    KEY exec_time (exec_time),
    KEY scheduled (scheduled),
    KEY process_id (process_id),
    KEY parameters_hash (parameters_hash),
    KEY configuration_hash (configuration_hash),
    KEY cleanup (exec_time,scheduled)
) ENGINE=InnoDB;

#
# Table structure for table 'tx_crawler_process'
#
# No PRIMARY KEY is declared here. The table is indexed on process_id and,
# through "update_key", on (active, deleted).
# Note: "active" is the only column in this table declared without NOT NULL.
#
CREATE TABLE tx_crawler_process
(
    process_id varchar(50) DEFAULT '' NOT NULL,
    active smallint(6) DEFAULT '0',
    ttl int(11) DEFAULT '0' NOT NULL,
    assigned_items_count int(11) DEFAULT '0' NOT NULL,
    deleted tinyint(4) unsigned DEFAULT '0' NOT NULL,
    system_process_id int(11) DEFAULT '0' NOT NULL,

    KEY update_key (active,deleted),
    KEY process_id (process_id)
) ENGINE=InnoDB;

#
# Table structure for table 'tx_crawler_configuration'
#
# Only the extension-specific columns are listed and no keys are declared.
# NOTE(review): presumably the standard record fields (uid, pid, ...) are
# supplied elsewhere by TYPO3 - confirm.
#
CREATE TABLE tx_crawler_configuration
(
    name tinytext NOT NULL,
    force_ssl tinyint(4) DEFAULT '0' NOT NULL,
    processing_instruction_filter varchar(200) DEFAULT '' NOT NULL,
    processing_instruction_parameters_ts varchar(200) DEFAULT '' NOT NULL,
    configuration text NOT NULL,
    base_url tinytext NOT NULL,
    pidsonly blob,
    begroups varchar(100) DEFAULT '0' NOT NULL,
    fegroups varchar(100) DEFAULT '0' NOT NULL,
    exclude text NOT NULL

) ENGINE=InnoDB;

#
# Table structure for table 'pages'
# This column is declared here to reuse the information from typo3/cms-seo.
# As there is no dependency on typo3/cms-seo, it is added here to ensure that
# database queries reading sitemap_priority do not break.
#
CREATE TABLE pages
(
    sitemap_priority decimal(2, 1) DEFAULT '0.5' NOT NULL
);
/**
 * Adds single pages to the crawler queue.
 *
 * @internal since v9.2.5
 */
class QueueService
{
    /**
     * Makes sure the controller carries a positive set id; one is derived from
     * the current microtime when it does not.
     */
    public function __construct(
        private readonly CrawlerController $crawlerController
    ) {
        if ($this->crawlerController->setID > 0) {
            return;
        }

        $this->crawlerController->setID = GeneralUtility::md5int(microtime());
    }

    /**
     * Queues all URLs of every crawler configuration that applies to a page.
     *
     * @param int $pageUid Uid of the page to add
     * @param int $time Scheduled time handed on to the controller
     */
    public function addPageToQueue(int $pageUid, int $time = 0): void
    {
        $pageRow = GeneralUtility::makeInstance(PageRepository::class)->getPage($pageUid, true);
        $pageConfigurations = $this->crawlerController->getUrlsForPageRow($pageRow);

        // Currently this is only used from the DataHandlerHook, where no allowed/disallowed
        // configurations are known when clearing the cache, so every configuration is allowed.
        // The filter call returns the incoming configurations unchanged in that case; it is
        // kept for visibility and for a later implementation.
        $allowedConfigurations = [];
        $pageConfigurations = ConfigurationService::removeDisallowedConfigurations(
            $allowedConfigurations,
            $pageConfigurations
        );

        $duplicateTrack = [];
        $downloadUrls = [];

        foreach ($pageConfigurations as $pageConfiguration) {
            // Entries must be written to the database, not only registered internally.
            $this->crawlerController->registerQueueEntriesInternallyOnly = false;

            $processingInstructionKeys = array_keys($this->getCrawlerProcInstructions());
            $this->crawlerController->urlListFromUrlArray(
                $pageConfiguration,
                $pageRow,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $processingInstructionKeys
            );

            // The entries have been written to the database, so drop the in-memory queue.
            unset($this->crawlerController->queueEntries);
        }
    }

    /**
     * Reads the processing instructions registered for the crawler.
     *
     * @return array Map of instruction "key" to instruction "value"
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $instructions = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $registration) {
            $instructions[$registration['key']] = $registration['value'];
        }

        return $instructions;
    }
}
/**
 * ArrayObject that only accepts Process instances as elements.
 *
 * @internal since v9.2.5
 */
class ProcessCollection extends \ArrayObject
{
    /**
     * Returns the element stored under the given key.
     *
     * @throws NoIndexFoundException when the key is not present in the collection
     */
    #[\Override]
    public function offsetGet(mixed $key): Process
    {
        if (parent::offsetExists($key)) {
            return parent::offsetGet($key);
        }

        throw new NoIndexFoundException(
            sprintf(
                'Index "%s" for \AOE\Crawler\Domain\Model\Process are not available',
                var_export($key, true)
            ),
            1_593_714_823
        );
    }

    /**
     * Stores an element under the given key.
     *
     * @param Process $value
     * @throws \InvalidArgumentException when $value is not a Process
     */
    #[\Override]
    public function offsetSet(mixed $key, $value): void
    {
        if ($value instanceof Process) {
            parent::offsetSet($key, $value);
            return;
        }

        throw new \InvalidArgumentException(
            'Wrong parameter type given, "\AOE\Crawler\Domain\Model\Process" expected!',
            1_593_714_822
        );
    }

    /**
     * Appends an element to the end of the collection.
     *
     * @param Process $value
     * @throws \InvalidArgumentException when $value is not a Process
     */
    #[\Override]
    public function append($value): void
    {
        if ($value instanceof Process) {
            parent::append($value);
            return;
        }

        throw new \InvalidArgumentException(
            'Wrong parameter type given, "\AOE\Crawler\Domain\Model\Process" expected!',
            1_593_714_821
        );
    }

    /**
     * Returns the process id of every element, in iteration order.
     */
    public function getProcessIds(): array
    {
        $processIds = [];
        foreach ($this as $process) {
            $processIds[] = $process->getProcessId();
        }

        return $processIds;
    }
}
22 | enableTimeslot=1 23 | 24 | ######### 25 | ## Processing 26 | ######### 27 | 28 | # cat=Processing; type=int [0-10000]; label= Sleep time between requests: Time in microseconds the crawler should sleep between requesting urls: low = faster / high = less stress for the server 29 | sleepTime = 1000 30 | 31 | # cat=Processing; type=int [0-100]; label= Sleep time after finishing: Time in seconds the crawler should sleep before finishing 32 | sleepAfterFinish=10 33 | 34 | # cat=Processing; type=int [1-10000]; label= Entries per run: How many queue entries should be processed in a run 35 | countInARun=100 36 | 37 | # cat=Processing; type=int [1-99]; label= Maximum processes 38 | processLimit=1 39 | 40 | # cat=Processing; type=int [1- 86400]; label= Maximal process runtime: in seconds - only necessary if processLimit > 1 41 | processMaxRunTime=300 42 | 43 | ######### 44 | ## Cleanup 45 | ######### 46 | 47 | # cat=Cleanup; type=boolean; label=Clean up old queue entries: If checked the older queue entries will be deleted when adding new crawler configurations from CLI. 48 | cleanUpOldQueueEntries=1 49 | 50 | # cat=Cleanup; type=int [1- 99]; label=Processed Age: If Clean up old queue entries is checked, then processed entries older than X days are deleted. 51 | cleanUpProcessedAge=2 52 | 53 | # cat=Cleanup; type=int [1- 99]; label=Scheduled Age: If Clean up old queue entries is checked, then scheduled entries older than X days are deleted. 54 | cleanUpScheduledAge=7 55 | 56 | # cat=Cleanup; type=int [1-365]; label= Delete processed items: Delete processed items from the queue after n days (0 will keep the entries forever - the database may grow very large over time!) 57 | purgeQueueDays=14 58 | 59 | ######### 60 | ## System 61 | ######### 62 | 63 | # cat=System; type=string; label= Name of the php binary (e.g. PHP72-LATEST-CLI ), default is php 64 | phpBinary=php 65 | 66 | # cat=System; type=string; label= PHP Path: Local path to php binary file (e.g. 
"/usr/bin/php"), you should ONLY use this when the resolved php-binary isn't the correct one. You can check that in the Info -> Site Crawling -> Crawling Process -> CLI-Path 67 | phpPath= 68 | 69 | ######### 70 | ## Debug 71 | ######### 72 | 73 | # cat=Debug; type=boolean; label= Debug: Print Multiprocess- processing information - prints some information whether a process was really executed and which status it has 74 | processDebug=0 75 | 76 | # cat=Debug; type=boolean; label= Make Multiprocess- processing be verbose while running 77 | processVerbose=0 78 | --------------------------------------------------------------------------------