├── .coveralls.yml
├── .phive
└── phars.xml
├── .run
└── Xdebug.run.xml
├── CHANGELOG.md
├── CONTRIBUTERS.md
├── CONTRIBUTING.md
├── Classes
├── Command
│ ├── BuildQueueCommand.php
│ ├── FlushQueueCommand.php
│ └── ProcessQueueCommand.php
├── Configuration
│ └── ExtensionConfigurationProvider.php
├── ContextMenu
│ └── ItemProvider.php
├── Controller
│ ├── Backend
│ │ ├── AbstractBackendModuleController.php
│ │ ├── BackendModuleControllerInterface.php
│ │ ├── BackendModuleCrawlerLogController.php
│ │ ├── BackendModuleCrawlerProcessController.php
│ │ ├── BackendModuleStartCrawlingController.php
│ │ └── Helper
│ │ │ ├── ResultHandler.php
│ │ │ └── UrlBuilder.php
│ └── CrawlerController.php
├── Converter
│ └── JsonCompatibilityConverter.php
├── CrawlStrategy
│ ├── CallbackExecutionStrategy.php
│ ├── CrawlStrategyFactory.php
│ ├── CrawlStrategyInterface.php
│ ├── GuzzleExecutionStrategy.php
│ └── SubProcessExecutionStrategy.php
├── Crawler.php
├── Domain
│ ├── Model
│ │ ├── Configuration.php
│ │ ├── Process.php
│ │ ├── ProcessCollection.php
│ │ ├── Queue.php
│ │ └── Reason.php
│ └── Repository
│ │ ├── ConfigurationRepository.php
│ │ ├── ProcessRepository.php
│ │ └── QueueRepository.php
├── Event
│ ├── AfterQueueItemAddedEvent.php
│ ├── AfterUrlAddedToQueueEvent.php
│ ├── AfterUrlCrawledEvent.php
│ ├── BeforeQueueItemAddedEvent.php
│ ├── InvokeQueueChangeEvent.php
│ └── ModifySkipPageEvent.php
├── EventListener
│ ├── AfterQueueItemAddedEventListener.php
│ └── ShouldUseCachedPageDataIfAvailableEventListener.php
├── Exception
│ ├── CommandNotFoundException.php
│ ├── CrawlerObjectException.php
│ ├── ExtensionSettingsException.php
│ ├── NoIndexFoundException.php
│ ├── ProcessException.php
│ └── TimeStampException.php
├── Helper
│ └── Sleeper
│ │ ├── NullSleeper.php
│ │ ├── SleeperInterface.php
│ │ └── SystemSleeper.php
├── Hooks
│ ├── CrawlerHookInterface.php
│ ├── DataHandlerHook.php
│ └── ProcessCleanUpHook.php
├── Middleware
│ ├── CrawlerInitialization.php
│ └── FrontendUserAuthenticator.php
├── QueueExecutor.php
├── Service
│ ├── BackendModuleLinkService.php
│ ├── BackendModuleLogService.php
│ ├── BackendModuleScriptUrlService.php
│ ├── ConfigurationService.php
│ ├── PageService.php
│ ├── ProcessInstructionService.php
│ ├── ProcessService.php
│ ├── QueueService.php
│ ├── UrlService.php
│ └── UserService.php
├── Utility
│ ├── HookUtility.php
│ ├── MessageUtility.php
│ ├── PhpBinaryUtility.php
│ └── TcaUtility.php
├── Value
│ ├── CrawlAction.php
│ ├── QueueFilter.php
│ └── QueueRow.php
└── Writer
│ └── FileWriter
│ └── CsvWriter
│ ├── CrawlerCsvWriter.php
│ └── CsvWriterInterface.php
├── Configuration
├── Backend
│ └── Modules.php
├── Extbase
│ └── Persistence
│ │ └── Classes.php
├── Icons.php
├── RequestMiddlewares.php
├── Services.yaml
└── TCA
│ └── tx_crawler_configuration.php
├── Documentation
├── Configuration
│ ├── ConfigurationRecords
│ │ └── Index.rst
│ ├── Examples
│ │ ├── Index.rst
│ │ └── News
│ │ │ ├── Index.rst
│ │ │ ├── _NewsDetailEventListener.php
│ │ │ ├── _page.tsconfig
│ │ │ ├── _services.yaml
│ │ │ └── _setup.typoscript
│ ├── ExtensionManagerConfiguration
│ │ └── Index.rst
│ ├── HttpAuthentication
│ │ └── Index.rst
│ ├── Index.rst
│ └── PageTsconfigReference(txCrawlercrawlercfg)
│ │ ├── Index.rst
│ │ ├── _page.tsconfig
│ │ └── _paramSets_page.tsconfig
├── ExecutingTheQueue
│ ├── BuildingAndExecutingQueueRightAway(fromCli)
│ │ ├── Index.rst
│ │ ├── _output_buildQueue_6_default.txt
│ │ ├── _output_buildQueue_6_default_mode_exec.txt
│ │ └── _output_buildQueue_6_default_mode_url.txt
│ ├── ExecutingQueueWithCron-job
│ │ └── Index.rst
│ ├── Index.rst
│ ├── RunViaBackend
│ │ └── Index.rst
│ └── RunningViaCommandController
│ │ └── Index.rst
├── Features
│ ├── AutomaticAddPagesToQueue
│ │ └── Index.rst
│ ├── Events
│ │ ├── Index.rst
│ │ ├── _AfterQueueItemAddedEventListener.php
│ │ ├── _AfterQueueItemAddedEventListener_services.yaml
│ │ ├── _AfterUrlAddedToQueueEventListener.php
│ │ ├── _AfterUrlAddedToQueueEventListener_services.yaml
│ │ ├── _AfterUrlCrawledEventListener.php
│ │ ├── _AfterUrlCrawledEventListener_services.yaml
│ │ ├── _BeforeQueueItemAddedEventListener.php
│ │ ├── _BeforeQueueItemAddedEventListener_services.yaml
│ │ ├── _InvokeQueueChangeEventListener.php
│ │ ├── _InvokeQueueChangeEvent_services.yaml
│ │ ├── _ModifySkipPageEventListener.php
│ │ └── _ModifySkipPageEventListener_services.yaml
│ ├── Hooks
│ │ ├── Index.rst
│ │ └── _PageVeto.php
│ ├── Index.rst
│ ├── MultiprocessSupport
│ │ └── Index.rst
│ ├── PollableProcessingInstructions
│ │ └── Index.rst
│ └── PriorityCrawling
│ │ └── Index.rst
├── Images
│ ├── backend_addfromcontextmenu.png
│ ├── backend_clear_cache.png
│ ├── backend_clear_cache_queue.png
│ ├── backend_configuration_deployment.png
│ ├── backend_configuration_queue.png
│ ├── backend_configuration_settings.png
│ ├── backend_configurationrecord_access.png
│ ├── backend_configurationrecord_general.png
│ ├── backend_crawler_seo_priority_v10.png
│ ├── backend_crawler_seo_v10.png
│ ├── backend_crawlerlog.png
│ ├── backend_crawlerlog_recrawl.png
│ ├── backend_info_php_error.png
│ ├── backend_pendingurls.png
│ ├── backend_php_path_configuration.png
│ ├── backend_processlist.png
│ ├── backend_processlist_add_process.png
│ ├── backend_recrawl.png
│ ├── backend_scheduler_overview.png
│ ├── backend_scheduler_processqueue.png
│ ├── backend_scheduler_record.png
│ ├── backend_startcrawling.png
│ ├── backend_startnewprocess.png
│ ├── cli_addtoque.png
│ ├── cli_processque.png
│ ├── crawler_settings_processLimit.png
│ └── ext_news_pagetree.png
├── Includes.rst.txt
├── Index.rst
├── Introduction
│ └── Index.rst
├── Links
│ └── Links.rst
├── Scheduler
│ └── Index.rst
├── Sitemap.rst
├── Troubleshooting
│ ├── Index.rst
│ └── _htaccess.txt
├── UseCases
│ ├── CacheWarmup
│ │ ├── Index.rst
│ │ └── _commands.bash
│ ├── Index.rst
│ └── IndexedSearch
│ │ └── Index.rst
└── guides.xml
├── LICENSE
├── Makefile
├── README.md
├── Resources
├── Private
│ ├── Language
│ │ ├── Backend.xlf
│ │ ├── af.Backend.xlf
│ │ ├── af.locallang.xlf
│ │ ├── af.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── ar.Backend.xlf
│ │ ├── ar.locallang.xlf
│ │ ├── ar.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── ca.Backend.xlf
│ │ ├── ca.locallang.xlf
│ │ ├── ca.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── cs.Backend.xlf
│ │ ├── cs.locallang.xlf
│ │ ├── cs.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── da.Backend.xlf
│ │ ├── da.locallang.xlf
│ │ ├── da.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── de.Backend.xlf
│ │ ├── de.locallang.xlf
│ │ ├── de.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── el.Backend.xlf
│ │ ├── el.locallang.xlf
│ │ ├── el.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── es.Backend.xlf
│ │ ├── es.locallang.xlf
│ │ ├── es.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── fi.Backend.xlf
│ │ ├── fi.locallang.xlf
│ │ ├── fi.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── fr.Backend.xlf
│ │ ├── fr.locallang.xlf
│ │ ├── fr.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── he.Backend.xlf
│ │ ├── he.locallang.xlf
│ │ ├── he.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── hu.Backend.xlf
│ │ ├── hu.locallang.xlf
│ │ ├── hu.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── it.Backend.xlf
│ │ ├── it.locallang.xlf
│ │ ├── it.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── ja.Backend.xlf
│ │ ├── ja.locallang.xlf
│ │ ├── ja.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── ko.Backend.xlf
│ │ ├── ko.locallang.xlf
│ │ ├── ko.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── locallang.xlf
│ │ ├── locallang_csh_tx_crawler_configuration.xlf
│ │ ├── nl.Backend.xlf
│ │ ├── nl.locallang.xlf
│ │ ├── nl.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── no.Backend.xlf
│ │ ├── no.locallang.xlf
│ │ ├── no.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── pl.Backend.xlf
│ │ ├── pl.locallang.xlf
│ │ ├── pl.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── pt.Backend.xlf
│ │ ├── pt.locallang.xlf
│ │ ├── pt.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── ro.Backend.xlf
│ │ ├── ro.locallang.xlf
│ │ ├── ro.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── ru.Backend.xlf
│ │ ├── ru.locallang.xlf
│ │ ├── ru.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── sr.Backend.xlf
│ │ ├── sr.locallang.xlf
│ │ ├── sr.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── sv.Backend.xlf
│ │ ├── sv.locallang.xlf
│ │ ├── sv.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── tr.Backend.xlf
│ │ ├── tr.locallang.xlf
│ │ ├── tr.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── uk.Backend.xlf
│ │ ├── uk.locallang.xlf
│ │ ├── uk.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── vi.Backend.xlf
│ │ ├── vi.locallang.xlf
│ │ ├── vi.locallang_csh_tx_crawler_configuration.xlf
│ │ ├── zh.Backend.xlf
│ │ ├── zh.locallang.xlf
│ │ └── zh.locallang_csh_tx_crawler_configuration.xlf
│ ├── Layouts
│ │ └── BackendModule.html
│ ├── Php
│ │ └── Libraries
│ │ │ └── composer.json
│ └── Templates
│ │ └── Backend
│ │ ├── ProcessOverview.html
│ │ ├── ShowCrawlerInformation.html
│ │ └── ShowLog.html
└── Public
│ ├── Css
│ └── backend_crawler.css
│ └── Icons
│ ├── Extension.svg
│ ├── bullet_green.svg
│ ├── bullet_orange.svg
│ ├── bullet_red.svg
│ ├── crawler_configuration.svg
│ ├── crawler_start.svg
│ └── crawler_stop.svg
├── SECURITY.md
├── cli
├── bootstrap.php
└── conf.php
├── composer.json
├── ext_conf_template.txt
├── ext_emconf.php
├── ext_localconf.php
└── ext_tables.sql
/.coveralls.yml:
--------------------------------------------------------------------------------
1 | coverage_clover: "*-coverage.clover"
2 |
--------------------------------------------------------------------------------
/.phive/phars.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/.run/Xdebug.run.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/CONTRIBUTERS.md:
--------------------------------------------------------------------------------
 1 | # Contributors
 2 |
 3 | List of contributors to the Crawler TYPO3 v9 compatibility.
 4 |
 5 | Adding your name to the list is optional, and so is adding your email address.
6 | Thanks for helping out.
7 |
8 | PS: Please add in alphabetical order.
9 |
10 | * Benni Mack
11 | * Sebastian Mazza
12 | * Chris Müller
13 | * Tizian Schmidlin
14 | * Tobias Stahn
15 | * Tomas Norre Mikkelsen
16 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ### Contributing
2 |
 3 | Before submitting a PR, please run the following checks first:
 4 |
 5 | * `composer test:all`
 6 |   * Requires a MySQL database; you can boot one with `docker-compose` from the `.Docker` directory
 7 | * `composer cs-fix`
 8 |   * Ensures that coding standards are respected
 9 | * `composer analyse`
10 |   * Runs PHPStan for a static code analysis; this is not fully adjusted in the build yet, but please try to avoid adding new violations. ;)
11 |
12 | ### Writing documentation
13 |
14 | You can render the documentation in this extension with the command
15 |
16 | ```
17 | make docs
18 | ```
19 |
20 | #### Devbox
21 |
22 | If you don't already have a setup where you can do development, bugfixing etc. for the crawler, don't worry.
23 |
24 | We have included a [ddev](https://www.ddev.com) devbox to help with development.
25 |
26 | ##### Prerequisites
27 |
28 | * [DDEV](https://www.ddev.com)
29 | * Docker
30 |
31 | ##### How to use the devbox?
32 |
33 | ```shell script
34 | $ git clone git@github.com:tomasnorre/crawler.git
35 | $ cd crawler/.devbox
36 | $ ddev start
37 | ```
38 |
39 | Username/password: `admin`/`password`
40 |
41 | And start working.
42 |
43 | **INFO**
44 | Xdebug is disabled by default to speed up the devbox when Xdebug isn't needed.
45 |
46 | It can be activated with `ddev xdebug on`.
47 |
48 | #### Running tests without a local development environment
49 | If you don't have `php` and/or `composer` installed on your host machine,
50 | you can run the tests from within the `ddev` Docker container.
51 |
52 | To do that, go into the `.devbox` folder and run `ddev ssh`.
53 | From there, change into the `/public/typo3conf/ext/crawler` folder
54 | and run the `composer` commands from there (see above).
55 |
--------------------------------------------------------------------------------
/Classes/Command/FlushQueueCommand.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Domain\Repository\QueueRepository;
23 | use AOE\Crawler\Value\QueueFilter;
24 | use Symfony\Component\Console\Command\Command;
25 | use Symfony\Component\Console\Input\InputArgument;
26 | use Symfony\Component\Console\Input\InputInterface;
27 | use Symfony\Component\Console\Output\OutputInterface;
28 | use TYPO3\CMS\Core\Utility\GeneralUtility;
29 |
30 | /**
31 | * @internal since v12.0.0
32 | */
33 | class FlushQueueCommand extends Command
34 | {
35 | protected function configure(): void
36 | {
37 | $this->setDescription('Remove queue entries and perform a cleanup');
38 |
39 | $this->setHelp(
40 | 'Try "typo3 help crawler:flushQueue" to see your options' . chr(10) . chr(10) .
41 | 'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
42 | It will remove queue entries and perform a cleanup.' . chr(10) . chr(10) .
43 | '
44 | Examples:
45 | --- Remove all finished queue-entries
46 | $ typo3 crawler:flushQueue finished
47 |
48 | --- Remove all pending queue-entries
49 | $ typo3 crawler:flushQueue pending
50 | '
51 | );
52 | $this->addArgument('mode', InputArgument::REQUIRED, 'What to clear: all, finished, pending');
53 | }
54 |
55 | /**
56 | * Crawler Command - Cleaning up the queue.
57 | *
58 | * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
59 | * It will remove queue entries and perform a cleanup.
60 | *
61 | * Examples:
62 | *
63 | * --- Remove all finished queue-entries
64 | * $ typo3 crawler:flushQueue finished
65 | *
66 | * --- Remove all pending queue-entries for all pages
67 | * $ typo3 crawler:flushQueue pending
68 | */
69 | protected function execute(InputInterface $input, OutputInterface $output): int
70 | {
71 | $queueFilter = new QueueFilter($input->getArgument('mode'));
72 |
73 | /** @var QueueRepository $queueRepository */
74 | $queueRepository = GeneralUtility::makeInstance(QueueRepository::class);
75 |
76 | switch ($queueFilter) {
77 | case 'all':
78 | $queueRepository->flushQueue($queueFilter);
79 | $output->writeln('All entries in Crawler queue have been flushed');
80 | break;
81 | case 'finished':
82 | case 'pending':
83 | $queueRepository->flushQueue($queueFilter);
84 | $output->writeln(
85 | 'All entries in Crawler queue with status "' . $queueFilter . '" have been flushed'
86 | );
87 | break;
88 | default:
89 | $output->writeln(
90 | 'No matching parameters found.' . PHP_EOL . 'Try "typo3 help crawler:flushQueue" to see your options'
91 | );
92 | break;
93 | }
94 |
95 | return Command::SUCCESS;
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/Classes/Configuration/ExtensionConfigurationProvider.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Psr\Log\LoggerAwareInterface;
23 | use Psr\Log\LoggerAwareTrait;
24 | use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationExtensionNotConfiguredException;
25 | use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationPathDoesNotExistException;
26 | use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
27 | use TYPO3\CMS\Core\Utility\GeneralUtility;
28 |
29 | /**
30 | * @internal since v9.2.5
31 | */
32 | class ExtensionConfigurationProvider implements LoggerAwareInterface
33 | {
34 | use LoggerAwareTrait;
35 |
36 | /**
37 | * Return full extension configuration array.
38 | */
39 | public function getExtensionConfiguration(): array
40 | {
41 | try {
42 | return GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('crawler');
43 | } catch (ExtensionConfigurationExtensionNotConfiguredException|ExtensionConfigurationPathDoesNotExistException $e) {
44 | $this->logger?->error($e->getMessage());
45 | }
46 | return [];
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
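
As a usage sketch (not part of the extension itself), the provider above can be consumed like this; the `makeDirectRequests` key is taken from `CrawlStrategyFactory` further down, and the fallback handling is illustrative:

```php
<?php

declare(strict_types=1);

use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
use TYPO3\CMS\Core\Utility\GeneralUtility;

// Illustrative sketch: read the crawler extension configuration and fall back
// to a default when the array is empty (e.g. the extension is not configured).
$configuration = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class)
    ->getExtensionConfiguration();

$makeDirectRequests = (bool) ($configuration['makeDirectRequests'] ?? false);
```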
/Classes/ContextMenu/ItemProvider.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Domain\Repository\ConfigurationRepository;
23 | use TYPO3\CMS\Backend\ContextMenu\ItemProviders\AbstractProvider;
24 | use TYPO3\CMS\Backend\Utility\BackendUtility;
25 | use TYPO3\CMS\Core\Utility\GeneralUtility;
26 |
27 | /**
28 | * Provides a ContextMenu item
29 | * @internal since v9.2.5
30 | */
31 | class ItemProvider extends AbstractProvider
32 | {
33 | /**
34 | * @var array
35 | */
36 | protected $itemsConfiguration = [
37 | 'crawler' => [
38 | 'type' => 'item',
39 | 'label' => 'LLL:EXT:crawler/Resources/Private/Language/Backend.xlf:contextMenu.label',
40 | 'iconIdentifier' => 'tx-crawler',
41 | 'callbackAction' => 'crawler',
42 | ],
43 | ];
44 |
45 | /**
46 | * Item is added only for crawler configurations
47 | */
48 | public function canHandle(): bool
49 | {
50 | return $this->table === ConfigurationRepository::TABLE_NAME;
51 | }
52 |
53 | /**
54 | * This needs to be lower than priority of the RecordProvider
55 | */
56 | public function getPriority(): int
57 | {
58 | return 50;
59 | }
60 |
61 | /**
62 | * Adds the crawler info
63 | */
64 | public function addItems(array $items): array
65 | {
66 | $localItems = $this->prepareItems($this->itemsConfiguration);
67 | return $items + $localItems;
68 | }
69 |
70 | protected function getAdditionalAttributes(string $itemName): array
71 | {
72 | $crawlerConfiguration = BackendUtility::getRecordWSOL($this->table, (int) $this->identifier);
73 | if ($crawlerConfiguration === null) {
74 | return [];
75 | }
76 |
77 | if (!array_key_exists('name', $crawlerConfiguration)) {
78 | $crawlerConfiguration['name'] = 'No Name found in configuration';
79 | }
80 |
81 | $additionalParameters = [];
82 | $additionalParameters[] = 'SET[function]=AOE\Crawler\Backend\BackendModule';
83 | $additionalParameters[] = 'SET[crawlaction]=start';
84 | $additionalParameters[] = 'configurationSelection[]=' . $crawlerConfiguration['name'];
85 | return [
86 | 'data-dispatch-action' => 'TYPO3.ModuleMenu.showModule',
87 | 'data-dispatch-args-list' => 'web_site_crawler_start,&' . GeneralUtility::quoteJSvalue(
88 | '&' . implode('&', $additionalParameters)
89 | ),
90 | ];
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/Classes/Controller/Backend/Helper/ResultHandler.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Converter\JsonCompatibilityConverter;
23 | use TYPO3\CMS\Core\Utility\GeneralUtility;
24 |
25 | /**
26 | * @internal since v9.2.5
27 | */
28 | class ResultHandler
29 | {
30 | /**
31 | * Extract the log information from the current row and retrieve it as formatted string.
32 | */
33 | public static function getResultLog(array $resultRow): string
34 | {
35 | $content = '';
36 | if (is_array($resultRow) && array_key_exists('result_data', $resultRow)) {
37 | $requestContent = self::getJsonCompatibilityConverter()->convert($resultRow['result_data']) ?: [];
38 | if (is_bool($requestContent) || !array_key_exists('content', $requestContent)) {
39 | return $content;
40 | }
41 | $requestResult = self::getJsonCompatibilityConverter()->convert($requestContent['content']);
42 |
43 | if (is_array($requestResult) && array_key_exists('log', $requestResult)) {
44 | $content = implode(chr(10), $requestResult['log']);
45 | }
46 | }
47 | return $content;
48 | }
49 |
50 | public static function getResStatus(array|bool $requestContent): string
51 | {
52 | if (empty($requestContent)) {
53 | return '-';
54 | }
55 | if (is_bool($requestContent) || !array_key_exists('content', $requestContent)) {
56 | return 'Content index does not exist in requestContent array';
57 | }
58 |
59 | $requestResult = self::getJsonCompatibilityConverter()->convert($requestContent['content']);
60 | if (is_array($requestResult)) {
61 | if (empty($requestResult['errorlog'])) {
62 | return 'OK';
63 | }
64 | return implode("\n", $requestResult['errorlog']);
65 | }
66 |
67 | return 'Error - no info, sorry!';
68 | }
69 |
70 | /**
71 | * Find Fe vars
72 | */
73 | public static function getResFeVars(array $resultData): array
74 | {
75 | if (empty($resultData)) {
76 | return [];
77 | }
78 | $requestResult = self::getJsonCompatibilityConverter()->convert($resultData['content']);
79 | if (is_bool($requestResult)) {
80 | return [];
81 | }
82 | return $requestResult['vars'] ?? [];
83 | }
84 |
85 | private static function getJsonCompatibilityConverter(): JsonCompatibilityConverter
86 | {
87 | return GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/Classes/Controller/Backend/Helper/UrlBuilder.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 | use TYPO3\CMS\Backend\Routing\Exception\RouteNotFoundException;
22 | use TYPO3\CMS\Backend\Routing\UriBuilder;
23 | use TYPO3\CMS\Core\Http\Uri;
24 | use TYPO3\CMS\Core\Utility\GeneralUtility;
25 |
26 | /**
27 | * @internal since v9.2.5
28 | */
29 | class UrlBuilder
30 | {
31 | /**
32 | * Returns the URL to the current module, including $_GET['id'].
33 | *
34 | * @param array $uriParameters optional parameters to add to the URL
35 | *
36 | * @throws RouteNotFoundException
37 | */
38 | public static function getBackendModuleUrl(array $uriParameters = [], string $module = 'web_site_crawler'): Uri
39 | {
40 | $id = $GLOBALS['TYPO3_REQUEST']->getParsedBody()['id'] ?? $GLOBALS['TYPO3_REQUEST']->getQueryParams()['id'] ?? null;
41 | if ($id) {
42 | $uriParameters['id'] = $id;
43 | }
44 | /** @var UriBuilder $uriBuilder */
45 | $uriBuilder = GeneralUtility::makeInstance(UriBuilder::class);
46 | return $uriBuilder->buildUriFromRoute($module, $uriParameters);
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
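
A minimal usage sketch for the helper above, assuming it is called in a backend context where `$GLOBALS['TYPO3_REQUEST']` is available; the `SET[crawlaction]` parameter is only illustrative (borrowed from `ItemProvider` above):

```php
<?php

declare(strict_types=1);

use AOE\Crawler\Controller\Backend\Helper\UrlBuilder;

// Illustrative usage: build a link to the crawler backend module, keeping the
// current page id. The route defaults to 'web_site_crawler' as defined above.
$uri = UrlBuilder::getBackendModuleUrl([
    'SET[crawlaction]' => 'start',
]);

echo (string) $uri;
```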
/Classes/Converter/JsonCompatibilityConverter.php:
--------------------------------------------------------------------------------
1 |
9 | * (c) 2023- Tomas Norre Mikkelsen
10 | *
11 | * This file is part of the TYPO3 Crawler Extension.
12 | *
13 | * It is free software; you can redistribute it and/or modify it under
14 | * the terms of the GNU General Public License, either version 2
15 | * of the License, or any later version.
16 | *
17 | * For the full copyright and license information, please read the
18 | * LICENSE.txt file that was distributed with this source code.
19 | *
20 | * The TYPO3 project - inspiring people to share!
21 | */
22 |
23 | use Exception;
24 |
25 | /**
26 | * @internal since v9.2.5
27 | */
28 | class JsonCompatibilityConverter
29 | {
30 | /**
31 | * This is implemented because we want to switch from serialized data to JSON data when the crawler stores
32 | * entries in the database. To ensure that older crawler entries, which have already been stored as serialized
33 | * data, still work, this converter can be used for the reading part. The writing part will be done
34 | * in JSON from now on.
35 | * @see https://github.com/tomasnorre/crawler/issues/417
36 | *
37 | * @throws Exception
38 | */
39 | public function convert(string $dataString): array|bool
40 | {
41 | $decoded = '';
42 | try {
43 | $decoded = json_decode($dataString, true, 512, JSON_THROW_ON_ERROR);
44 | } catch (\JsonException) {
45 | // Do nothing as we want to continue with unserialize as a test.
46 | }
47 |
48 | if (is_array($decoded)) {
49 | return $decoded;
50 | }
51 |
52 | try {
53 | $deserialized = unserialize($dataString, [
54 | 'allowed_classes' => false,
55 | ]);
56 | } catch (\Throwable) {
57 | return false;
58 | }
59 |
60 | if (is_object($deserialized)) {
61 | throw new \RuntimeException('Objects are not allowed: ' . var_export($deserialized, true), 1_593_758_307);
62 | }
63 |
64 | if (is_array($deserialized)) {
65 | return $deserialized;
66 | }
67 |
68 | return false;
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
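
A minimal sketch of the two code paths described in the docblock above; the input strings are invented for illustration:

```php
<?php

declare(strict_types=1);

use AOE\Crawler\Converter\JsonCompatibilityConverter;
use TYPO3\CMS\Core\Utility\GeneralUtility;

$converter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);

// New-style queue entries are stored as JSON ...
$fromJson = $converter->convert('{"url":"https://example.org/"}');

// ... while legacy entries may still hold serialized PHP arrays.
$fromSerialized = $converter->convert('a:1:{s:3:"url";s:20:"https://example.org/";}');

// Both calls return an array on success; anything unreadable yields false.
```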
/Classes/CrawlStrategy/CallbackExecutionStrategy.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Controller\CrawlerController;
23 | use TYPO3\CMS\Core\Utility\GeneralUtility;
24 |
25 | /**
26 | * Used for hooks (e.g. crawling external files)
27 | * @internal since v12.0.0
28 | */
29 | class CallbackExecutionStrategy
30 | {
31 | /**
32 | * In the future, the callback should implement an interface.
33 | */
34 | public function fetchByCallback(string $callbackClassName, array $parameters, CrawlerController $crawlerController)
35 | {
36 | // Calling custom object
37 | $callBackObj = GeneralUtility::makeInstance($callbackClassName);
38 | return $callBackObj->crawler_execute($parameters, $crawlerController);
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/Classes/CrawlStrategy/CrawlStrategyFactory.php:
--------------------------------------------------------------------------------
1 | configurationProvider = $configurationProvider ?? GeneralUtility::makeInstance(
20 | ExtensionConfigurationProvider::class
21 | );
22 | }
23 |
24 | public function create(): CrawlStrategyInterface
25 | {
26 | $settings = $this->configurationProvider->getExtensionConfiguration();
27 | $extensionSettings = is_array($settings) ? $settings : [];
28 |
29 | if ($extensionSettings['makeDirectRequests'] ?? false) {
30 | /** @var CrawlStrategyInterface $instance */
31 | $instance = GeneralUtility::makeInstance(SubProcessExecutionStrategy::class, $this->configurationProvider);
32 | } else {
33 | $instance = GeneralUtility::makeInstance(GuzzleExecutionStrategy::class, $this->configurationProvider);
34 | }
35 |
36 | return $instance;
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/Classes/CrawlStrategy/CrawlStrategyInterface.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Psr\Http\Message\UriInterface;
23 |
24 | /**
25 | * @internal since v12.0.0
26 | */
27 | interface CrawlStrategyInterface
28 | {
29 | public function fetchUrlContents(UriInterface $url, string $crawlerId);
30 | }
31 |
--------------------------------------------------------------------------------
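
A minimal sketch of a custom strategy implementing this interface. The class, its namespace and the naive `file_get_contents()` fetch are purely illustrative; only the `X-T3Crawler` header name is taken from `GuzzleExecutionStrategy` below:

```php
<?php

declare(strict_types=1);

namespace MyVendor\MyExtension\CrawlStrategy;

use AOE\Crawler\CrawlStrategy\CrawlStrategyInterface;
use Psr\Http\Message\UriInterface;

// Hypothetical strategy: fetch the URL body with file_get_contents() and send
// the crawler id as the X-T3Crawler request header.
final class NaiveExecutionStrategy implements CrawlStrategyInterface
{
    public function fetchUrlContents(UriInterface $url, string $crawlerId)
    {
        $context = stream_context_create([
            'http' => [
                'header' => 'X-T3Crawler: ' . $crawlerId,
            ],
        ]);

        return file_get_contents((string) $url, false, $context) ?: false;
    }
}
```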
/Classes/CrawlStrategy/GuzzleExecutionStrategy.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use GuzzleHttp\Exception\ConnectException;
23 | use GuzzleHttp\Exception\RequestException;
24 | use Psr\Http\Message\ResponseInterface;
25 | use Psr\Http\Message\UriInterface;
26 | use Psr\Log\LoggerAwareInterface;
27 | use Psr\Log\LoggerAwareTrait;
28 | use TYPO3\CMS\Core\Http\Client\GuzzleClientFactory;
29 | use TYPO3\CMS\Core\Http\RequestFactory;
30 | use TYPO3\CMS\Core\Utility\GeneralUtility;
31 |
32 | /**
33 | * Calls Guzzle / CURL (based on TYPO3 settings) for fetching a URL.
34 | * @internal since v12.0.0
35 | */
36 | class GuzzleExecutionStrategy implements LoggerAwareInterface, CrawlStrategyInterface
37 | {
38 | use LoggerAwareTrait;
39 |
40 | /**
41 | * Sets up a cURL / Guzzle request for fetching the given URL.
42 | *
43 | * @return bool|mixed
44 | */
45 | public function fetchUrlContents(UriInterface $url, string $crawlerId)
46 | {
47 | $reqHeaders = $this->buildRequestHeaders($crawlerId);
48 |
49 | $options = [
50 | 'headers' => $reqHeaders,
51 | ];
52 | if ($url->getUserInfo()) {
53 | $options['auth'] = explode(':', $url->getUserInfo());
54 | }
55 | try {
56 | $url = (string) $url;
57 | $response = $this->getResponse($url, $options);
58 | return unserialize($response->getHeaderLine('X-T3Crawler-Meta'));
59 | } catch (RequestException $e) {
60 | $response = $e->getResponse();
61 | $message = ($response ? $response->getStatusCode() : 0)
62 | . chr(32)
63 | . ($response ? $response->getReasonPhrase() : $e->getMessage());
64 |
65 | $this->logger->debug(
66 | sprintf('Error while opening "%s" - ' . $message, $url),
67 | [
68 | 'crawlerId' => $crawlerId,
69 | ]
70 | );
71 | return $message;
72 | } catch (ConnectException $e) {
73 | $message = $e->getCode() . chr(32) . $e->getMessage();
74 |
75 | $this->logger->debug(
76 | sprintf('Error while opening "%s" - ' . $message, $url),
77 | [
78 | 'crawlerId' => $crawlerId,
79 | ]
80 | );
81 | return $message;
82 | }
83 | }
84 |
85 | protected function getResponse(string $url, array $options): ResponseInterface
86 | {
87 | $guzzleClientFactory = GeneralUtility::makeInstance(GuzzleClientFactory::class);
88 | return GeneralUtility::makeInstance(RequestFactory::class, $guzzleClientFactory)
89 | ->request($url, 'GET', $options);
90 | }
91 |
92 | /**
93 | * Builds HTTP request headers.
94 | */
95 | private function buildRequestHeaders(string $crawlerId): array
96 | {
97 | return [
98 | 'Connection' => 'close',
99 | 'X-T3Crawler' => $crawlerId,
100 | 'User-Agent' => 'TYPO3 crawler',
101 | ];
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/Classes/Crawler.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Core\Environment;
23 | use TYPO3\CMS\Core\SingletonInterface;
24 | use TYPO3\CMS\Core\Utility\GeneralUtility;
25 |
26 | /**
27 | * @internal since v9.2.5
28 | */
29 | final class Crawler implements SingletonInterface
30 | {
31 | private readonly string $processFilename;
32 |
33 | public function __construct(?string $processFilename = null)
34 | {
35 | $this->processFilename = $processFilename ?: Environment::getVarPath() . '/lock/tx_crawler.proc';
36 | $this->setDisabled(false);
37 | $pathInfo = pathinfo($this->processFilename);
38 | GeneralUtility::mkdir_deep($pathInfo['dirname']);
39 | }
40 |
41 | public function setDisabled(bool $disabled = true): void
42 | {
43 | if ($disabled) {
44 | GeneralUtility::writeFile($this->processFilename, '');
45 | } elseif (is_file($this->processFilename)) {
46 | unlink($this->processFilename);
47 | }
48 | }
49 |
50 | public function isDisabled(): bool
51 | {
52 | return is_file($this->processFilename);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/Classes/Domain/Model/ProcessCollection.php:
--------------------------------------------------------------------------------
1 |
14 | *
15 | * All rights reserved
16 | *
17 | * This script is part of the TYPO3 project. The TYPO3 project is
18 | * free software; you can redistribute it and/or modify
19 | * it under the terms of the GNU General Public License as published by
20 | * the Free Software Foundation; either version 3 of the License, or
21 | * (at your option) any later version.
22 | *
23 | * The GNU General Public License can be found at
24 | * http://www.gnu.org/copyleft/gpl.html.
25 | *
26 | * This script is distributed in the hope that it will be useful,
27 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 | * GNU General Public License for more details.
30 | *
31 | * This copyright notice MUST APPEAR in all copies of the script!
32 | ***************************************************************/
33 |
34 | /**
35 | * @internal since v9.2.5
36 | */
37 | class ProcessCollection extends \ArrayObject
38 | {
39 | /**
40 | * Method to retrieve an element from the collection.
41 | * @throws NoIndexFoundException
42 | */
43 | public function offsetGet(mixed $index): Process
44 | {
45 | if (!parent::offsetExists($index)) {
46 | throw new NoIndexFoundException('Index "' . var_export(
47 | $index,
48 | true
49 | ) . '" for \AOE\Crawler\Domain\Model\Process is not available', 1_593_714_823);
50 | }
51 | return parent::offsetGet($index);
52 | }
53 |
54 | /**
55 | * Method to add an element to the collection.
56 | *
57 | * @param Process $subject
58 | * @throws InvalidArgumentException
59 | */
60 | public function offsetSet(mixed $index, $subject): void
61 | {
62 | if (!$subject instanceof Process) {
63 | throw new \InvalidArgumentException(
64 | 'Wrong parameter type given, "\AOE\Crawler\Domain\Model\Process" expected!',
65 | 1_593_714_822
66 | );
67 | }
68 |
69 | parent::offsetSet($index, $subject);
70 | }
71 |
72 | /**
73 | * Method to append an element to the collection
74 | * @param Process $subject
75 | * @throws InvalidArgumentException
76 | */
77 | public function append($subject): void
78 | {
79 | if (!$subject instanceof Process) {
80 | throw new \InvalidArgumentException(
81 | 'Wrong parameter type given, "\AOE\Crawler\Domain\Model\Process" expected!',
82 | 1_593_714_821
83 | );
84 | }
85 |
86 | parent::append($subject);
87 | }
88 |
89 | /**
90 | * returns array of process ids of the current collection
91 | * @return array
92 | */
93 | public function getProcessIds()
94 | {
95 | $result = [];
96 | foreach ($this->getIterator() as $value) {
97 | $result[] = $value->getProcessId();
98 | }
99 | return $result;
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
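
A minimal sketch of the collection's guarantees as implemented above: only `Process` instances are accepted, and reading a missing index raises `NoIndexFoundException` instead of returning `null`:

```php
<?php

declare(strict_types=1);

use AOE\Crawler\Domain\Model\ProcessCollection;
use AOE\Crawler\Exception\NoIndexFoundException;

$collection = new ProcessCollection();

// Only \AOE\Crawler\Domain\Model\Process instances may be added.
try {
    $collection->append(new \stdClass());
} catch (\InvalidArgumentException $e) {
    // 1593714821: wrong parameter type given
}

// Reading an index that was never set throws instead of returning null.
try {
    $collection->offsetGet(42);
} catch (NoIndexFoundException $e) {
    // 1593714823: index not available
}
```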
/Classes/Domain/Repository/ConfigurationRepository.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Doctrine\DBAL\ArrayParameterType;
23 | use TYPO3\CMS\Backend\Utility\BackendUtility;
24 | use TYPO3\CMS\Core\Database\ConnectionPool;
25 | use TYPO3\CMS\Core\Database\Query\QueryBuilder;
26 | use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
27 | use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
28 | use TYPO3\CMS\Core\Utility\GeneralUtility;
29 | use TYPO3\CMS\Extbase\Persistence\Repository;
30 |
31 | /**
32 | * @internal since v9.2.5
33 | */
34 | class ConfigurationRepository extends Repository
35 | {
36 | final public const TABLE_NAME = 'tx_crawler_configuration';
37 |
38 | /**
39 | * Traverses up the rootline of a page and fetches all crawler records.
40 | */
41 | public function getCrawlerConfigurationRecordsFromRootLine(int $pageId, array $parentIds = []): array
42 | {
43 | if (empty($parentIds)) {
44 | $pageIdsInRootLine = [];
45 | $rootLine = BackendUtility::BEgetRootLine($pageId);
46 |
47 | foreach ($rootLine as $pageInRootLine) {
48 | $pageIdsInRootLine[] = (int) $pageInRootLine['uid'];
49 | }
50 | } else {
51 | $pageIdsInRootLine = $parentIds;
52 | }
53 |
54 | $queryBuilder = $this->createQueryBuilder();
55 | $queryBuilder
56 | ->getRestrictions()->removeAll()
57 | ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
58 | ->add(GeneralUtility::makeInstance(HiddenRestriction::class));
59 | return $queryBuilder
60 | ->select('*')
61 | ->from(self::TABLE_NAME)
62 | ->where(
63 | $queryBuilder->expr()->in(
64 | 'pid',
65 | $queryBuilder->createNamedParameter($pageIdsInRootLine, ArrayParameterType::INTEGER)
66 | )
67 | )
68 | ->orderBy('name')
69 | ->executeQuery()
70 | ->fetchAllAssociative();
71 | }
72 |
73 | protected function createQueryBuilder(): QueryBuilder
74 | {
75 | return GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(self::TABLE_NAME);
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/Classes/Event/AfterQueueItemAddedEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | final class AfterQueueItemAddedEvent
26 | {
27 | /**
28 | * @param int|string $queueId
29 | */
30 | public function __construct(
31 | private $queueId,
32 | private array $fieldArray
33 | ) {
34 | }
35 |
36 | public function getQueueId(): int|string
37 | {
38 | return $this->queueId;
39 | }
40 |
41 | public function getFieldArray(): array
42 | {
43 | return $this->fieldArray;
44 | }
45 |
46 | public function setFieldArray(array $fieldArray): void
47 | {
48 | $this->fieldArray = $fieldArray;
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/Classes/Event/AfterUrlAddedToQueueEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | final class AfterUrlAddedToQueueEvent
26 | {
27 | public function __construct(
28 | private readonly string $uid,
29 | private readonly array $fieldArray
30 | ) {
31 | }
32 |
33 | public function getUid(): string
34 | {
35 | return $this->uid;
36 | }
37 |
38 | public function getFieldArray(): array
39 | {
40 | return $this->fieldArray;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/Classes/Event/AfterUrlCrawledEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | final class AfterUrlCrawledEvent
26 | {
27 | public function __construct(
28 | private readonly string $url,
29 | private readonly array $result
30 | ) {
31 | }
32 |
33 | public function getUrl(): string
34 | {
35 | return $this->url;
36 | }
37 |
38 | public function getResult(): array
39 | {
40 | return $this->result;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/Classes/Event/BeforeQueueItemAddedEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | final class BeforeQueueItemAddedEvent
26 | {
27 | public function __construct(
28 | private readonly int $queueId,
29 | private array $queueRecord
30 | ) {
31 | }
32 |
33 | public function getQueueId(): int
34 | {
35 | return $this->queueId;
36 | }
37 |
38 | public function getQueueRecord(): array
39 | {
40 | return $this->queueRecord;
41 | }
42 |
43 | public function setQueueRecord(array $queueRecord): void
44 | {
45 | $this->queueRecord = $queueRecord;
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/Classes/Event/InvokeQueueChangeEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Domain\Model\Reason;
23 |
24 | /**
25 | * @internal since v12.0.0
26 | */
27 | final class InvokeQueueChangeEvent
28 | {
29 | public function __construct(
30 | private readonly Reason $reason
31 | ) {
32 | }
33 |
34 | public function getReasonDetailedText(): string
35 | {
36 | return $this->reason->getDetailText();
37 | }
38 |
39 | public function getReasonText(): string
40 | {
41 | return $this->reason->getReason();
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/Classes/Event/ModifySkipPageEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | final class ModifySkipPageEvent
26 | {
27 | private bool|string $skipped = false;
28 |
29 | public function __construct(
30 | private readonly array $pageRow
31 | ) {
32 | }
33 |
34 | public function isSkipped(): false|string
35 | {
36 | return $this->skipped;
37 | }
38 |
39 | public function setSkipped(false|string $skipped): void
40 | {
41 | $this->skipped = $skipped;
42 | }
43 |
44 | public function getPageRow(): array
45 | {
46 | return $this->pageRow;
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
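
A minimal listener sketch for this event, assuming registration as a PSR-14 listener (see the `_ModifySkipPageEventListener.php` / `_services.yaml` examples under `Documentation/Features/Events`); the class name and the doktype check are illustrative:

```php
<?php

declare(strict_types=1);

namespace MyVendor\MyExtension\EventListener;

use AOE\Crawler\Event\ModifySkipPageEvent;

// Illustrative listener: skip sys folders (doktype 254) and give a reason.
final class SkipSysFolderPagesEventListener
{
    public function __invoke(ModifySkipPageEvent $event): void
    {
        $pageRow = $event->getPageRow();

        if ((int) ($pageRow['doktype'] ?? 0) === 254) {
            // Setting a string marks the page as skipped and carries the reason.
            $event->setSkipped('Sys folders are not crawled');
        }
    }
}
```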
/Classes/EventListener/AfterQueueItemAddedEventListener.php:
--------------------------------------------------------------------------------
1 | getConnectionForTable(QueueRepository::TABLE_NAME)
17 | ->update(QueueRepository::TABLE_NAME, $event->getFieldArray(), [
18 | 'qid' => (int) $event->getQueueId(),
19 | ]);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/Classes/EventListener/ShouldUseCachedPageDataIfAvailableEventListener.php:
--------------------------------------------------------------------------------
1 | getRequest()->getAttribute('tx_crawler') === null) {
18 | return;
19 | }
20 | $event->setShouldUseCachedPageData(false);
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/Classes/Exception/CommandNotFoundException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | class CommandNotFoundException extends \Exception
26 | {
27 | }
28 |
--------------------------------------------------------------------------------
/Classes/Exception/CrawlerObjectException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | * @deprecated since 12.0.5 will be removed in v14.x
25 | */
26 | class CrawlerObjectException extends \Exception
27 | {
28 | }
29 |
--------------------------------------------------------------------------------
/Classes/Exception/ExtensionSettingsException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | class ExtensionSettingsException extends \Exception
26 | {
27 | }
28 |
--------------------------------------------------------------------------------
/Classes/Exception/NoIndexFoundException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | class NoIndexFoundException extends \Exception
26 | {
27 | }
28 |
--------------------------------------------------------------------------------
/Classes/Exception/ProcessException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | class ProcessException extends \Exception
26 | {
27 | }
28 |
--------------------------------------------------------------------------------
/Classes/Exception/TimeStampException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | * @deprecated since 12.0.5 will be removed in v14.x
25 | */
26 | class TimeStampException extends \Exception
27 | {
28 | }
29 |
--------------------------------------------------------------------------------
/Classes/Helper/Sleeper/NullSleeper.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /*
23 | * @internal
24 | * @codeCoverageIgnore
25 | */
26 | final class NullSleeper implements SleeperInterface
27 | {
28 | public function sleep(int $seconds): void
29 | {
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/Classes/Helper/Sleeper/SleeperInterface.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | interface SleeperInterface
26 | {
27 | public function sleep(int $seconds): void;
28 | }
29 |
--------------------------------------------------------------------------------
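
A minimal sketch of why this interface exists: a consumer depends on `SleeperInterface`, so production code can use `SystemSleeper` while tests inject `NullSleeper` and run without delay. The `RateLimitedRunner` class is invented for illustration:

```php
<?php

declare(strict_types=1);

namespace MyVendor\MyExtension\Service;

use AOE\Crawler\Helper\Sleeper\SleeperInterface;

// Hypothetical consumer: delays between work items without hard-coding sleep(),
// so tests can inject NullSleeper and run instantly.
final class RateLimitedRunner
{
    public function __construct(
        private readonly SleeperInterface $sleeper
    ) {
    }

    public function run(callable $workItem, int $delayInSeconds): void
    {
        $workItem();
        $this->sleeper->sleep($delayInSeconds);
    }
}
```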
/Classes/Helper/Sleeper/SystemSleeper.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /*
23 | * @internal
24 | */
25 | final class SystemSleeper implements SleeperInterface
26 | {
27 | public function sleep(int $seconds): void
28 | {
29 | \sleep($seconds);
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/Classes/Hooks/CrawlerHookInterface.php:
--------------------------------------------------------------------------------
1 |
9 | * (c) 2021- Tomas Norre Mikkelsen
10 | *
11 | * This file is part of the TYPO3 Crawler Extension.
12 | *
13 | * It is free software; you can redistribute it and/or modify it under
14 | * the terms of the GNU General Public License, either version 2
15 | * of the License, or any later version.
16 | *
17 | * For the full copyright and license information, please read the
18 | * LICENSE.txt file that was distributed with this source code.
19 | *
20 | * The TYPO3 project - inspiring people to share!
21 | */
22 |
23 | /**
24 | * @internal since v12.0.0
25 | */
26 | interface CrawlerHookInterface
27 | {
28 | public function crawler_init(): void;
29 | }
30 |
--------------------------------------------------------------------------------
/Classes/Hooks/DataHandlerHook.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Domain\Repository\QueueRepository;
23 | use AOE\Crawler\Service\QueueService;
24 | use TYPO3\CMS\Core\DataHandling\DataHandler;
25 | use TYPO3\CMS\Core\Domain\Repository\PageRepository;
26 | use TYPO3\CMS\Core\Utility\GeneralUtility;
27 |
28 | /**
29 | * @internal since v9.2.5
30 | */
31 | class DataHandlerHook
32 | {
33 | /**
34 | * @noRector \Rector\DeadCode\Rector\ClassMethod\RemoveUnusedParameterRector
35 | */
36 | public function addFlushedPagesToCrawlerQueue(array $parameters, DataHandler $dataHandler): void
37 | {
38 | $pageIdsToBeFlushedFromCache = $parameters['pageIdArray'];
39 | if (empty($pageIdsToBeFlushedFromCache)) {
40 | return;
41 | }
42 | foreach ($pageIdsToBeFlushedFromCache as $pageId) {
43 | $pageId = (int) $pageId;
44 | if ($pageId < 1 || empty($this->getPageRepository()->getPage($pageId))) {
45 | continue;
46 | }
47 | if ($this->getQueueRepository()->isPageInQueue($pageId)) {
48 | continue;
49 | }
50 | $this->getQueueService()->addPageToQueue($pageId);
51 | }
52 | }
53 |
54 | public function getQueueRepository(): QueueRepository
55 | {
56 | return GeneralUtility::makeInstance(QueueRepository::class);
57 | }
58 |
59 | public function getQueueService(): QueueService
60 | {
61 | return GeneralUtility::makeInstance(QueueService::class);
62 | }
63 |
64 | public function getPageRepository(): PageRepository
65 | {
66 | return GeneralUtility::makeInstance(PageRepository::class);
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/Classes/QueueExecutor.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Controller\CrawlerController;
23 | use AOE\Crawler\Converter\JsonCompatibilityConverter;
24 | use AOE\Crawler\CrawlStrategy\CallbackExecutionStrategy;
25 | use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
26 | use AOE\Crawler\CrawlStrategy\CrawlStrategyInterface;
27 | use AOE\Crawler\Event\AfterUrlCrawledEvent;
28 | use TYPO3\CMS\Core\EventDispatcher\EventDispatcher;
29 | use TYPO3\CMS\Core\Http\Uri;
30 | use TYPO3\CMS\Core\SingletonInterface;
31 | use TYPO3\CMS\Core\Utility\GeneralUtility;
32 |
33 | /**
34 | * Fetches a URL based on the selected strategy or via a callback.
35 | * @internal since v9.2.5
36 | */
37 | class QueueExecutor implements SingletonInterface
38 | {
39 | protected CrawlStrategyInterface $crawlStrategy;
40 |
41 | public function __construct(
42 | CrawlStrategyFactory $crawlStrategyFactory,
43 | private readonly EventDispatcher $eventDispatcher
44 | ) {
45 | $this->crawlStrategy = $crawlStrategyFactory->create();
46 | }
47 |
48 | /**
49 | * Takes a queue record and fetches the contents of the URL.
50 | * In the future, updating the queue item & additional signal/slot/events should also happen in here.
51 | *
52 | * @return array|bool|mixed|string
53 | */
54 | public function executeQueueItem(array $queueItem, CrawlerController $crawlerController)
55 | {
56 | $parameters = '';
57 | if (isset($queueItem['parameters'])) {
58 | // Decode parameters:
59 | /** @var JsonCompatibilityConverter $jsonCompatibleConverter */
60 | $jsonCompatibleConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
61 | $parameters = $jsonCompatibleConverter->convert($queueItem['parameters']);
62 | }
63 |
64 | if (!is_array($parameters) || empty($parameters)) {
65 | return 'ERROR';
66 | }
67 | if (isset($parameters['_CALLBACKOBJ'])) {
68 | $className = $parameters['_CALLBACKOBJ'];
69 | unset($parameters['_CALLBACKOBJ']);
70 | $result = GeneralUtility::makeInstance(CallbackExecutionStrategy::class)
71 | ->fetchByCallback($className, $parameters, $crawlerController);
72 | $result = [
73 | 'content' => json_encode($result),
74 | ];
75 | } else {
76 | // Regular FE request
77 | $crawlerId = $this->generateCrawlerIdFromQueueItem($queueItem);
78 |
79 | $url = new Uri($parameters['url']);
80 | $result = $this->crawlStrategy->fetchUrlContents($url, $crawlerId);
81 | if ($result !== false) {
82 | $result = [
83 | 'content' => json_encode($result),
84 | ];
85 | $this->eventDispatcher->dispatch(new AfterUrlCrawledEvent($parameters['url'], $result));
86 | }
87 | }
88 | return $result;
89 | }
90 |
91 | protected function generateCrawlerIdFromQueueItem(array $queueItem): string
92 | {
93 | return $queueItem['qid'] . ':' . md5(
94 | $queueItem['qid'] . '|' . $queueItem['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']
95 | );
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/Classes/Service/BackendModuleScriptUrlService.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Psr\Http\Message\ServerRequestInterface;
23 | use TYPO3\CMS\Backend\Routing\UriBuilder;
24 | use TYPO3\CMS\Core\Utility\GeneralUtility;
25 |
26 | class BackendModuleScriptUrlService
27 | {
28 | public function buildScriptUrl(
29 | ServerRequestInterface $request,
30 | string $elementName,
31 | int $pageUid,
32 | array $queryParameters,
33 | string $queryString = ''
34 | ): string {
35 | $mainParams = [
36 | 'id' => $pageUid,
37 | ];
38 | $uriBuilder = GeneralUtility::makeInstance(UriBuilder::class);
39 | $route = $request->getAttribute('route');
40 | $scriptUrl = (string) $uriBuilder->buildUriFromRoute($route->getOption('_identifier'), $mainParams);
41 |
42 | return $scriptUrl . ($queryString . $this->getAdditionalQueryParams(
43 | $elementName,
44 | $queryParameters
45 | ) . '&' . $elementName . '=${value}');
46 | }
47 |
48 | /**
49 | * Build query string with affected checkbox/dropdown value removed.
50 | */
51 | private function getAdditionalQueryParams(string $keyToBeRemoved, array $queryParameters): string
52 | {
53 | $queryString = '';
54 | unset($queryParameters[$keyToBeRemoved]);
55 | foreach ($queryParameters as $key => $value) {
56 | $queryString .= "&{$key}={$value}";
57 | }
58 | return $queryString;
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/Classes/Service/ProcessInstructionService.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Utility\GeneralUtility;
23 |
24 | /**
25 | * @internal since v11.0.3
26 | */
27 | class ProcessInstructionService
28 | {
29 | public function isAllowed(string $processInstruction, array $incoming): bool
30 | {
31 | if (empty($incoming)) {
32 | return true;
33 | }
34 |
35 | foreach ($incoming as $pi) {
36 | if (GeneralUtility::inList($processInstruction, $pi)) {
37 | return true;
38 | }
39 | }
40 | return false;
41 | }
42 | }
43 |
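44 | // Usage example (tx_myext_pi is only an illustrative name):
45 | // isAllowed('tx_indexedsearch_reindex,tx_myext_pi', ['tx_indexedsearch_reindex']) returns true,
46 | // and an empty $incoming array is always allowed.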
--------------------------------------------------------------------------------
/Classes/Service/QueueService.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Controller\CrawlerController;
23 | use TYPO3\CMS\Core\Domain\Repository\PageRepository;
24 | use TYPO3\CMS\Core\Utility\GeneralUtility;
25 |
26 | /**
27 | * @internal since v9.2.5
28 | */
29 | class QueueService
30 | {
31 | private ?\AOE\Crawler\Controller\CrawlerController $crawlerController = null;
32 |
33 | public function injectCrawlerController(CrawlerController $crawlerController): void
34 | {
35 | $this->crawlerController = $crawlerController;
36 | $this->crawlerController->setID = GeneralUtility::md5int(microtime());
37 | }
38 |
39 | public function addPageToQueue(int $pageUid, int $time = 0): void
40 | {
41 | if ($this->crawlerController === null) {
42 | return;
43 | }
44 |
45 | $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($pageUid, true);
46 | $configurations = $this->crawlerController->getUrlsForPageRow($pageData);
47 | // Currently this is only used from the DataHandlerHook, and we don't know of any allowed/disallowed
48 | // configurations when clearing the cache, therefore we allow all configurations in this case.
49 | // The next lines could be skipped, as they simply return the incoming configurations, but they are
50 | // kept for visibility and later implementation, as they do no harm.
51 | $allowedConfigurations = [];
52 | $configurations = ConfigurationService::removeDisallowedConfigurations($allowedConfigurations, $configurations);
53 | $downloadUrls = [];
54 | $duplicateTrack = [];
55 |
56 | if (is_array($configurations)) {
57 | foreach ($configurations as $configuration) {
58 | //enable inserting of entries
59 | $this->crawlerController->registerQueueEntriesInternallyOnly = false;
60 | $this->crawlerController->urlListFromUrlArray(
61 | $configuration,
62 | $pageData,
63 | $time,
64 | 300,
65 | true,
66 | false,
67 | $duplicateTrack,
68 | $downloadUrls,
69 | array_keys($this->getCrawlerProcInstructions())
70 | );
71 |
72 | //reset the queue because the entries have been written to the db
73 | unset($this->crawlerController->queueEntries);
74 | }
75 | }
76 | }
77 |
78 | /**
79 | * Reads the registered processingInstructions of the crawler
80 | */
81 | private function getCrawlerProcInstructions(): array
82 | {
83 | $crawlerProcInstructions = [];
84 | if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
85 | foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $configuration) {
86 | $crawlerProcInstructions[$configuration['key']] = $configuration['value'];
87 | }
88 | }
89 |
90 | return $crawlerProcInstructions;
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/Classes/Service/UserService.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Utility\GeneralUtility;
23 |
24 | /**
25 | * @internal since v9.2.5
26 | */
27 | class UserService
28 | {
29 | public static function hasGroupAccess(string $groupList, string $accessList): bool
30 | {
31 | if (empty($accessList)) {
32 | return true;
33 | }
34 | foreach (explode(',', $groupList) as $groupUid) {
35 | if (GeneralUtility::inList($accessList, $groupUid)) {
36 | return true;
37 | }
38 | }
39 | return false;
40 | }
41 | }
42 |
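43 | // Usage example: UserService::hasGroupAccess('1,2,5', '5,7') returns true because
44 | // group 5 is contained in the access list; an empty access list always grants access.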
--------------------------------------------------------------------------------
/Classes/Utility/HookUtility.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Hooks\ProcessCleanUpHook;
23 | use Psr\Http\Message\ServerRequestInterface;
24 | use TYPO3\CMS\Core\Http\ApplicationType;
25 |
26 | /**
27 | * @codeCoverageIgnore
28 | * @internal since v9.2.5
29 | */
30 | class HookUtility
31 | {
32 | /**
33 | * Registers hooks
34 | *
35 | * @param string $extKey
36 | */
37 | public static function registerHooks($extKey): void
38 | {
39 | // Activating Crawler cli_hooks
40 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF'][$extKey]['cli_hooks'][] =
41 | ProcessCleanUpHook::class;
42 |
43 | // Activating refresh hooks
44 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF'][$extKey]['refresh_hooks'][] =
45 | ProcessCleanUpHook::class;
46 |
47 | // Env-dependent
48 | if (($GLOBALS['TYPO3_REQUEST'] ?? null) instanceof ServerRequestInterface
49 | && ApplicationType::fromRequest($GLOBALS['TYPO3_REQUEST'])->isBackend()
50 | ) {
51 | self::registerBackendHooks();
52 | }
53 | }
54 |
55 | private static function registerBackendHooks(): void
56 | {
57 | // DataHandler clear page cache pre-processing
58 | $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['clearPageCacheEval'][] =
59 | "AOE\Crawler\Hooks\DataHandlerHook->addFlushedPagesToCrawlerQueue";
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/Classes/Utility/MessageUtility.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Messaging\FlashMessage;
23 | use TYPO3\CMS\Core\Messaging\FlashMessageService;
24 | use TYPO3\CMS\Core\Type\ContextualFeedbackSeverity;
25 | use TYPO3\CMS\Core\Utility\GeneralUtility;
26 |
27 | /**
28 | * @internal since v9.2.5
29 | */
30 | class MessageUtility
31 | {
32 | /**
33 | * Add notice message to the user interface.
34 | */
35 | public static function addNoticeMessage(string $message): void
36 | {
37 | self::addMessage($message, ContextualFeedbackSeverity::NOTICE);
38 | }
39 |
40 | /**
41 | * Add error message to the user interface.
42 | */
43 | public static function addErrorMessage(string $message): void
44 | {
45 | self::addMessage($message, ContextualFeedbackSeverity::ERROR);
46 | }
47 |
48 | /**
49 | * Add warning message to the user interface.
50 | */
51 | public static function addWarningMessage(string $message): void
52 | {
53 | self::addMessage($message, ContextualFeedbackSeverity::WARNING);
54 | }
55 |
56 | /**
57 | * Adds a message to the flash message queue
58 | *
59 | * @param string $message the message itself
60 | * @param ContextualFeedbackSeverity $severity severity of the message (defaults to OK)
61 | */
62 | private static function addMessage(
63 | string $message,
64 | ContextualFeedbackSeverity $severity = ContextualFeedbackSeverity::OK
65 | ): void {
66 | $message = GeneralUtility::makeInstance(FlashMessage::class, $message, '', $severity);
67 |
68 | /** @var FlashMessageService $flashMessageService */
69 | $flashMessageService = GeneralUtility::makeInstance(FlashMessageService::class);
70 | $flashMessageService->getMessageQueueByIdentifier()->addMessage($message);
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/Classes/Utility/PhpBinaryUtility.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23 | use AOE\Crawler\Exception\CommandNotFoundException;
24 | use AOE\Crawler\Exception\ExtensionSettingsException;
25 | use TYPO3\CMS\Core\Utility\CommandUtility;
26 | use TYPO3\CMS\Core\Utility\GeneralUtility;
27 |
28 | /**
29 | * @internal since v9.2.5
30 | */
31 | class PhpBinaryUtility
32 | {
33 | public static function getPhpBinary(): string
34 | {
35 | $extensionSettings = GeneralUtility::makeInstance(
36 | ExtensionConfigurationProvider::class
37 | )->getExtensionConfiguration();
38 |
39 | if (empty($extensionSettings)) {
40 | throw new ExtensionSettingsException('ExtensionSettings are empty', 1_587_066_853);
41 | }
42 |
43 | if (empty($extensionSettings['phpPath'])) {
44 | $phpPath = CommandUtility::getCommand($extensionSettings['phpBinary']);
45 | if ($phpPath === false) {
46 | throw new CommandNotFoundException(
47 | 'The phpBinary: "' . $extensionSettings['phpBinary'] . '" could not be found!',
48 | 1_587_068_215
49 | );
50 | }
51 | } else {
52 | $phpPath = $extensionSettings['phpPath'];
53 | }
54 |
55 | return $phpPath;
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/Classes/Utility/TcaUtility.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
23 |
24 | /**
25 | * @internal since v9.2.5
26 | */
27 | class TcaUtility
28 | {
29 | /**
30 | * Get crawler processing instructions.
31 | * This function is called as an itemsProcFunc in tx_crawler_configuration.processing_instruction_filter
32 | *
33 | * @return array
34 | */
35 | public function getProcessingInstructions(array $configuration)
36 | {
37 | if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] ?? null)) {
38 | foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $extensionKey => $extensionConfiguration) {
39 | $configuration['items'][] = [
40 | 'label' => $extensionConfiguration['value'] . ' [' . $extensionConfiguration['key'] . ']',
41 | 'value' => $extensionConfiguration['key'],
42 | 'icon' => $this->getExtensionIcon($extensionKey),
43 | ];
44 | }
45 | }
46 |
47 | return $configuration;
48 | }
49 |
50 | /**
51 | * Get the path to the extension icon for the given extension key
52 | *
53 | * @param string $extensionKey Like staticfilecache or indexed_search
54 | * @return string
55 | */
56 | private function getExtensionIcon($extensionKey)
57 | {
58 | return ExtensionManagementUtility::getExtensionIcon(ExtensionManagementUtility::extPath($extensionKey), true);
59 | }
60 | }
61 |
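62 | // Registration example (illustrative names) that feeds the itemsProcFunc above,
63 | // typically placed in a third-party extension's ext_localconf.php:
64 | //
65 | //   $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']['my_extension'] = [
66 | //       'key' => 'tx_myextension_reindex',
67 | //       'value' => 'Re-index my records',
68 | //   ];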
--------------------------------------------------------------------------------
/Classes/Value/CrawlAction.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Assert\Assert;
23 |
24 | /**
25 | * @internal since v9.2.5
26 | */
27 | final class CrawlAction implements \Stringable
28 | {
29 | private readonly string $crawlAction;
30 |
31 | public function __construct(string $crawlAction)
32 | {
33 | Assert::that($crawlAction)
34 | ->inArray(['start', 'log', 'multiprocess']);
35 |
36 | $this->crawlAction = $crawlAction;
37 | }
38 |
39 | public function __toString(): string
40 | {
41 | return $this->crawlAction;
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/Classes/Value/QueueFilter.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Assert\Assert;
23 |
24 | /**
25 | * @internal since v9.2.5
26 | */
27 | class QueueFilter implements \Stringable
28 | {
29 | private readonly string $queueFilter;
30 |
31 | public function __construct(string $queueFilter = 'all')
32 | {
33 | Assert::that($queueFilter)
34 | ->inArray(['all', 'pending', 'finished']);
35 |
36 | $this->queueFilter = $queueFilter;
37 | }
38 |
39 | public function __toString(): string
40 | {
41 | return $this->queueFilter;
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/Classes/Value/QueueRow.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal
24 | */
25 | class QueueRow
26 | {
27 | public string $pageTitleHTML = '';
28 | public string $message = '';
29 | public string $configurationKey = '';
30 | public string $parameterConfig = '';
31 | public string $valuesExpanded = '';
32 | public string $urls = '';
33 | public array $options = [];
34 | public string $parameters = '';
35 |
36 | public function __construct(
37 | public string $pageTitle = ''
38 | ) {
39 | }
40 |
41 | public function setPageTitleHTML(string $pageTitleHTML): void
42 | {
43 | $this->pageTitleHTML = $pageTitleHTML;
44 | }
45 |
46 | public function setMessage(string $message): void
47 | {
48 | $this->message = $message;
49 | }
50 |
51 | public function setConfigurationKey(string $configurationKey): void
52 | {
53 | $this->configurationKey = $configurationKey;
54 | }
55 |
56 | public function setParameterConfig(string $parameterConfig): void
57 | {
58 | $this->parameterConfig = $parameterConfig;
59 | }
60 |
61 | public function setValuesExpanded(string $valuesExpanded): void
62 | {
63 | $this->valuesExpanded = $valuesExpanded;
64 | }
65 |
66 | public function setUrls(string $urls): void
67 | {
68 | $this->urls = $urls;
69 | }
70 |
71 | public function setOptions(array $options): void
72 | {
73 | $this->options = $options;
74 | }
75 |
76 | public function setParameters(string $parameters): void
77 | {
78 | $this->parameters = $parameters;
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/Classes/Writer/FileWriter/CsvWriter/CrawlerCsvWriter.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Utility\CsvUtility;
23 |
24 | /**
25 | * @internal since v9.2.5
26 | */
27 | final class CrawlerCsvWriter implements CsvWriterInterface
28 | {
29 | private const CARRIAGE_RETURN = 13;
30 | private const LINE_FEED = 10;
31 |
32 | public function arrayToCsv(array $records): string
33 | {
34 | $csvLines = [];
35 | reset($records);
36 |
37 | $csvLines[] = $this->getRowHeaders($records);
38 | foreach ($records as $row) {
39 | $csvLines[] = CsvUtility::csvValues($row);
40 | }
41 |
42 | return implode(chr(self::CARRIAGE_RETURN) . chr(self::LINE_FEED), $csvLines);
43 | }
44 |
45 | private function getRowHeaders(array $lines): string
46 | {
47 | $fieldNames = array_keys(current($lines));
48 | return CsvUtility::csvValues($fieldNames);
49 | }
50 | }
51 |
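52 | // Usage example: arrayToCsv([['qid' => 1, 'url' => '/foo'], ['qid' => 2, 'url' => '/bar']])
53 | // returns a header line built from the keys of the first record, followed by one
54 | // CSV line per record, all joined with CRLF.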
--------------------------------------------------------------------------------
/Classes/Writer/FileWriter/CsvWriter/CsvWriterInterface.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v12.0.0
24 | */
25 | interface CsvWriterInterface
26 | {
27 | public function arrayToCsv(array $records): string;
28 | }
29 |
--------------------------------------------------------------------------------
/Configuration/Backend/Modules.php:
--------------------------------------------------------------------------------
1 |
7 | *
8 | * This file is part of the TYPO3 Crawler Extension.
9 | *
10 | * It is free software; you can redistribute it and/or modify it under
11 | * the terms of the GNU General Public License, either version 2
12 | * of the License, or any later version.
13 | *
14 | * For the full copyright and license information, please read the
15 | * LICENSE.txt file that was distributed with this source code.
16 | *
17 | * The TYPO3 project - inspiring people to share!
18 | */
19 |
20 | use AOE\Crawler\Controller\Backend\BackendModuleCrawlerLogController;
21 | use AOE\Crawler\Controller\Backend\BackendModuleCrawlerProcessController;
22 | use AOE\Crawler\Controller\Backend\BackendModuleStartCrawlingController;
23 |
24 | return [
25 | 'web_site_crawler' => [
26 | 'parent' => 'web',
27 | 'position' => [
28 | 'after' => 'web_info',
29 | ],
30 | 'access' => 'user',
31 | 'workspaces' => 'live',
32 | 'path' => '/module/page/crawler',
33 | 'labels' => 'LLL:EXT:crawler/Resources/Private/Language/Backend.xlf',
34 | 'extensionName' => 'Crawler',
35 | 'iconIdentifier' => 'tx-crawler-icon',
36 | 'routes' => [
37 | '_default' => [
38 | 'target' => BackendModuleCrawlerProcessController::class . '::handleRequest',
39 | ],
40 | ],
41 | ],
42 | 'web_site_crawler_start' => [
43 | 'parent' => 'web_site_crawler',
44 | 'access' => 'user',
45 | 'path' => '/module/page/crawler/start',
46 | 'iconIdentifier' => 'crawler-start',
47 | 'labels' => [
48 | 'title' => 'Start',
49 | ],
50 | 'routes' => [
51 | '_default' => [
52 | 'target' => BackendModuleStartCrawlingController::class . '::handleRequest',
53 | ],
54 | ],
55 | ],
56 | 'web_site_crawler_process' => [
57 | 'parent' => 'web_site_crawler',
58 | 'access' => 'user',
59 | 'path' => '/module/page/crawler/process',
60 | 'iconIdentifier' => 'crawler-process',
61 | 'labels' => [
62 | 'title' => 'Process',
63 | ],
64 | 'routes' => [
65 | '_default' => [
66 | 'target' => BackendModuleCrawlerProcessController::class . '::handleRequest',
67 | ],
68 | ],
69 | ],
70 | 'web_site_crawler_log' => [
71 | 'parent' => 'web_site_crawler',
72 | 'access' => 'user',
73 | 'path' => '/module/page/crawler/log',
74 | 'iconIdentifier' => 'crawler-log',
75 | 'labels' => [
76 | 'title' => 'Log',
77 | ],
78 | 'routes' => [
79 | '_default' => [
80 | 'target' => BackendModuleCrawlerLogController::class . '::handleRequest',
81 | ],
82 | ],
83 | ],
84 | ];
85 |
--------------------------------------------------------------------------------
/Configuration/Extbase/Persistence/Classes.php:
--------------------------------------------------------------------------------
1 | <?php
2 |
3 | declare(strict_types=1);
4 |
5 | return [
6 | AOE\Crawler\Domain\Model\Configuration::class => [
7 | 'tableName' => 'tx_crawler_configuration',
8 | ],
9 | AOE\Crawler\Domain\Model\Process::class => [
10 | 'tableName' => 'tx_crawler_process',
11 | ],
12 | AOE\Crawler\Domain\Model\Queue::class => [
13 | 'tableName' => 'tx_crawler_queue',
14 | ],
15 | ];
16 |
--------------------------------------------------------------------------------
/Configuration/Icons.php:
--------------------------------------------------------------------------------
1 | [
9 | 'provider' => SvgIconProvider::class,
10 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_configuration.svg',
11 | ],
12 | 'tx-crawler-start' => [
13 | 'provider' => SvgIconProvider::class,
14 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_start.svg',
15 | ],
16 | 'tx-crawler-stop' => [
17 | 'provider' => SvgIconProvider::class,
18 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_stop.svg',
19 | ],
20 | 'tx-crawler-icon' => [
21 | 'provider' => SvgIconProvider::class,
22 | 'source' => 'EXT:crawler/Resources/Public/Icons/Extension.svg',
23 | ],
24 | ];
25 |
--------------------------------------------------------------------------------
/Configuration/RequestMiddlewares.php:
--------------------------------------------------------------------------------
1 | <?php
2 |
3 | declare(strict_types=1);
4 |
5 | use AOE\Crawler\Middleware\CrawlerInitialization;
6 | use AOE\Crawler\Middleware\FrontendUserAuthenticator;
7 |
8 | return [
9 | 'frontend' => [
10 | 'aoe/crawler/authentication' => [
11 | 'target' => FrontendUserAuthenticator::class,
12 | 'after' => ['typo3/cms-frontend/authentication'],
13 | 'before' => ['typo3/cms-frontend/page-resolver'],
14 | ],
15 | 'aoe/crawler/initialization' => [
16 | 'target' => CrawlerInitialization::class,
17 | 'before' => ['typo3/cms-frontend/prepare-tsfe-rendering'],
18 | ],
19 | ],
20 | ];
21 |
--------------------------------------------------------------------------------
/Documentation/Configuration/ConfigurationRecords/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _backend-configuration-record:
4 |
5 | =====================
6 | Configuration records
7 | =====================
8 |
9 | Formerly configuration was done by using pageTS (see below). This is
10 | still possible (fully backwards compatible) but not recommended.
11 | Instead of writing pageTS, simply create a configuration record (table:
12 | ``tx_crawler_configuration``) and put it on the topmost page of the
13 | pagetree you want to affect with this configuration.
14 |
15 | The fields in these records are related to the pageTS keys described
16 | below.
17 |
18 | .. _backend-configuration-record-fields:
19 |
20 | Fields and their pageTS equivalents
21 | ===================================
22 |
23 | .. _backend-configuration-record-general:
24 |
25 | General
26 | -------
27 |
28 | .. figure:: /Images/backend_configurationrecord_general.png
29 | :alt: Backend configuration record: General
30 |
31 | Backend configuration record: General
32 |
33 | Name
34 | Corresponds to the "key" part in the pageTS setup e.g.
35 | :typoscript:`tx_crawler.crawlerCfg.paramSets.myConfigurationKeyName`
36 |
37 | Protocol for crawling
38 | Force HTTP, HTTPS or keep the configured protocol
39 |
40 | Processing instruction filter
41 | List of processing instructions. See also:
42 | :ref:`paramSets.[key].procInstrFilter `
43 |
44 | Base URL
45 | Set baseUrl (most likely the same as the entry point configured in your
46 | site configuration)
47 |
48 | Pids only
49 | List of Page Ids to limit this configuration to. See also:
50 | :ref:`paramSets.[key].pidsOnly `
51 |
52 | Exclude pages
53 | Comma-separated list of page ids which should not be crawled.
54 | You can exclude pages recursively by adding `uid`+`depth`, e.g. `6+3`,
55 | which ensures that page 6 and all pages up to 3 levels below it
56 | will not be crawled.
57 |
58 | Configuration
59 | Parameter configuration. The values of GET variables follow a
60 | special syntax. See also: :ref:`paramSets.[key]
61 | `
62 |
63 | Processing instruction parameters
64 | Options for processing instructions. Will be defined in the respective third
65 | party modules. See also: :ref:`paramSets.[key].procInstrParams
66 | `
67 |
68 | Crawl with FE user groups
69 | User groups to set for the request. See also:
70 | :ref:`paramSets.[key].userGroups ` and the hint in :ref:`create-crawler-configuration`
71 |
72 | .. _backend-configuration-record-access:
73 |
74 | Access
75 | ------
76 |
77 | .. figure:: /Images/backend_configurationrecord_access.png
78 | :alt: Backend configuration record: Access
79 |
80 | Backend configuration record: Access
81 |
82 | Hide
83 | If activated the configuration record is not taken into account.
84 |
85 | Restrict access to
86 | Restricts access to this configuration record to selected backend user
87 | groups. Empty means no restriction is set.
88 |
--------------------------------------------------------------------------------
/Documentation/Configuration/Examples/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _examples:
4 |
5 | ========
6 | Examples
7 | ========
8 |
9 | .. toctree::
10 | :maxdepth: 5
11 | :titlesonly:
12 | :glob:
13 |
14 | News/Index
15 |
--------------------------------------------------------------------------------
/Documentation/Configuration/Examples/News/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _example-configuration-news:
4 |
5 | ========
6 | EXT:news
7 | ========
8 |
9 | The news extension is one of the most used extensions in the TYPO3 CMS. This
10 | configuration assumes a page tree looking similar to this:
11 |
12 | .. figure:: /Images/ext_news_pagetree.png
13 | :alt: Example Pagetree of EXT:news setup
14 |
15 | Example Pagetree of EXT:news setup
16 |
17 | If you want to have a Crawler Configuration that matches this, you can add the
18 | following to the :guilabel:`PageTS` for PageId `56`.
19 |
20 | .. literalinclude:: _page.tsconfig
21 | :caption: packages/my_extension/Configuration/Sets/MySet/page.tsconfig
22 |
23 | Now you can add the News detail-view pages to the crawler queue and have them in
24 | the cache and the `indexed_search` index if you are using that.
25 |
26 | .. _example-configuration-news-category:
27 |
28 | Respecting Categories in News
29 | =============================
30 |
31 | On some installations news is configured in such a way that news of category A
32 | have their detail view on one page and news of category B have their detail view on
33 | another page. In this case it would still be possible to view news of category A on
34 | the detail page for category B (example.com/detail-page-for-category-B/news-of-category-A).
35 | That means that each news article would be crawled twice - once on the detail page
36 | for category A and once on the detail page for category B. It is possible to use a
37 | PSR-14 event with news to prevent this.
38 |
39 | On both detail pages, include this TypoScript setup:
40 |
41 | .. literalinclude:: _setup.typoscript
42 | :caption: packages/my_extension/Configuration/Sets/MySet/setup.typoscript
43 |
44 | and register an event listener in your site package.
45 |
46 | .. literalinclude:: _services.yaml
47 | :caption: packages/my_extension/Configuration/Services.yaml
48 |
49 | .. literalinclude:: _NewsDetailEventListener.php
50 | :caption: packages/my_extension/Classes/EventListeners/NewsDetailEventListener.php
51 |
52 | .. warning::
53 |
54 | Note that this does more than just prevent articles from being indexed twice. It
55 | actually prevents articles from being displayed on a page that is supposed to show
56 | only articles of a certain category!
57 |
--------------------------------------------------------------------------------
/Documentation/Configuration/Examples/News/_NewsDetailEventListener.php:
--------------------------------------------------------------------------------
1 | <?php
2 |
3 | declare(strict_types=1);
4 |
5 | namespace MyVendor\MyExtension\EventListeners;
6 |
7 | use GeorgRinger\News\Event\NewsDetailActionEvent;
8 |
9 | final class NewsDetailEventListener
10 | {
11 | public function __invoke(NewsDetailActionEvent $event): void
12 | {
13 | $assignedValues = $event->getAssignedValues();
14 | $newsItem = $assignedValues['newsItem'];
15 | $demand = $assignedValues['demand'];
16 | $settings = $assignedValues['settings'];
17 |
18 | if ($newsItem !== null) {
19 | $demandedCategories = $demand->getCategories();
20 | $itemCategories = $newsItem->getCategories()->toArray();
21 | $itemCategoryIds = \array_map(function ($category) {
22 | return (string) $category->getUid();
23 | }, $itemCategories);
24 |
25 | if (count($demandedCategories) > 0 && !$this::itemMatchesCategoryDemand(
26 | $settings['categoryConjunction'],
27 | $itemCategoryIds,
28 | $demandedCategories
29 | )) {
30 | $assignedValues['newsItem'] = null;
31 | $event->setAssignedValues($assignedValues);
32 | }
33 | }
34 | }
35 |
36 | protected static function itemMatchesCategoryDemand(
37 | string $categoryConjunction,
38 | array $itemCategoryIds,
39 | array $demandedCategories
40 | ): bool {
41 | $numOfDemandedCategories = \count($demandedCategories);
42 | $intersection = \array_intersect($itemCategoryIds, $demandedCategories);
43 | $numOfCommonItems = \count($intersection);
44 |
45 | switch ($categoryConjunction) {
46 | case 'AND':
47 | return $numOfCommonItems === $numOfDemandedCategories;
48 | case 'OR':
49 | return $numOfCommonItems > 0;
50 | case 'NOTAND':
51 | return $numOfCommonItems < $numOfDemandedCategories;
52 | case 'NOTOR':
53 | return $numOfCommonItems === 0;
54 | }
55 | return true;
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/Documentation/Configuration/Examples/News/_page.tsconfig:
--------------------------------------------------------------------------------
1 | tx_crawler.crawlerCfg.paramSets {
2 | tx_news = &tx_news_pi1[controller]=News&tx_news_pi1[action]=detail&tx_news_pi1[news]=[_TABLE:tx_news_domain_model_news; _PID:58; _WHERE: hidden = 0]
3 | tx_news {
4 | pidsOnly = 57
5 | }
6 | }
7 |
8 | # _PID:58 is the Folder where news records are stored.
9 | # pidsOnly = 57 is the detail-view PageId.
10 |
--------------------------------------------------------------------------------
/Documentation/Configuration/Examples/News/_services.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | MyVendor\MyExtension\EventListeners\NewsDetailEventListener:
3 | tags:
4 | - name: event.listener
5 | identifier: 'myNewsDetailListener'
6 | event: GeorgRinger\News\Event\NewsDetailActionEvent
7 |
--------------------------------------------------------------------------------
/Documentation/Configuration/Examples/News/_setup.typoscript:
--------------------------------------------------------------------------------
1 | plugin.tx_news.settings {
2 | # categories and categoryConjunction are not considered in detail view, so they must be overridden
3 | overrideFlexformSettingsIfEmpty = cropMaxCharacters,dateField,timeRestriction,archiveRestriction,orderBy,orderDirection,backPid,listPid,startingpoint,recursive,list.paginate.itemsPerPage,list.paginate.templatePath,categories,categoryConjunction
4 | # see the news extension for possible values of categoryConjunction
5 | categoryConjunction = AND
6 | categories =
7 | detail.errorHandling = pageNotFoundHandler
8 | }
9 |
--------------------------------------------------------------------------------
/Documentation/Configuration/ExtensionManagerConfiguration/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _extension-manager-configuration:
4 |
5 | ===============================
6 | Extension Manager Configuration
7 | ===============================
8 |
9 | A lot of options were added to the extension manager configuration
10 | that allow you to adjust settings and enable new crawler features:
11 |
12 | .. figure:: /Images/backend_configuration_settings.png
13 | :alt: Backend configuration: Settings
14 |
15 | Backend configuration: Settings
16 |
17 | .. figure:: /Images/backend_configuration_queue.png
18 | :alt: Backend configuration: Queue
19 |
20 | Backend configuration: Queue
21 |
--------------------------------------------------------------------------------
/Documentation/Configuration/HttpAuthentication/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _http-authentication:
4 |
5 | ===================
6 | HTTP Authentication
7 | ===================
8 |
9 | If you want to use HTTP Authentication, you need to configure your base URL
10 | to contain `user:pass`:
11 |
12 | .. code-block:: text
13 |
14 | https://user:pass@www.mydomain.com/
15 |
--------------------------------------------------------------------------------
/Documentation/Configuration/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _configuration:
4 |
5 | =============
6 | Configuration
7 | =============
8 |
9 | .. toctree::
10 | :maxdepth: 5
11 | :titlesonly:
12 | :glob:
13 |
14 | ExtensionManagerConfiguration/Index
15 | ConfigurationRecords/Index
16 | PageTsconfigReference(txCrawlercrawlercfg)/Index
17 | HttpAuthentication/Index
18 | Examples/Index
19 |
--------------------------------------------------------------------------------
/Documentation/Configuration/PageTsconfigReference(txCrawlercrawlercfg)/_page.tsconfig:
--------------------------------------------------------------------------------
1 | tx_crawler.crawlerCfg.paramSets.test = &L=[0-3]
2 | tx_crawler.crawlerCfg.paramSets.test {
3 | procInstrFilter = tx_indexedsearch_reindex
4 | pidsOnly = 1,5,13,55
5 | userGroups = 1
6 | force_ssl = 1
7 | }
8 |
--------------------------------------------------------------------------------
/Documentation/Configuration/PageTsconfigReference(txCrawlercrawlercfg)/_paramSets_page.tsconfig:
--------------------------------------------------------------------------------
1 | tx_crawler.crawlerCfg.paramSets {
2 | myConfigurationKeyName = &tx_myext[items]=[_TABLE:tt_myext_items;_PID:15;_WHERE: hidden = 0]
3 | myConfigurationKeyName {
4 | pidsOnly = 13
5 | procInstrFilter = tx_indexedsearch_reindex
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default.txt:
--------------------------------------------------------------------------------
1 | 38 entries found for processing. (Use "mode" to decide action):
2 |
3 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/overview
4 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/rich-text
5 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/headers
6 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/bullet-list
7 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser
8 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-and-icon
9 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-in-columns
10 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/list-group
11 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/panel
12 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/table
13 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/quote
14 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/media/audio
15 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/media/text-and-images
16 | ...
17 | [10-04-20 10:36] https://crawler-devbox.ddev.site/content-examples/and-more/frames
18 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default_mode_exec.txt:
--------------------------------------------------------------------------------
1 | $ bin/typo3 crawler:buildQueue 6 default --depth 2 --mode exec
2 | https://crawler-devbox.ddev.site/content-examples/overview
3 | https://crawler-devbox.ddev.site/content-examples/text/rich-text
4 | https://crawler-devbox.ddev.site/content-examples/text/headers
5 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list
6 | https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser
7 | https://crawler-devbox.ddev.site/content-examples/text/text-and-icon
8 | https://crawler-devbox.ddev.site/content-examples/text/text-in-columns
9 | https://crawler-devbox.ddev.site/content-examples/text/list-group
10 | https://crawler-devbox.ddev.site/content-examples/text/panel
11 | ...
12 | Processing
13 |
14 | https://crawler-devbox.ddev.site/content-examples/overview () =>
15 |
16 | OK:
17 | User Groups:
18 |
19 | https://crawler-devbox.ddev.site/content-examples/text/rich-text () =>
20 |
21 | OK:
22 | User Groups:
23 |
24 | https://crawler-devbox.ddev.site/content-examples/text/headers () =>
25 |
26 | OK:
27 | User Groups:
28 |
29 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list () =>
30 |
31 | OK:
32 | User Groups:
33 | ...
34 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default_mode_url.txt:
--------------------------------------------------------------------------------
1 | $ bin/typo3 crawler:buildQueue 6 default --depth 2 --mode url
2 | https://crawler-devbox.ddev.site/content-examples/overview
3 | https://crawler-devbox.ddev.site/content-examples/text/rich-text
4 | https://crawler-devbox.ddev.site/content-examples/text/headers
5 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list
6 | https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser
7 | https://crawler-devbox.ddev.site/content-examples/text/text-and-icon
8 | https://crawler-devbox.ddev.site/content-examples/text/text-in-columns
9 | https://crawler-devbox.ddev.site/content-examples/text/list-group
10 | https://crawler-devbox.ddev.site/content-examples/text/panel
11 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/ExecutingQueueWithCron-job/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _with-crown:
4 |
5 | =============================
6 | Executing queue with cron-job
7 | =============================
8 |
9 | A "cron-job" refers to a script that runs on the server with time
10 | intervals.
11 |
12 | For this to become reality you must ideally have a cron-job set up.
13 | This assumes you are running on Unix architecture of some sort. The
14 | crontab is often edited by :bash:`crontab -e` and you should insert a line
15 | like this:
16 |
17 | .. code-block:: plaintext
18 |
19 | * * * * * vendor/bin/typo3 crawler:processQueue > /dev/null
20 |
21 | This will run the script every minute. You should try to run the
22 | script on the command line first to make sure it runs without any
23 | errors. If it doesn't output anything it was successful.
24 |
25 | You will need to have a user called `_cli_`, and the PHP CLI binary must be
26 | installed, e.g. in :path:`/usr/bin/`.
27 |
28 | The user `_cli_` is created on demand by the framework at the first
29 | command-line call if it does not exist.
30 |
31 | Make sure that the user `_cli_` has admin rights.
32 |
33 | In the :guilabel:`CLI status` menu of the :guilabel:`Site Crawler` info module
34 | you can see the status:
35 |
36 | .. figure:: /Images/backend_processlist.png
37 | :alt: Status page in the backend
38 |
39 | Status page in the backend
40 |
41 | This is how it looks just after you ran the script. (You can also see
42 | the full path to the script at the bottom; this is the path to the
43 | script as you should use it on the command line / in the crontab.)
44 |
45 | If the cron-script stalls there is a default delay of 1 hour before a
46 | new process will declare the old one dead and run a new one. If a
47 | cron-script takes more than 1 minute and thereby overlaps the next
48 | process, the next process will NOT start if it sees that the "lock-
49 | file" exists (unless that hour has passed).
50 |
51 | The reason why it works like this is to make sure that overlapping
52 | calls to the crawler CLI script will not run parallel processes. So
53 | the second call will just exit if it finds in the status file that the
54 | process is already running. But of course a crashed script will fail
55 | to set the status to "end" and hence this situation can occur.
56 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _executing-the-queue-label:
4 |
5 | ===================
6 | Executing the queue
7 | ===================
8 |
9 | The idea of the queue is that a large number of tasks can be submitted
10 | to the queue and performed over a longer period of time. This could be
11 | interesting for several reasons:
12 |
13 | - To spread server load over time.
14 |
15 | - To time the requests for nightly processing.
16 |
17 | - And simply to avoid PHP's `max_execution_time` limiting processing
18 | to 30 seconds!
19 |
20 |
21 | .. toctree::
22 | :maxdepth: 5
23 | :titlesonly:
24 | :glob:
25 |
26 | RunningViaCommandController/Index
27 | ExecutingQueueWithCron-job/Index
28 | RunViaBackend/Index
29 | BuildingAndExecutingQueueRightAway(fromCli)/Index
30 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/RunViaBackend/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _run-backend:
4 |
5 | ===============
6 | Run via backend
7 | ===============
8 |
9 | To process the queue you must either set up a cron-job on your server
10 | or use the backend to process the queue:
11 |
12 | .. figure:: /Images/backend_processlist_add_process.png
13 | :alt: Process the queue via backend
14 |
15 | Process the queue via backend
16 |
17 | You can also (re-)crawl single URLs manually from within the :guilabel:`Crawler
18 | log` view in the info module:
19 |
20 | .. figure:: /Images/backend_crawlerlog_recrawl.png
21 | :alt: Crawl single URLs via backend
22 |
23 | Crawl single URLs via backend
24 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/RunningViaCommandController/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _command-controller:
4 |
5 | ==========================
6 | Run via command controller
7 | ==========================
8 |
9 | .. _command-controller-buildqueue:
10 |
11 | Create queue
12 | ------------
13 |
14 | .. code-block:: bash
15 | :caption: replace vendor/bin/typo3 with your own cli runner
16 |
17 | $ vendor/bin/typo3 crawler:buildQueue [--depth <depth>] [--number <number>] [--mode <mode>]
18 |
19 | .. _command-controller-processqueue:
20 |
21 | Run queue
22 | ---------
23 |
24 | .. code-block:: bash
25 | :caption: replace vendor/bin/typo3 with your own cli runner
26 |
27 | $ vendor/bin/typo3 crawler:processQueue [--amount <amount>] [--sleeptime <sleeptime>] [--sleepafter <sleepafter>]
28 |
29 | .. _command-controller-flushqueue:
30 |
31 | Flush queue
32 | -----------
33 |
34 | .. code-block:: bash
35 | :caption: replace vendor/bin/typo3 with your own cli runner
36 |
37 | $ vendor/bin/typo3 crawler:flushQueue
38 |
--------------------------------------------------------------------------------
/Documentation/Features/AutomaticAddPagesToQueue/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _add-to-queue:
4 |
5 | ============================
6 | Automatic add pages to Queue
7 | ============================
8 |
9 | .. versionadded:: 9.1.0
10 |
11 | .. _add-to-queue-edit:
12 |
13 | Edit Pages
14 | ----------
15 |
16 | With this feature, pages are automatically added to the crawler queue
17 | when you edit content on them. If the change is made within a workspace,
18 | the page is not added to the queue before it is published.
19 |
20 | This functionality gives you the advantage that you do not need to keep track
21 | of which pages you have edited; they are handled automatically by the next crawler
22 | process task, see :ref:`executing-the-queue-label`. This ensures that
23 | your cache or e.g. search index is always up to date and the end users will see
24 | the most current content as soon as possible.
25 |
26 | .. _add-to-queue-cache:
27 |
28 | Clear Page Single Cache
29 | -----------------------
30 |
31 | As editing and clearing the page cache use the same DataHandler hooks,
32 | we get an additional feature for free. When you clear the page cache for a specific
33 | page, it will also be added automatically to the crawler queue. Again, this will
34 | be processed during the next crawler process.
35 |
36 | .. figure:: /Images/backend_clear_cache.png
37 | :alt: Clearing the page cache
38 |
39 | Clearing the page cache
40 |
41 | .. figure:: /Images/backend_clear_cache_queue.png
42 | :alt: Page is added to the crawler queue
43 |
44 | Page is added to the crawler queue
45 |
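46 | .. _add-to-queue-programmatically:
47 |
48 | Adding pages programmatically
49 | -----------------------------
50 |
51 | Internally, the DataHandler hook uses `AOE\Crawler\Service\QueueService` to add
52 | the affected pages to the queue. If you want to queue a page from your own code,
53 | a minimal sketch could look like the following. Note that `QueueService` is marked
54 | `@internal`, so treat this as an illustration rather than a stable API; the page
55 | uid `42` is just an example.
56 |
57 | .. code-block:: php
58 |    :caption: Sketch, e.g. inside a hook or command of your site package
59 |
60 |    use AOE\Crawler\Controller\CrawlerController;
61 |    use AOE\Crawler\Service\QueueService;
62 |    use TYPO3\CMS\Core\Utility\GeneralUtility;
63 |
64 |    $queueService = GeneralUtility::makeInstance(QueueService::class);
65 |    // Depending on your setup the CrawlerController may already be injected via DI;
66 |    // calling injectCrawlerController() explicitly is shown for completeness.
67 |    $queueService->injectCrawlerController(GeneralUtility::makeInstance(CrawlerController::class));
68 |    $queueService->addPageToQueue(42);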
--------------------------------------------------------------------------------
/Documentation/Features/Events/_AfterQueueItemAddedEventListener.php:
--------------------------------------------------------------------------------
1 | $afterUrl());
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/Documentation/Features/Events/_AfterUrlCrawledEventListener_services.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | AOE\Crawler\EventListener\AfterUrlCrawledEventListener:
3 | tags:
4 | - name: event.listener
5 | identifier: 'ext-extension-key/AfterUrlCrawledEventListener'
6 | event: AOE\Crawler\Event\AfterUrlCrawledEvent
7 |
--------------------------------------------------------------------------------
/Documentation/Features/Events/_BeforeQueueItemAddedEventListener.php:
--------------------------------------------------------------------------------
1 | getReasonText();
14 | // You can implement different logic based on reason, GUI or CLI
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Documentation/Features/Events/_InvokeQueueChangeEvent_services.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | AOE\Crawler\EventListener\InvokeQueueChangeEvent:
3 | tags:
4 | - name: event.listener
5 | identifier: 'ext-extension-key/InvokeQueueChangeEventListener'
6 | event: AOE\Crawler\Event\InvokeQueueChangeEvent
7 |
--------------------------------------------------------------------------------
/Documentation/Features/Events/_ModifySkipPageEventListener.php:
--------------------------------------------------------------------------------
1 | <?php
2 |
3 | declare(strict_types=1);
4 |
5 | namespace AOE\Crawler\EventListener;
6 |
7 | use AOE\Crawler\Event\ModifySkipPageEvent;
8 |
9 | class ModifySkipPageEventListener
10 | {
11 |     public function __invoke(ModifySkipPageEvent $modifySkipPageEvent): void
12 |     {
13 |         if ($modifySkipPageEvent->getPageRow()['uid'] === 42) {
14 |             $modifySkipPageEvent->setSkipped('Page with uid "42" is excluded by ModifySkipPageEvent');
15 |         }
16 |     }
17 | }
18 |
--------------------------------------------------------------------------------
/Documentation/Features/Events/_ModifySkipPageEventListener_services.yaml:
--------------------------------------------------------------------------------
1 | services:
2 | AOE\Crawler\EventListener\ModifySkipPageEventListener:
3 | tags:
4 | - name: event.listener
5 | identifier: 'ext-extension-key/ModifySkipPageEventListener'
6 | event: AOE\Crawler\Event\ModifySkipPageEvent
7 |
--------------------------------------------------------------------------------
/Documentation/Features/Hooks/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _hooks:
4 |
5 | =====
6 | Hooks
7 | =====
8 |
9 | Register the following hooks in :file:`ext_localconf.php` of your extension.
10 |
11 | .. _hooks-excludeDoktype:
12 |
13 | excludeDoktype Hook
14 | ===================
15 |
16 | By adding doktype IDs to the following array you can exclude those page types
17 | from being crawled:
18 |
19 | .. code-block:: php
20 | :caption: packages/my_extension/ext_localconf.php
21 |
22 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'][] = <doktypeId>;
23 |
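For example, to keep sys folders and spacer pages out of the crawler queue (254 and 199 are the doktype IDs the TYPO3 core uses for these page types):

.. code-block:: php
   :caption: packages/my_extension/ext_localconf.php

   // Exclude sys folders (doktype 254) and spacer pages (doktype 199) from crawling
   $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'][] = 254;
   $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'][] = 199;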
24 | .. _hooks-pageVeto:
25 |
26 | pageVeto Hook
27 | =============
28 |
29 | .. deprecated:: 11.0.0
30 | Will be removed in 13.0, please migrate to the PSR-14 Event :ref:`psr14-modify-skip-page-event`!
31 |
32 | You can also decide in an individual user function whether a page should
33 | not be crawled. Register your function here:
34 |
35 | .. code-block:: php
36 | :caption: packages/my_extension/ext_localconf.php
37 |
38 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'][] = MyVendor\MyExtension\Hooks\Crawler\PageVeto::class . '->excludePage';
39 |
40 | .. literalinclude:: _PageVeto.php
41 | :caption: packages/my_extension/Classes/Hooks/Crawler/PageVeto.php
42 |
--------------------------------------------------------------------------------
/Documentation/Features/Hooks/_PageVeto.php:
--------------------------------------------------------------------------------
1 | applicationData['tx_crawler']['success']['tx_staticpub'] = true;
33 |
34 |
--------------------------------------------------------------------------------
/Documentation/Features/PriorityCrawling/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _priority-crawling:
4 |
5 | =================
6 | Priority Crawling
7 | =================
8 |
9 | .. versionadded:: 9.1.0
10 |
11 | Some websites have quite a large number of pages, and some pages are logically
12 | more important than others, e.g. the start, support or product pages.
13 | These important pages are also the pages where we want the best caching
14 | and performance, as they will most likely be the pages with the most changes
15 | and the most traffic.
16 |
17 | With TYPO3 10 LTS, `sysext/seo` introduced, among other things, the
18 | `sitemap_priority` field, which is used to generate an SEO-optimised sitemap.xml
19 | in which page priorities are listed as well. The more important a page is for
20 | you and the end-user, the higher its priority will typically be.
21 |
22 | The Crawler can benefit from this logic as well. On a website with, let us
23 | say, 10,000 pages, not every page is equally important. Therefore the
24 | functionality of the crawler has been changed to take the value of this field,
25 | ranging from 0.0 to 1.0, into consideration when processing the crawler queue.
26 | This means that a page with a high priority in your sitemap will also be
27 | crawled first when a new
28 | crawler process is started.
29 |
30 | This ensures that we always crawl the pages that have the highest importance
31 | to you and your end-users, based on your sitemap priority. We chose to reuse
32 | this field so that editors do not have to do more or less the same work twice.
33 |
34 | If you don't want to use this functionality, that is fine. Simply ignore the
35 | options that `sysext/seo` gives you; all pages will then get the default
36 | priority of 0.5 and therefore do not influence the processing order, as every
37 | page has the same priority.
38 |
39 | The existing :guilabel:`SEO` tab will be used to set priorities when editing
40 | pages.
41 |
42 | .. image:: /Images/backend_crawler_seo_v10.png
43 |
44 | .. figure:: /Images/backend_crawler_seo_priority_v10.png
45 | :alt: The SEO tab will contain the sitemap_priority field
46 |
47 | The SEO tab will contain the sitemap_priority field
48 |
--------------------------------------------------------------------------------
/Documentation/Images/backend_addfromcontextmenu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_addfromcontextmenu.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_clear_cache.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_clear_cache.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_clear_cache_queue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_clear_cache_queue.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_configuration_deployment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configuration_deployment.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_configuration_queue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configuration_queue.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_configuration_settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configuration_settings.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_configurationrecord_access.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configurationrecord_access.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_configurationrecord_general.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_configurationrecord_general.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_crawler_seo_priority_v10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_crawler_seo_priority_v10.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_crawler_seo_v10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_crawler_seo_v10.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_crawlerlog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_crawlerlog.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_crawlerlog_recrawl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_crawlerlog_recrawl.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_info_php_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_info_php_error.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_pendingurls.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_pendingurls.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_php_path_configuration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_php_path_configuration.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_processlist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_processlist.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_processlist_add_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_processlist_add_process.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_recrawl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_recrawl.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_scheduler_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_scheduler_overview.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_scheduler_processqueue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_scheduler_processqueue.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_scheduler_record.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_scheduler_record.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_startcrawling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_startcrawling.png
--------------------------------------------------------------------------------
/Documentation/Images/backend_startnewprocess.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/backend_startnewprocess.png
--------------------------------------------------------------------------------
/Documentation/Images/cli_addtoque.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/cli_addtoque.png
--------------------------------------------------------------------------------
/Documentation/Images/cli_processque.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/cli_processque.png
--------------------------------------------------------------------------------
/Documentation/Images/crawler_settings_processLimit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/crawler_settings_processLimit.png
--------------------------------------------------------------------------------
/Documentation/Images/ext_news_pagetree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tomasnorre/crawler/89e9cd198019dff5cddaf6afd97e5ebb05ba5bad/Documentation/Images/ext_news_pagetree.png
--------------------------------------------------------------------------------
/Documentation/Includes.rst.txt:
--------------------------------------------------------------------------------
1 | .. You can put central messages to display on all pages here
2 |
--------------------------------------------------------------------------------
/Documentation/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _start:
4 |
5 | ======================
6 | Site Crawler Extension
7 | ======================
8 |
9 | :Extension key:
10 | crawler
11 |
12 | :Package name:
13 | tomasnorre/crawler
14 |
15 | :Version:
16 | |release|
17 |
18 | :Language:
19 | en
20 |
21 | :Author:
22 | Tomas Norre Mikkelsen
23 |
24 | :Copyright:
25 | 2005-2021 AOE GmbH, since 2021 Tomas Norre Mikkelsen
26 |
27 | :License:
28 | This document is published under the `Open Content License
29 | `_.
30 |
31 | :Rendered:
32 | |today|
33 |
34 | ----
35 |
36 | Libraries and scripts for crawling the TYPO3 page tree. Used for re-caching, re-indexing, publishing applications etc.
37 |
38 | ----
39 |
40 | **Table of Contents:**
41 |
42 | .. toctree::
43 | :maxdepth: 2
44 | :titlesonly:
45 |
46 | Introduction/Index
47 | Configuration/Index
48 | ExecutingTheQueue/Index
49 | Scheduler/Index
50 | UseCases/Index
51 | Features/Index
52 | Troubleshooting/Index
53 | Links/Links
54 |
55 | .. toctree::
56 | :hidden:
57 |
58 | Sitemap
59 |
--------------------------------------------------------------------------------
/Documentation/Introduction/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _introduction:
4 |
5 | ============
6 | Introduction
7 | ============
8 |
9 | .. _introduction-what:
10 |
11 | What does it do?
12 | ================
13 |
14 | The TYPO3 Crawler is an extension which, from both the TYPO3 backend and
15 | from the CLI, provides tools that help you with your cache and e.g. your
16 | search index.
17 |
18 | The Crawler dispatches several PSR-14 events that you can use to "hook" into
19 | its processing if you have specific requirements for your site.
20 |
21 | See more :ref:`psr14-modify-skip-page-event`.
22 |
23 | It features an API that other extensions can plug into. One example of this
24 | is "indexed\_search", which uses the crawler to index content defined by
25 | its indexing configurations. Other extensions supporting it are
26 | "staticpub" (publishing to static pages) or "cachemgm" (allows
27 | recaching of pages).
28 |
29 | The URL requests are specially designed to request TYPO3 frontends
30 | with special processing instructions. Each request sends a TYPO3-specific
31 | header in the GET request which identifies a special action.
32 | For instance the action requested could be to publish the URL to a
33 | static file, or it could be to index its content or re-cache the
34 | page. These processing instructions are also defined by third-party
35 | extensions (and Indexed Search is one of them). In this way a
36 | processing instruction can instruct the frontend to perform an action
37 | (like indexing, publishing etc.) which cannot be done with a request
38 | from outside.
39 |
40 | .. _introduction-screenshots:
41 |
42 | Screenshots
43 | ===========
44 |
45 | The extension provides a backend module which displays the queue and log and
46 | allows execution and status check of the "cronscript" from the backend for
47 | testing purposes.
48 |
49 | .. figure:: /Images/backend_processlist.png
50 |
51 | CLI status display
52 |
53 | CLI = Command Line Interface = shell script = cron script
54 |
55 | .. figure:: /Images/backend_crawlerlog.png
56 |
57 | Crawler queue (before processing) / log (after processing)
58 |
59 | .. figure:: /Images/backend_pendingurls.png
60 |
61 | Interface for submitting a batch of URLs to be crawled
62 |
63 | The parameter combinations are programmable through Page TSconfig or
64 | configuration records.
65 |
--------------------------------------------------------------------------------
/Documentation/Links/Links.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _links:
4 |
5 | =====
6 | Links
7 | =====
8 |
9 | :TER:
10 | https://extensions.typo3.org/extension/crawler/
11 |
12 | :Bug Tracker:
13 | https://github.com/tomasnorre/crawler/issues
14 |
15 | :Git Repository:
16 | https://github.com/tomasnorre/crawler.git
17 |
--------------------------------------------------------------------------------
/Documentation/Scheduler/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _scheduler:
4 |
5 | =========
6 | Scheduler
7 | =========
8 |
9 |
10 | .. toctree::
11 | :maxdepth: 5
12 | :titlesonly:
13 | :glob:
14 |
15 |
16 | As seen in :ref:`executing-the-queue-label` you can execute the queue in
17 | multiple ways, but it's no fun doing that manually all the time.
18 |
19 | With the Crawler you have the possibility to add scheduler tasks to be
20 | executed at a given time. The Crawler commands are implemented with the
21 | Symfony Console, and therefore they can be configured with the Core-supported
22 | `Execute console commands (scheduler)` task.
23 |
24 | To set up crawler scheduler tasks:
25 |
26 | 1. Add a new Scheduler Task
27 | 2. Select the class :guilabel:`Execute console commands`
28 | 3. Select :guilabel:`Frequency` for the execution
29 | 4. Go to section :guilabel:`Schedulable Command. Save and reopen to define
30 | command arguments` at the bottom.
31 | 5. Select e.g. :guilabel:`crawler:buildQueue` (press save)
32 | 6. Select the options you want to execute the queue with; it is important to
33 | check the checkboxes and not only fill in the values.
34 |
35 | Now you can save and close, and your scheduler tasks will be running as
36 | configured.
37 |
38 | The configured task will look like this:
39 |
40 | .. figure:: /Images/backend_scheduler_record.png
41 | :alt: Task configuration for building the queue
42 |
43 | Task configuration for building the queue
44 |
45 | After saving and closing, you can see which command is executed; it uses the
46 | same parameters you would use when running it from the CLI,
47 | see :ref:`executing-the-queue-cli-label`.
48 |
49 | .. figure:: /Images/backend_scheduler_overview.png
50 | :alt: Task in the scheduled tasks overview
51 |
52 | Task in the scheduled tasks overview
53 |
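As a rough illustration, the :guilabel:`crawler:buildQueue` task shown above corresponds to a CLI call along the following lines; the page uid, configuration key and option values are only examples:

.. code-block:: bash
   :caption: illustrative CLI equivalent of the scheduler task

   $ vendor/bin/typo3 crawler:buildQueue 1 default --depth 4 --mode queue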
--------------------------------------------------------------------------------
/Documentation/Sitemap.rst:
--------------------------------------------------------------------------------
1 | :template: sitemap.html
2 |
3 | .. include:: /Includes.rst.txt
4 |
5 | .. _sitemap:
6 |
7 | =======
8 | Sitemap
9 | =======
10 |
11 | .. The sitemap.html template will insert here the page tree automatically.
12 |
--------------------------------------------------------------------------------
/Documentation/Troubleshooting/_htaccess.txt:
--------------------------------------------------------------------------------
1 |
2 | # Rules to set ApplicationContext based on hostname
3 | RewriteCond %{HTTP_HOST} ^(.*)\.my\-site\.localhost$
4 | RewriteRule .? - [E=TYPO3_CONTEXT:Development]
5 | RewriteCond %{HTTP_HOST} ^(.*)\.mysite\.info$
6 | RewriteRule .? - [E=TYPO3_CONTEXT:Production/Staging]
7 | RewriteCond %{HTTP_HOST} ^(.*)\.my\-site\.info$
8 | RewriteRule .? - [E=TYPO3_CONTEXT:Production]
9 |
10 |
--------------------------------------------------------------------------------
/Documentation/UseCases/CacheWarmup/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _use-case-cache-warm-up:
4 |
5 | =============
6 | Cache warm up
7 | =============
8 |
9 | A website that is fast for the end-user is essential. Having a warm cache
10 | even before the first user hits the newly deployed website is therefore
11 | beneficial. So how can one achieve this?
12 |
13 | The crawler ships some command line tools (hereafter CLI tools) that can be
14 | used during deployments. The CLI tools are implemented with `symfony/console`,
15 | which has been the standard in TYPO3 for a while.
16 |
17 | There are three commands that can be of benefit to you during deployments:
18 |
19 | * :bash:`vendor/bin/typo3 crawler:flushQueue`
20 | * :bash:`vendor/bin/typo3 crawler:buildQueue`
21 | * :bash:`vendor/bin/typo3 crawler:processQueue`
22 |
23 | You can see more about which parameters they take in :ref:`command-controller`.
24 | This example provides a suggestion on how you can set it up, and you can
25 | adjust it with additional parameters if you like.
26 |
27 | .. rst-class:: bignums-xxl
28 |
29 | .. _create-crawler-configuration:
30 | #. Create crawler configuration
31 |
32 | First we need a `crawler configuration`; these are stored in the database. You
33 | can add it via the backend, see :ref:`backend-configuration-record`.
34 |
35 | It's suggested to select the most important pages of the website and add
36 | them to a Crawler configuration called e.g. `deployment`:
37 |
38 | .. figure:: /Images/backend_configuration_deployment.png
39 | :alt: Crawler configuration record
40 |
41 | Crawler configuration record
42 |
43 | .. hint::
44 | Let's say your website has frontend users with one or multiple user
45 | groups. In this case you need to create multiple crawler
46 | configurations: for every possible combination of user groups that a
47 | user can have, you need to create an individual crawler configuration.
48 |
49 | All those crawler configurations need to be added to the
50 | `crawler:processQueue` command to be considered. If you miss this,
51 | some users get a warmed-up cache, but those with a combination of
52 | user groups that was not taken into account in a crawler configuration
53 | will get an uncached page.
54 |
55 | #. Build the queue
56 |
57 | With this configuration, only the pages added to it will be crawled. So how
58 | do we execute this from the CLI during deployment? Which deployment tool you
59 | use is not important, as long as you can execute shell commands. What would
60 | you need to execute?
61 |
62 | .. literalinclude:: _commands.bash
63 | :language: bash
64 |
65 | #. Process the queue
66 |
67 | The last command adds the remaining pages to the queue, and you need a
68 | scheduler task set up to have them processed. Go to the :guilabel:`Scheduler`
69 | module and perform the following steps:
70 |
71 | 1. Add a new Scheduler Task
72 | 2. Select the :guilabel:`Execute console commands`
73 | 3. Select :guilabel:`Frequency` for the execution
74 | 4. Go to section :guilabel:`Schedulable Command. Save and reopen to define
75 | command arguments` at the bottom.
76 | 5. Select :guilabel:`crawler:processQueue` (press save)
77 | 6. Select the options you want to execute the queue with; it is important to
78 | check the checkboxes and not only fill in the values.
79 |
80 | .. figure:: /Images/backend_scheduler_processqueue.png
81 | :alt: Options of the task
82 |
83 | Options of the task
84 |
85 |
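If you do not want to wait for the scheduler task, the remaining queue can also be processed directly from the CLI as part of the deployment; the option values below are only illustrative:

.. code-block:: bash
   :caption: optional: process the queue during deployment (illustrative values)

   $ vendor/bin/typo3 crawler:processQueue --amount 500 --sleeptime 2 --sleepafter 50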
86 | With these steps you will have a website that is faster from the first visit
87 | after a deployment, and the rest of the website is crawled automatically
88 | shortly after.
89 |
90 | `#HappyCrawling`
91 |
--------------------------------------------------------------------------------
/Documentation/UseCases/CacheWarmup/_commands.bash:
--------------------------------------------------------------------------------
1 | # Done to make sure the crawler queue is empty, so that we will only crawl important pages.
2 | $ vendor/bin/typo3 crawler:flushQueue all
3 |
4 | # Now we want to fill the crawler queue.
5 | # This will start on page uid 1 with the deployment configuration and depth 99.
6 | # --mode exec crawls the pages instantly, so we don't need a secondary process for that.
7 | $ vendor/bin/typo3 crawler:buildQueue 1 deployment --depth 99 --mode exec
8 |
9 | # Add the rest of the pages to the crawler queue and have them processed with the scheduler.
10 | # --mode queue is the default, but it is added for visibility;
11 | # we assume that you have a crawler configuration called default
12 | $ vendor/bin/typo3 crawler:buildQueue 1 default --depth 99 --mode queue
13 |
--------------------------------------------------------------------------------
/Documentation/UseCases/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _use-cases:
4 |
5 | =========
6 | Use cases
7 | =========
8 |
9 | This section shows different use cases for the crawler and the value that
10 | installing it can bring. The crawler has evolved over the years to cover
11 | multiple use cases. If you have one that is not listed here, feel free
12 | to open a PR or an issue on `https://github.com/tomasnorre/crawler
13 | <https://github.com/tomasnorre/crawler>`_.
14 |
15 | .. toctree::
16 | :maxdepth: 5
17 | :titlesonly:
18 | :glob:
19 |
20 | CacheWarmup/Index
21 | IndexedSearch/Index
22 |
23 |
--------------------------------------------------------------------------------
/Documentation/UseCases/IndexedSearch/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _use-case-indexed-search:
4 |
5 | ==============
6 | Indexed Search
7 | ==============
8 |
9 | The TYPO3 Crawler is quite often used to generate the index of Indexed Search.
10 |
11 | Unfortunately we don't have any good documentation included on this yet, but you can help in two ways:
12 |
13 | 1. You can help write the documentation.
14 | 2. You can contribute to the money pool to help sponsor those writing the documentation.
15 |
16 | You can see the issue here: https://github.com/tomasnorre/crawler/issues/558
17 | or tip in the money pool here: https://www.paypal.com/paypalme/tomasnorre/10
18 |
19 | `#HappyCrawling`
20 |
21 | .. _use-case-indexed-search-setup:
22 |
23 | Setup Index Search
24 | ==================
25 |
26 | With the latest improvements of the TYPO3 Crawler and Indexed Search, it has
27 | become easier to set up Indexed Search to work with the TYPO3 Crawler.
28 |
29 | You need a few things to get this working:
30 |
31 | 1. Create a :ref:`backend-configuration-record`
32 | 2. Add an Indexed Search indexing configuration (see: https://docs.typo3.org/c/typo3/cms-indexed-search/main/en-us/IndexingConfigurations/Configurations/Index.html)
33 |
34 | If you want to index e.g. PDF files please ensure that you have the
35 | respective tools installed on your server. For PDFs that would be `pdftotext` and
36 | `pdfinfo`.
37 |
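A quick way to check whether these tools are available is sketched below; the package name assumes a Debian/Ubuntu based server:

.. code-block:: bash
   :caption: checking for the PDF tools (Debian/Ubuntu example)

   # Both binaries must be resolvable for PDF indexing to work
   which pdftotext pdfinfo

   # On Debian/Ubuntu they are shipped in the poppler-utils package
   sudo apt-get install poppler-utils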
--------------------------------------------------------------------------------
/Documentation/guides.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
11 |
17 |
18 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: help
2 | help: ## Displays this list of targets with descriptions
3 | @echo "The following commands are available:\n"
4 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[32m%-30s\033[0m %s\n", $$1, $$2}'
5 |
6 |
7 | .PHONY: docs
8 | docs: ## Generate project docs (from "Documentation" directory)
9 | mkdir -p Documentation-GENERATED-temp
10 | docker run --rm --pull always -v "$(shell pwd)":/project -t ghcr.io/typo3-documentation/render-guides:latest --config=Documentation
11 |
12 |
13 | .PHONY: test-docs
14 | test-docs: ## Test the documentation rendering
15 | mkdir -p Documentation-GENERATED-temp
16 | docker run --rm --pull always -v "$(shell pwd)":/project -t ghcr.io/typo3-documentation/render-guides:latest --config=Documentation --no-progress --minimal-test
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TYPO3 Crawler
2 | [](https://packagist.org/packages/tomasnorre/crawler)
3 | [](https://packagist.org/packages/tomasnorre/crawler)
4 | [](https://packagist.org/packages/tomasnorre/crawler)
5 | 
6 | [](https://scrutinizer-ci.com/g/tomasnorre/crawler/?branch=main)
7 | [](https://coveralls.io/github/tomasnorre/crawler)
8 | [](https://dashboard.stryker-mutator.io/reports/github.com/tomasnorre/crawler/main)
9 | 
10 | [](http://isitmaintained.com/project/tomasnorre/crawler "Average time to resolve an issue")
11 | [](http://isitmaintained.com/project/tomasnorre/crawler "Percentage of issues still open")
12 |
13 | Libraries and scripts for crawling the TYPO3 page tree. Used for re-caching, re-indexing, publishing applications etc.
14 |
15 |
16 | You can include the crawler in your TYPO3 project with composer or from the [TYPO3 Extension Repository](https://extensions.typo3.org/extension/crawler)
17 |
18 | ```shell script
19 | composer require tomasnorre/crawler
20 | ```
21 |
22 | **Crawler processes**
23 |
24 | 
25 |
26 | ## Versions and Support
27 |
28 | | Release | TYPO3     | PHP     | Fixes will contain |
29 | |---------|-----------|---------|--------------------|
30 | | 12.x.y  | 12.4-13.3 | 8.1-8.4 | Features, Bugfixes, Security Updates; since 12.0.6 TYPO3 13.4, since 12.0.7 PHP 8.4 |
31 | | 11.x.y  | 10.4-11.5 | 7.4-8.1 | Security Updates; since 11.0.3 PHP 8.1 |
32 | | 10.x.y  | 9.5-11.0  | 7.2-7.4 | Security Updates |
33 | | 9.x.y   | 9.5-11.0  | 7.2-7.4 | As this version has the same requirements as 10.x.y, there will be no further releases of this version, please update instead. |
34 | | 8.x.y   |           |         | Releases do not exist |
35 | | 7.x.y   |           |         | Releases do not exist |
36 | | 6.x.y   | 7.6-8.7   | 5.6-7.3 | Security Updates |
37 |
38 | ### Documentation
39 | Please read the [documentation](https://docs.typo3.org/p/tomasnorre/crawler/master/en-us/)
40 |
41 | To render the documentation locally, please use the official TYPO3 Documentation rendering Docker Tool.
42 |
43 |
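For example, using the `docs` target from the Makefile shipped in this repository:

```shell
make docs
```
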
44 | ### Contributions
45 |
46 | Please see [CONTRIBUTING.md](https://github.com/tomasnorre/crawler/blob/main/CONTRIBUTING.md)
47 |
48 | ### Honorable Previous Maintainers
49 |
50 | * Kasper Skaarhoj
51 | * Daniel Poetzinger
52 | * Fabrizio Branca
53 | * Tolleiv Nietsch
54 | * Timo Schmidt
55 | * Michael Klapper
56 | * Stefan Rotsch
57 |
--------------------------------------------------------------------------------
/Resources/Private/Language/af.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ar.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ca.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/cs.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/da.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Side Id som crawleren vil bruge for at indlæse TSFE (påkrævet)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | Når en side crawles direkte fra TYPO3 Backend. fx. ved at bruge "læs" funktionaliteten i "Crawler Log" modulet, bruges den valgte siden til at initialisere frontend renderingen. Adgang til den valgte side <strong>MÅ IKKE</strong> være begrænset, i så fald vil crawlingen fejle.
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/Resources/Private/Language/de.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Seiten-ID, die der Crawler zur Initialisierung des TSFE verwendet (erforderlich)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | Beim Crawlen einer Seite direkt im TYPO3-Backend, z.B. unter Verwendung der "read"-Funktionalität des Moduls "Crawler-Protokoll" wird die ausgewählte Seiten-ID zur Initialisierung der Frontend-Darstellung verwendet.
14 | Zugriff auf die ausgewählte Seite <strong>DARF NICHT</strong> eingeschränkt sein; das Crawling wird sonst fehlschlagen.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/el.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/es.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/fi.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/fr.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/he.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/hu.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/it.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ja.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ko.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 |
9 |
10 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
11 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/Resources/Private/Language/nl.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/no.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/pl.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/pt.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ro.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ru.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/sr.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/sv.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/tr.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/uk.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/vi.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/zh.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Layouts/BackendModule.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/Resources/Private/Php/Libraries/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "config": {
3 | "classmap-authoritative": true,
4 | "prepend-autoloader": false
5 | },
6 | "require": {
7 | "beberlei/assert": "^3.3",
8 | "guzzlehttp/guzzle": "^6.4.1 || ^7.2",
9 | "psr/http-message": "^2.0",
10 | "psr/log": "^1.0 || ^2.0 || ^3.0",
11 | "symfony/console": "^6.4 || ^7.0"
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/Resources/Public/Css/backend_crawler.css:
--------------------------------------------------------------------------------
1 | table.crawlerLogActions {
2 | border-spacing: 0px 10px;
3 | border-collapse: initial;
4 | }
5 |
6 | table.crawlerLogActions tr.firstRow td {
7 | padding: 0px 10px 0px 0px;
8 | }
9 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/Extension.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/bullet_green.svg:
--------------------------------------------------------------------------------
1 |
3 |
7 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/bullet_orange.svg:
--------------------------------------------------------------------------------
1 |
3 |
7 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/bullet_red.svg:
--------------------------------------------------------------------------------
1 |
3 |
7 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/crawler_configuration.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/crawler_start.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/crawler_stop.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Supported Versions
4 |
5 | | Release | TYPO3     | PHP     | Fixes will contain |
6 | |---------|-----------|---------|--------------------|
7 | | 12.x.y  | 12.2      | 8.1     | Features, Bugfixes, Security Updates |
8 | | 11.x.y  | 10.4-11.5 | 7.4-8.1 | Bugfixes, Security Updates; PHP 8.1 supported since 11.0.3 |
9 | | 10.x.y  | 9.5-11.0  | 7.2-7.4 | Security Updates |
10 | | 9.x.y   | 9.5-11.0  | 7.2-7.4 | As this version has the same requirements as 10.x.y, there will be no further releases; please update instead |
11 | | 8.x.y   |           |         | Releases do not exist |
12 | | 7.x.y   |           |         | Releases do not exist |
13 | | 6.x.y   | 7.6-8.7   | 5.6-7.3 | Security Updates |
14 |
15 |
16 | ## Reporting a Vulnerability
17 |
18 | In case you find a security issue, please write an email to: [tomasnorre@gmail.com](mailto:tomasnorre@gmail.com)
19 |
--------------------------------------------------------------------------------
/cli/bootstrap.php:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ext_conf_template.txt:
--------------------------------------------------------------------------------
41 | processMaxRunTime=300
42 |
43 | #########
44 | ## Cleanup
45 | #########
46 |
47 | # cat=Cleanup; type=boolean; label=Clean up old queue entries: If checked, older queue entries will be deleted when adding new crawler configurations from the CLI.
48 | cleanUpOldQueueEntries=1
49 |
50 | # cat=Cleanup; type=int [1- 99]; label=Processed Age: If Clean up old queue entries is checked, then processed entries older than X days are deleted.
51 | cleanUpProcessedAge=2
52 |
53 | # cat=Cleanup; type=int [1- 99]; label=Scheduled Age: If Clean up old queue entries is checked, then scheduled entries older than X days are deleted.
54 | cleanUpScheduledAge=7
55 |
56 | # cat=Cleanup; type=int [1-365]; label= Delete processed items: Delete processed items from the queue after n days (0 will keep the entries forever - the database may grow very large over time!)
57 | purgeQueueDays=14
58 |
59 | #########
60 | ## System
61 | #########
62 |
63 | # cat=System; type=string; label= Name of the php binary (e.g. PHP72-LATEST-CLI ), default is php
64 | phpBinary=php
65 |
66 | # cat=System; type=string; label= PHP Path: Local path to php binary file (e.g. "/usr/bin/php"), you should ONLY use this when the resolved php-binary isn't the correct one. You can check that in the Info -> Site Crawling -> Crawling Process -> CLI-Path
67 | phpPath=
68 |
69 | #########
70 | ## Debug
71 | #########
72 |
73 | # cat=Debug; type=boolean; label= Debug: Print multiprocess processing information - prints whether a process was actually executed and which status it has
74 | processDebug=0
75 |
76 | # cat=Debug; type=boolean; label= Make multiprocess processing verbose while running
77 | processVerbose=0
78 |
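The # cat=... definitions above are TYPO3 extension-settings declarations; the resulting values can be read at runtime through the core ExtensionConfiguration API. A minimal sketch of hypothetical consumer code follows (the extension ships its own Classes/Configuration/ExtensionConfigurationProvider.php, which is the canonical access path); the fallback values mirror the template defaults shown above:

<?php

declare(strict_types=1);

use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
use TYPO3\CMS\Core\Utility\GeneralUtility;

// Hypothetical consumer code: read the settings defined in ext_conf_template.txt
// for the extension key 'crawler'.
$settings = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('crawler');

// Fall back to the template defaults when a setting has not been saved yet.
$purgeQueueDays = (int)($settings['purgeQueueDays'] ?? 14);
$phpPath = (string)($settings['phpPath'] ?? ''); // empty means: use the resolved phpBinary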
--------------------------------------------------------------------------------
/ext_emconf.php:
--------------------------------------------------------------------------------
1 | <?php
2 | $EM_CONF[$_EXTKEY] = [
3 | 'title' => 'Site Crawler',
4 | 'description' => 'Libraries and scripts for crawling the TYPO3 page tree.',
5 | 'category' => 'module',
6 | 'state' => 'stable',
7 | 'uploadfolder' => 0,
8 | 'createDirs' => '',
9 | 'clearCacheOnLoad' => 0,
10 | 'author' => 'Tomas Norre Mikkelsen',
11 | 'author_email' => 'tomasnorre@gmail.com',
12 | 'author_company' => '',
13 | 'version' => '12.0.8',
14 | 'constraints' => [
15 | 'depends' => [
16 | 'php' => '8.1.0-8.99.99',
17 | 'typo3' => '12.4.0-13.4.99',
18 | ],
19 | 'conflicts' => [],
20 | 'suggests' => [],
21 | ]
22 | ];
23 |
--------------------------------------------------------------------------------
/ext_localconf.php:
--------------------------------------------------------------------------------
1 | isPackageActive('indexed_search')) {
13 | // Register with "indexed_search" extension
14 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']['indexed_search'] = [
15 | 'key' => 'tx_indexedsearch_reindex',
16 | 'value' => 'Re-indexing'
17 | ];
18 | }
19 |
20 |
21 |
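The block above registers a processing instruction for indexed_search via $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']. As a sketch, a third-party extension could register its own instruction in its ext_localconf.php the same way; the array index 'my_ext', the key 'tx_myextwarmup' and the label below are made-up illustration values:

<?php

// Sketch only: register a custom processing instruction with the crawler,
// mirroring the EXTCONF pattern used above. 'my_ext' and 'tx_myextwarmup'
// are hypothetical names.
defined('TYPO3') or die();

$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']['my_ext'] = [
    'key' => 'tx_myextwarmup',
    'value' => 'Warm up caches',
];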
--------------------------------------------------------------------------------
/ext_tables.sql:
--------------------------------------------------------------------------------
1 | #
2 | # Table structure for table 'tx_crawler_queue'
3 | #
4 | CREATE TABLE tx_crawler_queue
5 | (
6 | qid int(11) DEFAULT '0' NOT NULL auto_increment,
7 | page_id int(11) DEFAULT '0' NOT NULL,
8 | parameters text NOT NULL,
9 | parameters_hash varchar(50) DEFAULT '' NOT NULL,
10 | configuration_hash varchar(50) DEFAULT '' NOT NULL,
11 | scheduled int(11) DEFAULT '0' NOT NULL,
12 | exec_time int(11) DEFAULT '0' NOT NULL,
13 | set_id int(11) DEFAULT '0' NOT NULL,
14 | result_data longtext NOT NULL,
15 | process_scheduled int(11) DEFAULT '0' NOT NULL,
16 | process_id varchar(50) DEFAULT '' NOT NULL,
17 | process_id_completed varchar(50) DEFAULT '' NOT NULL,
18 | configuration varchar(250) DEFAULT '' NOT NULL,
19 |
20 | PRIMARY KEY (qid),
21 | KEY page_id (page_id),
22 | KEY set_id (set_id),
23 | KEY exec_time (exec_time),
24 | KEY scheduled (scheduled),
25 | KEY process_id (process_id),
26 | KEY parameters_hash (parameters_hash),
27 | KEY configuration_hash (configuration_hash),
28 | KEY cleanup (exec_time,scheduled)
29 | ) ENGINE=InnoDB;
30 |
31 | #
32 | # Table structure for table 'tx_crawler_process'
33 | #
34 | CREATE TABLE tx_crawler_process
35 | (
36 | process_id varchar(50) DEFAULT '' NOT NULL,
37 | active smallint(6) DEFAULT '0',
38 | ttl int(11) DEFAULT '0' NOT NULL,
39 | assigned_items_count int(11) DEFAULT '0' NOT NULL,
40 | deleted tinyint(4) unsigned DEFAULT '0' NOT NULL,
41 | system_process_id int(11) DEFAULT '0' NOT NULL,
42 |
43 | KEY update_key (active,deleted),
44 | KEY process_id (process_id)
45 | ) ENGINE=InnoDB;
46 |
47 | #
48 | # Table structure for table 'tx_crawler_configuration'
49 | #
50 | CREATE TABLE tx_crawler_configuration
51 | (
52 | name tinytext NOT NULL,
53 | force_ssl tinyint(4) DEFAULT '0' NOT NULL,
54 | processing_instruction_filter varchar(200) DEFAULT '' NOT NULL,
55 | processing_instruction_parameters_ts varchar(200) DEFAULT '' NOT NULL,
56 | configuration text NOT NULL,
57 | base_url tinytext NOT NULL,
58 | pidsonly blob,
59 | begroups varchar(100) DEFAULT '0' NOT NULL,
60 | fegroups varchar(100) DEFAULT '0' NOT NULL,
61 | exclude text NOT NULL
62 |
63 | ) ENGINE=InnoDB;
64 |
65 | #
66 | # Table structure for table 'pages'
67 | # This is added to reuse the information from typo3/cms-seo.
68 | # As we don't have a dependency on typo3/cms-seo, it's added here to ensure that the
69 | # database queries don't break.
70 | #
71 | CREATE TABLE pages
72 | (
73 | sitemap_priority decimal(2, 1) DEFAULT '0.5' NOT NULL
74 | );
75 |
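The tx_crawler_queue schema above, with its exec_time and scheduled indexes, indicates how pending entries are looked up: an entry is due when its scheduled timestamp has passed and exec_time is still 0. A minimal sketch using TYPO3's QueryBuilder follows, assuming direct table access for illustration only (the extension's QueueRepository is the real access layer):

<?php

declare(strict_types=1);

use TYPO3\CMS\Core\Database\Connection;
use TYPO3\CMS\Core\Database\ConnectionPool;
use TYPO3\CMS\Core\Utility\GeneralUtility;

// Sketch: fetch queue entries that are due but not yet processed, using the
// exec_time and scheduled columns/indexes defined above.
$queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
    ->getQueryBuilderForTable('tx_crawler_queue');

$pendingEntries = $queryBuilder
    ->select('qid', 'page_id', 'configuration')
    ->from('tx_crawler_queue')
    ->where(
        $queryBuilder->expr()->eq('exec_time', $queryBuilder->createNamedParameter(0, Connection::PARAM_INT)),
        $queryBuilder->expr()->lte('scheduled', $queryBuilder->createNamedParameter(time(), Connection::PARAM_INT))
    )
    ->orderBy('scheduled')
    ->executeQuery()
    ->fetchAllAssociative();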
--------------------------------------------------------------------------------