17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/RunViaBackend/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _run-backend:
4 |
5 | ===============
6 | Run via backend
7 | ===============
8 |
9 | To process the queue you must either set up a cron-job on your server
10 | or use the backend to process the queue:
11 |
12 | .. figure:: /Images/backend_processlist_add_process.png
13 | :alt: Process the queue via backend
14 |
15 | Process the queue via backend
16 |
17 | You can also (re-)crawl single URLs manually from within the :guilabel:`Crawler
18 | log` view in the info module:
19 |
20 | .. figure:: /Images/backend_crawlerlog_recrawl.png
21 | :alt: Crawl single URLs via backend
22 |
23 | Crawl single URLs via backend
24 |
--------------------------------------------------------------------------------
/Classes/EventListener/ShouldUseCachedPageDataIfAvailableEventListener.php:
--------------------------------------------------------------------------------
1 | getRequest()->getAttribute('tx_crawler') === null) {
18 | return;
19 | }
20 | $event->setShouldUseCachedPageData(false);
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/Classes/Exception/ProcessException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Exception type of the crawler extension.
 *
 * Declares no members of its own; message, code and previous-exception
 * handling are inherited unchanged from \Exception.
 *
 * @internal since v12.0.0
 */
class ProcessException extends \Exception
{
}
28 |
--------------------------------------------------------------------------------
/Documentation/Features/Hooks/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _hooks:
4 |
5 | =====
6 | Hooks
7 | =====
8 |
9 | Register the following hooks in :file:`ext_localconf.php` of your extension.
10 |
11 | .. _hooks-excludeDoktype:
12 |
13 | excludeDoktype Hook
14 | ===================
15 |
16 | By adding doktype ids to the following array you can exclude them from
17 | being crawled:
18 |
19 | .. code-block:: php
20 | :caption: packages/my_extension/ext_localconf.php
21 |
22 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'][] =
23 |
24 | pageVeto Hook
25 | =============
26 |
27 | .. deprecated:: 11.0.0
28 | Removed in 13.0, please migrate to the PSR-14 Event :ref:`psr14-modify-skip-page-event`!
29 |
--------------------------------------------------------------------------------
/ext_emconf.php:
--------------------------------------------------------------------------------
1 | 'Site Crawler',
4 | 'description' => 'TYPO3 Crawler crawls the TYPO3 page tree. Used for cache warmup, indexing, publishing applications etc.',
5 | 'category' => 'module',
6 | 'state' => 'stable',
7 | 'uploadfolder' => 0,
8 | 'createDirs' => '',
9 | 'clearCacheOnLoad' => 0,
10 | 'author' => 'Tomas Norre Mikkelsen',
11 | 'author_email' => 'tomasnorre@gmail.com',
12 | 'author_company' => '',
13 | 'version' => '12.0.10',
14 | 'constraints' => [
15 | 'depends' => [
16 | 'php' => '8.1.0-8.99.99',
17 | 'typo3' => '12.4.0-13.4.99',
18 | ],
19 | 'conflicts' => [],
20 | 'suggests' => [],
21 | ]
22 | ];
23 |
--------------------------------------------------------------------------------
/Classes/Exception/NoIndexFoundException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Exception type of the crawler extension.
 *
 * Declares no members of its own; message, code and previous-exception
 * handling are inherited unchanged from \Exception.
 *
 * @internal since v12.0.0
 */
class NoIndexFoundException extends \Exception
{
}
28 |
--------------------------------------------------------------------------------
/Classes/EventListener/AfterQueueItemAddedEventListener.php:
--------------------------------------------------------------------------------
1 | getConnectionForTable(QueueRepository::TABLE_NAME)
17 | ->update(QueueRepository::TABLE_NAME, $event->getFieldArray(), [
18 | 'qid' => (int) $event->getQueueId(),
19 | ]);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/Classes/Exception/CommandNotFoundException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Exception type of the crawler extension.
 *
 * Declares no members of its own; message, code and previous-exception
 * handling are inherited unchanged from \Exception.
 *
 * @internal since v12.0.0
 */
class CommandNotFoundException extends \Exception
{
}
28 |
--------------------------------------------------------------------------------
/Classes/Exception/ExtensionSettingsException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Exception type of the crawler extension.
 *
 * Declares no members of its own; message, code and previous-exception
 * handling are inherited unchanged from \Exception.
 *
 * @internal since v12.0.0
 */
class ExtensionSettingsException extends \Exception
{
}
28 |
--------------------------------------------------------------------------------
/Documentation/UseCases/CacheWarmup/_commands.bash:
--------------------------------------------------------------------------------
1 | # Done to make sure the crawler queue is empty, so that we will only crawl important pages.
2 | $ vendor/bin/typo3 crawler:flushQueue all
3 |
4 | # Now we want to fill the crawler queue,
5 | # This will start on page uid 1 with the deployment configuration and depth 99,
6 | # --mode exec crawls the pages instantly so we don't need a secondary process for that.
7 | $ vendor/bin/typo3 crawler:buildQueue 1 deployment --depth 99 --mode exec
8 |
9 | # Add the rest of the pages to the crawler queue and have them processed by the scheduler
10 | # --mode queue is default, but it is added for visibility,
11 | # we assume that you have a crawler configuration called default
12 | $ vendor/bin/typo3 crawler:buildQueue 1 default --depth 99 --mode queue
13 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/crawler_stop.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default_mode_url.txt:
--------------------------------------------------------------------------------
1 | $ bin/typo3 crawler:buildQueue 6 default --depth 2 --mode url
2 | https://crawler-devbox.ddev.site/content-examples/overview
3 | https://crawler-devbox.ddev.site/content-examples/text/rich-text
4 | https://crawler-devbox.ddev.site/content-examples/text/headers
5 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list
6 | https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser
7 | https://crawler-devbox.ddev.site/content-examples/text/text-and-icon
8 | https://crawler-devbox.ddev.site/content-examples/text/text-in-columns
9 | https://crawler-devbox.ddev.site/content-examples/text/list-group
10 | https://crawler-devbox.ddev.site/content-examples/text/panel
11 |
--------------------------------------------------------------------------------
/ext_localconf.php:
--------------------------------------------------------------------------------
1 | isPackageActive('indexed_search')) {
13 | // Register with "indexed_search" extension
14 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']['indexed_search'] = [
15 | 'key' => 'tx_indexedsearch_reindex',
16 | 'value' => 'Re-indexing'
17 | ];
18 | }
19 |
20 |
21 |
--------------------------------------------------------------------------------
/Classes/Exception/TimeStampException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Exception type of the crawler extension.
 *
 * Declares no members of its own; everything is inherited from \Exception.
 * The class is flagged as deprecated, so new code should not start using it.
 *
 * @internal since v12.0.0
 * @deprecated since 12.0.5 will be removed in v14.x
 */
class TimeStampException extends \Exception
{
}
29 |
--------------------------------------------------------------------------------
/Classes/Exception/CrawlerObjectException.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Exception type of the crawler extension.
 *
 * Declares no members of its own; everything is inherited from \Exception.
 * The class is flagged as deprecated, so new code should not start using it.
 *
 * @internal since v12.0.0
 * @deprecated since 12.0.5 will be removed in v14.x
 */
class CrawlerObjectException extends \Exception
{
}
29 |
--------------------------------------------------------------------------------
/Classes/Helper/Sleeper/SleeperInterface.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Abstraction over "pause for a number of seconds".
 *
 * Callers depend on this contract instead of calling a sleep function
 * directly, which lets different implementations be swapped in.
 *
 * @internal since v12.0.0
 */
interface SleeperInterface
{
    /**
     * @param int $seconds Duration handed to the implementation.
     */
    public function sleep(int $seconds): void;
}
29 |
--------------------------------------------------------------------------------
/Configuration/Icons.php:
--------------------------------------------------------------------------------
1 | [
9 | 'provider' => SvgIconProvider::class,
10 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_configuration.svg',
11 | ],
12 | 'tx-crawler-start' => [
13 | 'provider' => SvgIconProvider::class,
14 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_start.svg',
15 | ],
16 | 'tx-crawler-stop' => [
17 | 'provider' => SvgIconProvider::class,
18 | 'source' => 'EXT:crawler/Resources/Public/Icons/crawler_stop.svg',
19 | ],
20 | 'tx-crawler-icon' => [
21 | 'provider' => SvgIconProvider::class,
22 | 'source' => 'EXT:crawler/Resources/Public/Icons/Extension.svg',
23 | ],
24 | ];
25 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _executing-the-queue-label:
4 |
5 | ===================
6 | Executing the queue
7 | ===================
8 |
9 | The idea of the queue is that a large number of tasks can be submitted
10 | to the queue and performed over a longer period of time. This can be
11 | useful for several reasons:
12 |
13 | - To spread server load over time.
14 |
15 | - To time the requests for nightly processing.
16 |
17 | - And simply to avoid `max_execution_time` of PHP to limit processing
18 | to 30 seconds!
19 |
20 |
21 | .. toctree::
22 | :maxdepth: 5
23 | :titlesonly:
24 | :glob:
25 |
26 | RunningViaCommandController/Index
27 | ExecutingQueueWithCron-job/Index
28 | RunViaBackend/Index
29 | BuildingAndExecutingQueueRightAway(fromCli)/Index
30 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: help
help: ## Displays this list of targets with descriptions
	@# printf instead of echo: whether echo expands "\n" depends on the shell behind /bin/sh.
	@printf 'The following commands are available:\n\n'
	@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[32m%-30s\033[0m %s\n", $$1, $$2}'


.PHONY: docs
docs: ## Generate projects docs (from "Documentation" directory)
	mkdir -p Documentation-GENERATED-temp
	docker run --rm --pull always -v "$(shell pwd)":/project -t ghcr.io/typo3-documentation/render-guides:latest --config=Documentation


.PHONY: test-docs
test-docs: ## Test the documentation rendering
	mkdir -p Documentation-GENERATED-temp
	docker run --rm --pull always -v "$(shell pwd)":/project -t ghcr.io/typo3-documentation/render-guides:latest --config=Documentation --no-progress --minimal-test
17 |
--------------------------------------------------------------------------------
/Classes/Writer/FileWriter/CsvWriter/CsvWriterInterface.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Contract for turning a set of records into a CSV string.
 *
 * @internal since v12.0.0
 */
interface CsvWriterInterface
{
    /**
     * @param array $records Records to serialise; the expected element shape
     *                       is not defined here — check the implementation.
     * @return string The CSV representation produced by the implementation.
     */
    public function arrayToCsv(array $records): string;
}
29 |
--------------------------------------------------------------------------------
/Classes/Hooks/CrawlerHookInterface.php:
--------------------------------------------------------------------------------
1 |
9 | * (c) 2021- Tomas Norre Mikkelsen
10 | *
11 | * This file is part of the TYPO3 Crawler Extension.
12 | *
13 | * It is free software; you can redistribute it and/or modify it under
14 | * the terms of the GNU General Public License, either version 2
15 | * of the License, or any later version.
16 | *
17 | * For the full copyright and license information, please read the
18 | * LICENSE.txt file that was distributed with this source code.
19 | *
20 | * The TYPO3 project - inspiring people to share!
21 | */
22 |
/**
 * Contract for crawler hook classes: a single parameterless
 * initialisation callback.
 *
 * @internal since v12.0.0
 */
interface CrawlerHookInterface
{
    /**
     * Initialisation callback; takes no arguments and returns nothing.
     */
    public function crawler_init(): void;
}
30 |
--------------------------------------------------------------------------------
/Classes/Helper/Sleeper/NullSleeper.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Sleeper whose sleep() has an empty body, so a call returns immediately
 * without pausing.
 *
 * The comment must open with a double asterisk: with a plain block comment
 * the annotations below are not part of a docblock and tools ignore them.
 *
 * @internal
 * @codeCoverageIgnore
 */
final class NullSleeper implements SleeperInterface
{
    /**
     * Intentionally does nothing.
     *
     * @param int $seconds Accepted to satisfy the interface; not used.
     */
    #[\Override]
    public function sleep(int $seconds): void
    {
    }
}
33 |
--------------------------------------------------------------------------------
/Classes/Helper/Sleeper/SystemSleeper.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Sleeper that delegates to PHP's built-in sleep() function.
 *
 * The comment must open with a double asterisk: with a plain block comment
 * the annotation below is not part of a docblock and tools ignore it.
 *
 * @internal
 */
final class SystemSleeper implements SleeperInterface
{
    /**
     * Blocks the current process via \sleep().
     *
     * @param int $seconds Number of seconds passed straight to \sleep().
     */
    #[\Override]
    public function sleep(int $seconds): void
    {
        \sleep($seconds);
    }
}
33 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/crawler_start.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Documentation/guides.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
11 |
17 |
18 |
--------------------------------------------------------------------------------
/Documentation/Features/MultiprocessSupport/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _multi-process:
4 |
5 | =====================
6 | Multi process support
7 | =====================
8 |
9 | If you want to optimize the crawling process for speed (instead of low
10 | server stress), for example because the machine is a dedicated staging
11 | machine, you should experiment with the multi process features.
12 |
13 | In the extension settings you can set how many processes are allowed to
14 | run at the same time, how many queue entries a process should grab and
15 | how long a process is allowed to run. Then run one (or even more)
16 | crawling processes per minute. You'll be able to speed up the crawler quite a lot.
17 |
18 | But choose your settings carefully, as this puts additional load on the server.
19 |
20 | .. figure:: /Images/crawler_settings_processLimit.png
21 | :alt: Backend configuration: Processing
22 |
23 | Backend configuration: Processing
24 |
--------------------------------------------------------------------------------
/Classes/Process/ProcessManagerInterface.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Contract for operating-system specific process handling.
 *
 * Only the signatures are fixed here; how a process is looked up or
 * terminated, and what the returned array contains, is left to the
 * implementations.
 *
 * @codeCoverageIgnore
 * @internal since v12.0.10
 */
interface ProcessManagerInterface
{
    /**
     * @param int $pid Identifier of the process to look up.
     */
    public function processExists(int $pid): bool;

    /**
     * @param int $pid Identifier of the process to terminate.
     */
    public function killProcess(int $pid): void;

    /**
     * @return array Element shape is implementation-defined.
     */
    public function findDispatcherProcesses(): array;
}
34 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Supported Versions
4 |
5 | | Release | TYPO3 | PHP | Fixes will contain
6 | |---------|-----------|---------|---|
7 | | 12.x.y | 12.4-13.3 | 8.1-8.4 |Features, Bugfixes, Security Updates, Since 12.0.6 TYPO3 13.4, Since 12.0.7 PHP 8.4
8 | | 11.x.y | 10.4-11.5 | 7.4-8.1 |Security Updates, Since 11.0.3 PHP 8.1
9 | | 10.x.y | 9.5-11.0 | 7.2-7.4 |Security Updates
10 | | 9.x.y | 9.5-11.0 | 7.2-7.4 |As this version has same requirements as 10.x.y, there will be no further releases of this version, please update instead.
11 | | 8.x.y | | | Releases do not exist
12 | | 7.x.y | | | Releases do not exist
13 | | 6.x.y | 7.6-8.7 | 5.6-7.3 | Security Updates
14 |
15 | ## Reporting a Vulnerability
16 |
17 | In case you find a security issue, please write an email to: [tomasnorre@gmail.com](mailto:tomasnorre@gmail.com) or reach out to the [TYPO3 Security Team](https://typo3.org/community/teams/security)
18 |
--------------------------------------------------------------------------------
/Classes/Process/ProcessManagerFactory.php:
--------------------------------------------------------------------------------
1 |
7 | *
8 | * This file is part of the TYPO3 Crawler Extension.
9 | *
10 | * It is free software; you can redistribute it and/or modify it under
11 | * the terms of the GNU General Public License, either version 2
12 | * of the License, or any later version.
13 | *
14 | * For the full copyright and license information, please read the
15 | * LICENSE.txt file that was distributed with this source code.
16 | *
17 | * The TYPO3 project - inspiring people to share!
18 | */
19 |
20 | use TYPO3\CMS\Core\Core\Environment;
21 |
/**
 * Picks the process manager implementation matching the host platform.
 *
 * @internal since v12.0.10
 */
class ProcessManagerFactory
{
    /**
     * Returns a WindowsProcessManager when Environment::isWindows() reports
     * Windows, and a UnixProcessManager in every other case.
     */
    public static function create(): ProcessManagerInterface
    {
        return Environment::isWindows()
            ? new WindowsProcessManager()
            : new UnixProcessManager();
    }
}
36 |
--------------------------------------------------------------------------------
/Classes/Event/AfterUrlCrawledEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Immutable event object carrying a URL together with its result array.
 *
 * @internal since v12.0.0
 */
final class AfterUrlCrawledEvent
{
    private readonly string $url;

    private readonly array $result;

    /**
     * @param string $url    URL exposed through getUrl().
     * @param array  $result Result data exposed through getResult().
     */
    public function __construct(string $url, array $result)
    {
        $this->url = $url;
        $this->result = $result;
    }

    /**
     * @return string The URL given at construction time.
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * @return array The result array given at construction time.
     */
    public function getResult(): array
    {
        return $this->result;
    }
}
43 |
--------------------------------------------------------------------------------
/Classes/Event/AfterUrlAddedToQueueEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
/**
 * Immutable event object carrying a uid together with a field array.
 *
 * @internal since v12.0.0
 */
final class AfterUrlAddedToQueueEvent
{
    private readonly string $uid;

    private readonly array $fieldArray;

    /**
     * @param string $uid        Identifier exposed through getUid().
     * @param array  $fieldArray Field data exposed through getFieldArray().
     */
    public function __construct(string $uid, array $fieldArray)
    {
        $this->uid = $uid;
        $this->fieldArray = $fieldArray;
    }

    /**
     * @return string The uid given at construction time.
     */
    public function getUid(): string
    {
        return $this->uid;
    }

    /**
     * @return array The field array given at construction time.
     */
    public function getFieldArray(): array
    {
        return $this->fieldArray;
    }
}
43 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/RunningViaCommandController/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _command-controller:
4 |
5 | ==========================
6 | Run via command controller
7 | ==========================
8 |
9 | .. _command-controller-buildqueue:
10 |
11 | Create queue
12 | ------------
13 |
14 | .. code-block:: bash
15 | :caption: replace vendor/bin/typo3 with your own cli runner
16 |
17 | $ vendor/bin/typo3 crawler:buildQueue [--depth ] [--number ] [--mode ]
18 |
19 | .. _command-controller-processqueue:
20 |
21 | Run queue
22 | ---------
23 |
24 | .. code-block:: bash
25 | :caption: replace vendor/bin/typo3 with your own cli runner
26 |
27 | $ vendor/bin/typo3 crawler:processQueue [--amount ] [--sleeptime ] [--sleepafter ]
28 |
29 | .. _command-controller-flushqueue:
30 |
31 | Flush queue
32 | -----------
33 |
34 | .. code-block:: bash
35 | :caption: replace vendor/bin/typo3 with your own cli runner
36 |
37 | $ vendor/bin/typo3 crawler:flushQueue
38 |
--------------------------------------------------------------------------------
/Documentation/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _start:
4 |
5 | ======================
6 | Site Crawler Extension
7 | ======================
8 |
9 | :Extension key:
10 | crawler
11 |
12 | :Package name:
13 | tomasnorre/crawler
14 |
15 | :Version:
16 | |release|
17 |
18 | :Language:
19 | en
20 |
21 | :Author:
22 | Tomas Norre Mikkelsen
23 |
24 | :Copyright:
25 | 2005-2021 AOE GmbH, since 2021 Tomas Norre Mikkelsen
26 |
27 | :License:
28 | This document is published under the `Open Content License
29 | `_.
30 |
31 | :Rendered:
32 | |today|
33 |
34 | ----
35 |
36 | Libraries and scripts for crawling the TYPO3 page tree. Used for re-caching, re-indexing, publishing applications etc.
37 |
38 | ----
39 |
40 | **Table of Contents:**
41 |
42 | .. toctree::
43 | :maxdepth: 2
44 | :titlesonly:
45 |
46 | Introduction/Index
47 | Configuration/Index
48 | ExecutingTheQueue/Index
49 | Scheduler/Index
50 | UseCases/Index
51 | Features/Index
52 | Troubleshooting/Index
53 | Links/Links
54 |
55 | .. toctree::
56 | :hidden:
57 |
58 | Sitemap
59 |
--------------------------------------------------------------------------------
/Classes/Event/InvokeQueueChangeEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Domain\Model\Reason;
23 |
/**
 * Immutable event object wrapping a Reason and exposing its two text
 * representations.
 *
 * @internal since v12.0.0
 */
final class InvokeQueueChangeEvent
{
    private readonly Reason $reason;

    /**
     * @param Reason $reason Reason whose texts are exposed by the getters.
     */
    public function __construct(Reason $reason)
    {
        $this->reason = $reason;
    }

    /**
     * @return string Value of Reason::getDetailText() for the wrapped reason.
     */
    public function getReasonDetailedText(): string
    {
        $reason = $this->reason;

        return $reason->getDetailText();
    }

    /**
     * @return string Value of Reason::getReason() for the wrapped reason.
     */
    public function getReasonText(): string
    {
        $reason = $this->reason;

        return $reason->getReason();
    }
}
44 |
--------------------------------------------------------------------------------
/Classes/Service/UserService.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Utility\GeneralUtility;
23 |
/**
 * Helper for comparing comma-separated group lists.
 *
 * @internal since v9.2.5
 */
class UserService
{
    /**
     * Tells whether at least one entry of $groupList occurs in $accessList.
     *
     * An $accessList that PHP's empty() considers empty is treated as
     * "no restriction" and always grants access.
     *
     * @param string $groupList  Comma-separated list of group uids to test.
     * @param string $accessList Comma-separated list of uids that are allowed.
     */
    public static function hasGroupAccess(string $groupList, string $accessList): bool
    {
        if (!empty($accessList)) {
            $candidateUids = explode(',', $groupList);
            foreach ($candidateUids as $candidateUid) {
                if (GeneralUtility::inList($accessList, $candidateUid)) {
                    return true;
                }
            }

            // Restricted, and none of the given groups matched.
            return false;
        }

        return true;
    }
}
42 |
--------------------------------------------------------------------------------
/Classes/Value/CrawlAction.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Assert\Assert;
23 |
/**
 * Value object for a crawl action; only 'start', 'log' and 'multiprocess'
 * pass the constructor's assertion.
 *
 * @internal since v9.2.5
 */
final class CrawlAction implements \Stringable
{
    /**
     * @param string $crawlAction One of the allowed action names; any other
     *                            value fails the Assert check.
     */
    public function __construct(
        private readonly string $crawlAction
    ) {
        $allowedActions = ['start', 'log', 'multiprocess'];

        Assert::that($crawlAction)->inArray($allowedActions);
    }

    /**
     * @return string The action name given at construction time.
     */
    #[\Override]
    public function __toString(): string
    {
        return $this->crawlAction;
    }
}
45 |
--------------------------------------------------------------------------------
/Classes/Value/QueueFilter.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Assert\Assert;
23 |
/**
 * Value object for a queue filter; only 'all', 'pending' and 'finished'
 * pass the constructor's assertion.
 *
 * @internal since v9.2.5
 */
class QueueFilter implements \Stringable
{
    /**
     * @param string $queueFilter One of the allowed filter names, 'all' when
     *                            omitted; any other value fails the Assert check.
     */
    public function __construct(
        private readonly string $queueFilter = 'all'
    ) {
        $allowedFilters = ['all', 'pending', 'finished'];

        Assert::that($queueFilter)->inArray($allowedFilters);
    }

    /**
     * @return string The filter name given at construction time.
     */
    #[\Override]
    public function __toString(): string
    {
        return $this->queueFilter;
    }
}
45 |
--------------------------------------------------------------------------------
/Resources/Private/Language/locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 |
9 |
10 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
11 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/Classes/Service/ProcessInstructionService.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Utility\GeneralUtility;
23 |
24 | /**
25 |  * Decides whether a processing instruction list matches a set of requested items.
26 |  *
27 |  * @internal since v11.0.3
28 |  */
29 | class ProcessInstructionService
30 | {
31 |     /**
32 |      * Looks up each entry of $incoming in $processInstruction via
33 |      * GeneralUtility::inList().
34 |      *
35 |      * @param string $processInstruction list that is searched
36 |      * @param array $incoming items to look for; an empty array allows everything
37 |      * @return bool true when $incoming is empty or at least one entry is found
38 |      */
39 |     public function isAllowed(string $processInstruction, array $incoming): bool
40 |     {
41 |         // Without any requested items there is nothing to restrict.
42 |         $isAllowed = $incoming === [];
43 |
44 |         foreach ($incoming as $candidate) {
45 |             if (GeneralUtility::inList($processInstruction, $candidate)) {
46 |                 $isAllowed = true;
47 |                 break;
48 |             }
49 |         }
50 |
51 |         return $isAllowed;
52 |     }
53 | }
54 |
--------------------------------------------------------------------------------
/Classes/Event/ModifySkipPageEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 |  * Event carrying a page row together with a mutable "skipped" state.
24 |  *
25 |  * The state starts out as false and can be replaced by false or a string
26 |  * through setSkipped(); the page row cannot be changed after construction.
27 |  *
28 |  * @internal since v12.0.0
29 |  */
30 | final class ModifySkipPageEvent
31 | {
32 |     private readonly array $pageRow;
33 |
34 |     private false|string $skipped;
35 |
36 |     public function __construct(array $pageRow)
37 |     {
38 |         $this->pageRow = $pageRow;
39 |         $this->skipped = false;
40 |     }
41 |
42 |     /**
43 |      * Returns the page row passed to the constructor.
44 |      */
45 |     public function getPageRow(): array
46 |     {
47 |         return $this->pageRow;
48 |     }
49 |
50 |     /**
51 |      * Returns the current skipped state: false or a string.
52 |      */
53 |     public function isSkipped(): false|string
54 |     {
55 |         return $this->skipped;
56 |     }
57 |
58 |     /**
59 |      * Replaces the skipped state.
60 |      */
61 |     public function setSkipped(false|string $skipped): void
62 |     {
63 |         $this->skipped = $skipped;
64 |     }
65 | }
66 |
--------------------------------------------------------------------------------
/Classes/Event/BeforeQueueItemAddedEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 |  * Event carrying a queue id and a replaceable queue record.
24 |  *
25 |  * @internal since v12.0.0
26 |  */
27 | final class BeforeQueueItemAddedEvent
28 | {
29 |     private readonly int $queueId;
30 |
31 |     private array $queueRecord;
32 |
33 |     public function __construct(int $queueId, array $queueRecord)
34 |     {
35 |         $this->queueId = $queueId;
36 |         $this->queueRecord = $queueRecord;
37 |     }
38 |
39 |     /**
40 |      * Returns the queue id passed to the constructor.
41 |      */
42 |     public function getQueueId(): int
43 |     {
44 |         return $this->queueId;
45 |     }
46 |
47 |     /**
48 |      * Replaces the queue record held by this event.
49 |      */
50 |     public function setQueueRecord(array $queueRecord): void
51 |     {
52 |         $this->queueRecord = $queueRecord;
53 |     }
54 |
55 |     /**
56 |      * Returns the queue record, including any replacement made through setQueueRecord().
57 |      */
58 |     public function getQueueRecord(): array
59 |     {
60 |         return $this->queueRecord;
61 |     }
62 | }
63 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default_mode_exec.txt:
--------------------------------------------------------------------------------
1 | $ bin/typo3 crawler:buildQueue 6 default --depth 2 --mode exec
2 | https://crawler-devbox.ddev.site/content-examples/overview
3 | https://crawler-devbox.ddev.site/content-examples/text/rich-text
4 | https://crawler-devbox.ddev.site/content-examples/text/headers
5 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list
6 | https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser
7 | https://crawler-devbox.ddev.site/content-examples/text/text-and-icon
8 | https://crawler-devbox.ddev.site/content-examples/text/text-in-columns
9 | https://crawler-devbox.ddev.site/content-examples/text/list-group
10 | https://crawler-devbox.ddev.site/content-examples/text/panel
11 | ...
12 | Processing
13 |
14 | https://crawler-devbox.ddev.site/content-examples/overview () =>
15 |
16 | OK:
17 | User Groups:
18 |
19 | https://crawler-devbox.ddev.site/content-examples/text/rich-text () =>
20 |
21 | OK:
22 | User Groups:
23 |
24 | https://crawler-devbox.ddev.site/content-examples/text/headers () =>
25 |
26 | OK:
27 | User Groups:
28 |
29 | https://crawler-devbox.ddev.site/content-examples/text/bullet-list () =>
30 |
31 | OK:
32 | User Groups:
33 | ...
34 |
--------------------------------------------------------------------------------
/Classes/Hooks/ProcessCleanUpHook.php:
--------------------------------------------------------------------------------
1 |
9 | * (c) 2021- Tomas Norre Mikkelsen
10 | *
11 | * This file is part of the TYPO3 Crawler Extension.
12 | *
13 | * It is free software; you can redistribute it and/or modify it under
14 | * the terms of the GNU General Public License, either version 2
15 | * of the License, or any later version.
16 | *
17 | * For the full copyright and license information, please read the
18 | * LICENSE.txt file that was distributed with this source code.
19 | *
20 | * The TYPO3 project - inspiring people to share!
21 | */
22 |
23 | use AOE\Crawler\Process\Cleaner\OldProcessCleaner;
24 | use AOE\Crawler\Process\Cleaner\OrphanProcessCleaner;
25 |
26 | /**
27 |  * Crawler hook that runs the two injected process cleaners.
28 |  *
29 |  * @internal since v9.2.5
30 |  */
31 | class ProcessCleanUpHook implements CrawlerHookInterface
32 | {
33 |     public function __construct(
34 |         private readonly OrphanProcessCleaner $orphanCleaner,
35 |         private readonly OldProcessCleaner $oldCleaner
36 |     ) {
37 |     }
38 |
39 |     /**
40 |      * Calls clean() on the orphan-process cleaner first, then on the old-process cleaner.
41 |      */
42 |     #[\Override]
43 |     public function crawler_init(): void
44 |     {
45 |         foreach ([$this->orphanCleaner, $this->oldCleaner] as $cleaner) {
46 |             $cleaner->clean();
47 |         }
48 |     }
49 | }
50 |
--------------------------------------------------------------------------------
/Classes/Event/AfterQueueItemAddedEvent.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 |  * Event carrying a queue id and a replaceable field array.
24 |  *
25 |  * The queue id is fixed at construction; the field array can be swapped
26 |  * through setFieldArray().
27 |  *
28 |  * @internal since v12.0.0
29 |  */
30 | final class AfterQueueItemAddedEvent
31 | {
32 |     /**
33 |      * @param int|string $queueId queue id; typed natively so that an unsupported
34 |      *                            value fails here rather than later in getQueueId()
35 |      * @param array $fieldArray initial field array
36 |      */
37 |     public function __construct(
38 |         private readonly int|string $queueId,
39 |         private array $fieldArray
40 |     ) {
41 |     }
42 |
43 |     /**
44 |      * Returns the queue id passed to the constructor.
45 |      */
46 |     public function getQueueId(): int|string
47 |     {
48 |         return $this->queueId;
49 |     }
50 |
51 |     /**
52 |      * Returns the field array, including any replacement made through setFieldArray().
53 |      */
54 |     public function getFieldArray(): array
55 |     {
56 |         return $this->fieldArray;
57 |     }
58 |
59 |     /**
60 |      * Replaces the field array held by this event.
61 |      */
62 |     public function setFieldArray(array $fieldArray): void
63 |     {
64 |         $this->fieldArray = $fieldArray;
65 |     }
66 | }
67 |
--------------------------------------------------------------------------------
/Classes/CrawlStrategy/CrawlStrategyFactory.php:
--------------------------------------------------------------------------------
1 | configurationProvider = $configurationProvider ?? GeneralUtility::makeInstance(
20 | ExtensionConfigurationProvider::class
21 | );
22 | }
23 |
24 | /**
25 |  * Builds the crawl strategy selected by the "makeDirectRequests" extension setting.
26 |  */
27 | public function create(): CrawlStrategyInterface
28 | {
29 |     $settings = $this->configurationProvider->getExtensionConfiguration();
30 |     $strategyClass = ($settings['makeDirectRequests'] ?? false)
31 |         ? SubProcessExecutionStrategy::class
32 |         : GuzzleExecutionStrategy::class;
33 |
34 |     return GeneralUtility::makeInstance($strategyClass, $this->configurationProvider);
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/BuildingAndExecutingQueueRightAway(fromCli)/_output_buildQueue_6_default.txt:
--------------------------------------------------------------------------------
1 | 38 entries found for processing. (Use "mode" to decide action):
2 |
3 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/overview
4 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/rich-text
5 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/headers
6 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/bullet-list
7 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-with-teaser
8 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-and-icon
9 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/text-in-columns
10 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/list-group
11 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/panel
12 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/table
13 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/text/quote
14 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/media/audio
15 | [10-04-20 10:35] https://crawler-devbox.ddev.site/content-examples/media/text-and-images
16 | ...
17 | [10-04-20 10:36] https://crawler-devbox.ddev.site/content-examples/and-more/frames
18 |
--------------------------------------------------------------------------------
/Classes/CrawlStrategy/CallbackExecutionStrategy.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Controller\CrawlerController;
23 | use TYPO3\CMS\Core\Utility\GeneralUtility;
24 |
25 | /**
26 |  * Delegates crawling to a callback object; used for hooks (e.g. crawling external files).
27 |  *
28 |  * @internal since v12.0.0
29 |  */
30 | class CallbackExecutionStrategy
31 | {
32 |     /**
33 |      * Instantiates the callback class and forwards the call to its crawler_execute() method.
34 |      *
35 |      * In the future, the callback should implement an interface.
36 |      *
37 |      * @param class-string $callbackClassName class providing crawler_execute()
38 |      * @param array $parameters handed to the callback unchanged
39 |      * @return mixed whatever crawler_execute() returns
40 |      */
41 |     public function fetchByCallback(string $callbackClassName, array $parameters, CrawlerController $crawlerController)
42 |     {
43 |         return GeneralUtility::makeInstance($callbackClassName)
44 |             ->crawler_execute($parameters, $crawlerController);
45 |     }
46 | }
47 |
--------------------------------------------------------------------------------
/Documentation/Features/PollableProcessingInstructions/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _pollable-processing:
4 |
5 | ================================
6 | Pollable processing instructions
7 | ================================
8 |
9 | Some processing instructions are never executed on the "client side"
10 | (the TYPO3 frontend that is called by the crawler). This happens, for
11 | example, if you try to staticpub a page containing non-cacheable
12 | elements. The bad thing about this is that staticpub doesn't have
13 | any chance to tell that something went wrong and why. That's why we
14 | introduced the "pollable processing instructions" feature. You can
15 | define in the :file:`ext_localconf.php` file of your extension that this
16 | extension should be "pollable" by adding the following line:
17 |
18 | .. code-block:: php
19 | :caption: packages/my_extension/ext_localconf.php
20 |
21 | $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'][] = 'tx_staticpub';
22 |
23 | In this case the crawler expects the extension to actively report that
24 | everything was ok, assuming that something went wrong (and displaying
25 | this in the log) if no "success message" was found.
26 |
27 | In your extension, then simply write your "ok" status by calling this:
28 |
29 | .. code-block:: php
30 | :caption: packages/my_extension/ext_localconf.php
31 |
32 | $GLOBALS['TSFE']->applicationData['tx_crawler']['success']['tx_staticpub'] = true;
33 |
34 |
--------------------------------------------------------------------------------
/Classes/Process/WindowsProcessManager.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 |  * Process manager that shells out to the Windows "tasklist" and "taskkill" commands.
24 |  *
25 |  * @codeCoverageIgnore
26 |  * @internal since v12.0.10
27 |  */
28 | class WindowsProcessManager implements ProcessManagerInterface
29 | {
30 |     /**
31 |      * Reports whether tasklist shows a PHP process with exactly the given PID.
32 |      *
33 |      * The PID filter replaces a plain substring search, which also matched other
34 |      * PIDs or columns containing the same digits and then inspected only the first hit.
35 |      */
36 |     #[\Override]
37 |     public function processExists(int $pid): bool
38 |     {
39 |         $output = [];
40 |         exec('tasklist /FI "PID eq ' . $pid . '" /NH', $output);
41 |
42 |         // tasklist may print blank or informational lines, so every line is inspected.
43 |         foreach ($output as $line) {
44 |             if (stripos($line, 'php') !== false) {
45 |                 return true;
46 |             }
47 |         }
48 |
49 |         return false;
50 |     }
51 |
52 |     /**
53 |      * Asks taskkill to terminate the process with the given PID.
54 |      */
55 |     #[\Override]
56 |     public function killProcess(int $pid): void
57 |     {
58 |         exec('taskkill /PID ' . $pid);
59 |     }
60 |
61 |     /**
62 |      * Returns the tasklist output lines that contain "typo3 crawler:processQueue".
63 |      *
64 |      * @return string[]
65 |      */
66 |     #[\Override]
67 |     public function findDispatcherProcesses(): array
68 |     {
69 |         $output = [];
70 |         // Windows "find" only accepts a search string enclosed in double quotes.
71 |         exec('tasklist | find "typo3 crawler:processQueue"', $output);
72 |
73 |         return $output;
74 |     }
75 | }
76 |
--------------------------------------------------------------------------------
/Classes/CrawlStrategy/CrawlStrategyInterface.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Psr\Http\Message\UriInterface;
23 |
24 | /**
25 | * @internal since v12.0.0
26 | */
27 | interface CrawlStrategyInterface
28 | {
29 | /**
30 | * Fetch the given URL and return its textual response
31 | * @param UriInterface $url URL to fetch
32 | * @param string $crawlerId NOTE(review): purpose not visible in this interface; confirm against the implementations
33 | * @return array|false "false" on errors without explanation. The array may contain the following optional keys:
34 | * - errorlog: array of string error messages
35 | * - content: HTML content (string)
36 | * - running: bool
37 | * - parameters: array
38 | * - log: array of strings
39 | * - vars: array
40 | */
41 | public function fetchUrlContents(UriInterface $url, string $crawlerId);
42 | }
43 |
--------------------------------------------------------------------------------
/Classes/Utility/HookUtility.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Hooks\ProcessCleanUpHook;
23 |
24 | /**
25 |  * Registers the crawler hooks in $GLOBALS['TYPO3_CONF_VARS'].
26 |  *
27 |  * @codeCoverageIgnore
28 |  * @internal since v9.2.5
29 |  */
30 | class HookUtility
31 | {
32 |     /**
33 |      * Appends ProcessCleanUpHook to the "cli_hooks" and "refresh_hooks" lists of the
34 |      * given extension and registers the DataHandlerHook "clearPageCacheEval" callback.
35 |      *
36 |      * @param string $extKey extension key used below TYPO3_CONF_VARS/EXTCONF
37 |      */
38 |     public static function registerHooks($extKey): void
39 |     {
40 |         // The same clean-up hook serves both the cli hooks and the refresh hooks.
41 |         foreach (['cli_hooks', 'refresh_hooks'] as $hookList) {
42 |             $GLOBALS['TYPO3_CONF_VARS']['EXTCONF'][$extKey][$hookList][] = ProcessCleanUpHook::class;
43 |         }
44 |
45 |         $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['t3lib/class.t3lib_tcemain.php']['clearPageCacheEval'][] =
46 |             "AOE\Crawler\Hooks\DataHandlerHook->addFlushedPagesToCrawlerQueue";
47 |     }
48 | }
49 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/Extension.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Resources/Public/Icons/crawler_configuration.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Classes/Writer/FileWriter/CsvWriter/CrawlerCsvWriter.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Utility\CsvUtility;
23 |
24 | /**
25 |  * Renders a list of records as CSV text with a header row.
26 |  *
27 |  * @internal since v9.2.5
28 |  */
29 | final class CrawlerCsvWriter implements CsvWriterInterface
30 | {
31 |     private const CARRIAGE_RETURN = 13;
32 |     private const LINE_FEED = 10;
33 |
34 |     /**
35 |      * Converts the records to CSV: one header line built from the keys of the
36 |      * first record, followed by one line per record, joined with CRLF.
37 |      *
38 |      * @param array $records rows to export; each row is passed to CsvUtility::csvValues()
39 |      * @return string the CSV text, or an empty string when there are no records
40 |      */
41 |     #[\Override]
42 |     public function arrayToCsv(array $records): string
43 |     {
44 |         // Without a first record there are no header keys: current() would yield
45 |         // false and array_keys() would throw a TypeError.
46 |         if ($records === []) {
47 |             return '';
48 |         }
49 |
50 |         $csvLines = [$this->getRowHeaders($records)];
51 |         foreach ($records as $row) {
52 |             $csvLines[] = CsvUtility::csvValues($row);
53 |         }
54 |
55 |         return implode(chr(self::CARRIAGE_RETURN) . chr(self::LINE_FEED), $csvLines);
56 |     }
57 |
58 |     /**
59 |      * Builds the header line from the keys of the first entry in $lines.
60 |      */
61 |     private function getRowHeaders(array $lines): string
62 |     {
63 |         $firstRow = reset($lines);
64 |         $fieldNames = array_map(strval(...), array_keys($firstRow));
65 |
66 |         return CsvUtility::csvValues($fieldNames);
67 |     }
68 | }
69 |
--------------------------------------------------------------------------------
/Documentation/Features/AutomaticAddPagesToQueue/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _add-to-queue:
4 |
5 | ============================
6 | Automatic add pages to Queue
7 | ============================
8 |
9 | .. versionadded:: 9.1.0
10 |
11 | .. _add-to-queue-edit:
12 |
13 | Edit Pages
14 | ----------
15 |
16 | With this feature, you will automatically add pages to the crawler queue
17 | when you are editing content on the page, unless it's within a workspace, then
18 | it will not be added to the queue before it's published.
19 |
20 | This functionality gives you the advantage that you do not need to keep track
21 | of which pages you have edited; they will automatically be handled by the next crawler
22 | process task, see :ref:`executing-the-queue-label`. This ensures that
23 | your cache or e.g. Search Index is always up to date and the end-users will see
24 | the most current content as soon as possible.
25 |
26 | .. _add-to-queue-cache:
27 |
28 | Clear Page Single Cache
29 | -----------------------
30 |
31 | As the edit and clear page cache function is using the same dataHandler hooks,
32 | we have an additional feature for free. When you clear the page cache for a specific
33 | page then it will also be added automatically to the crawler queue. Again this will
34 | be processed during the next crawler process.
35 |
36 | .. figure:: /Images/backend_clear_cache.png
37 | :alt: Clearing the page cache
38 |
39 | Clearing the page cache
40 |
41 | .. figure:: /Images/backend_clear_cache_queue.png
42 | :alt: Page is added to the crawler queue
43 |
44 | Page is added to the crawler queue
45 |
--------------------------------------------------------------------------------
/Resources/Private/Language/da.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Side Id som crawleren vil bruge for at indlæse TSFE (påkrævet)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | Når en side crawles direkte fra TYPO3 Backend. fx. ved at bruge "læs" funktionaliteten i "Crawler Log" modulet, bruges den valgte siden til at initialisere frontend renderingen. Adgang til den valgte side <strong>MÅ IKKE</strong> være begrænset, i så fald vil crawlingen fejle.
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/Classes/Process/UnixProcessManager.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 |  * Process manager built on the /proc directory, posix_kill() and the "ps" command.
24 |  *
25 |  * @codeCoverageIgnore
26 |  * @internal since v12.0.10
27 |  */
28 | class UnixProcessManager implements ProcessManagerInterface
29 | {
30 |     /**
31 |      * Signal number handed to posix_kill().
32 |      */
33 |     private const KILL_SIGNAL = 9;
34 |
35 |     /**
36 |      * Reports whether an entry for the PID exists below /proc.
37 |      */
38 |     #[\Override]
39 |     public function processExists(int $pid): bool
40 |     {
41 |         $procEntry = '/proc/' . $pid;
42 |
43 |         return file_exists($procEntry);
44 |     }
45 |
46 |     /**
47 |      * Sends the kill signal to the process with the given PID.
48 |      */
49 |     #[\Override]
50 |     public function killProcess(int $pid): void
51 |     {
52 |         posix_kill($pid, self::KILL_SIGNAL);
53 |     }
54 |
55 |     /**
56 |      * Returns the "ps aux" output lines that grep matches for "typo3 crawler:processQueue".
57 |      *
58 |      * Raises an E_USER_WARNING and returns an empty array when "which ps" yields nothing.
59 |      *
60 |      * @return string[]
61 |      */
62 |     #[\Override]
63 |     public function findDispatcherProcesses(): array
64 |     {
65 |         if (!exec('which ps')) {
66 |             trigger_error(
67 |                 'Crawler is unable to locate the ps command to clean up orphaned crawler processes.',
68 |                 E_USER_WARNING
69 |             );
70 |
71 |             return [];
72 |         }
73 |
74 |         // ps command is defined
75 |         $psLines = [];
76 |         exec("ps aux | grep 'typo3 crawler:processQueue'", $psLines);
77 |
78 |         return $psLines;
79 |     }
80 | }
81 |
--------------------------------------------------------------------------------
/Resources/Private/Language/af.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ar.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ca.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/cs.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/el.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/fi.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/fr.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly fom the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/he.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/hu.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/it.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ja.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ko.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/nl.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/no.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/pl.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ro.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/ru.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/sr.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/tr.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/uk.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/vi.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/es.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/pt.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/sv.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Resources/Private/Language/zh.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Page Id the crawler will use for initializing the TSFE (required)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
14 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Classes/Configuration/ExtensionConfigurationProvider.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Psr\Log\LoggerAwareInterface;
23 | use Psr\Log\LoggerAwareTrait;
24 | use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationExtensionNotConfiguredException;
25 | use TYPO3\CMS\Core\Configuration\Exception\ExtensionConfigurationPathDoesNotExistException;
26 | use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
27 | use TYPO3\CMS\Core\Utility\GeneralUtility;
28 |
29 | /**
30 | * @internal since v9.2.5
31 | */
32 | class ExtensionConfigurationProvider implements LoggerAwareInterface
33 | {
34 |     use LoggerAwareTrait;
35 |
36 |     /**
37 |      * Fetch the complete configuration array of the "crawler" extension.
38 |      *
39 |      * When the extension is not configured or the requested path does not exist, the exception
40 |      * message is logged (if a logger has been set) and an empty array is returned instead.
41 |      */
42 |     public function getExtensionConfiguration(): array
43 |     {
44 |         $configuration = [];
45 |         try {
46 |             $configuration = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('crawler');
47 |         } catch (ExtensionConfigurationExtensionNotConfiguredException|ExtensionConfigurationPathDoesNotExistException $exception) {
48 |             $this->logger?->error($exception->getMessage());
49 |         }
50 |
51 |         return $configuration;
52 |     }
53 | }
54 |
--------------------------------------------------------------------------------
/Classes/Crawler.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Core\Environment;
23 | use TYPO3\CMS\Core\SingletonInterface;
24 | use TYPO3\CMS\Core\Utility\GeneralUtility;
25 |
26 | /**
27 | * @internal since v9.2.5
28 | */
29 | final class Crawler implements SingletonInterface
30 | {
31 |     private readonly string $processFilename;
32 |
33 |     /**
34 |      * Resolve the marker file path, remove an existing marker file and ensure its directory exists.
35 |      *
36 |      * @param string|null $processFilename marker file path; null or an empty value selects
37 |      *                                     "<var path>/lock/tx_crawler.proc"
38 |      */
39 |     public function __construct(?string $processFilename = null)
40 |     {
41 |         if (!$processFilename) {
42 |             $processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';
43 |         }
44 |         $this->processFilename = $processFilename;
45 |
46 |         // Constructing the instance always starts in the enabled state.
47 |         $this->setDisabled(false);
48 |         GeneralUtility::mkdir_deep(pathinfo($this->processFilename)['dirname']);
49 |     }
50 |
51 |     /**
52 |      * Create the marker file to disable the crawler, or delete it (if present) to enable it.
53 |      */
54 |     public function setDisabled(bool $disabled = true): void
55 |     {
56 |         if ($disabled) {
57 |             GeneralUtility::writeFile($this->processFilename, '');
58 |             return;
59 |         }
60 |
61 |         if (is_file($this->processFilename)) {
62 |             unlink($this->processFilename);
63 |         }
64 |     }
65 |
66 |     /**
67 |      * The crawler counts as disabled while the marker file exists.
68 |      */
69 |     public function isDisabled(): bool
70 |     {
71 |         return is_file($this->processFilename);
72 |     }
73 | }
74 |
--------------------------------------------------------------------------------
/Resources/Private/Language/de.locallang_csh_tx_crawler_configuration.xlf:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Page Id the crawler will use for initializing the TSFE (required)
8 | Seiten-ID, die der Crawler zur Initialisierung des TSFE verwendet (erforderlich)
9 |
10 |
11 | When crawling a page directly from the TYPO3 backend, e.g. by using the "read" functionality of the "Crawler Log" module, the selected page id will be used to initialize the frontend rendering.
12 | Access to the selected page <strong>MUST NOT</strong> be restricted; crawling will fail otherwise.
13 | Beim Crawlen einer Seite direkt im TYPO3-Backend, z.B. unter Verwendung der "read"-Funktionalität des Moduls "Crawler-Protokoll" wird die ausgewählte Seiten-ID zur Initialisierung der Frontend-Darstellung verwendet.
14 | Zugriff auf die ausgewählte Seite <strong>DARF NICHT</strong> eingeschränkt sein; das Crawling wird sonst fehlschlagen.
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Classes/Controller/Backend/Helper/UrlBuilder.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Psr\Http\Message\UriInterface;
23 | use TYPO3\CMS\Backend\Routing\Exception\RouteNotFoundException;
24 | use TYPO3\CMS\Backend\Routing\UriBuilder;
25 | use TYPO3\CMS\Core\Utility\GeneralUtility;
26 |
27 | /**
28 | * @internal since v9.2.5
29 | */
30 | class UrlBuilder
31 | {
32 |     /**
33 |      * Build the backend URL for a module route, carrying over the current request's "id".
34 |      *
35 |      * The "id" is read from the parsed body of $GLOBALS['TYPO3_REQUEST'] first and from its
36 |      * query parameters second. A truthy value overrides any "id" already present in
37 |      * $uriParameters; a missing or falsy value leaves $uriParameters untouched.
38 |      *
39 |      * @param array $uriParameters optional parameters to add to the URL
40 |      * @param string $module route name handed to UriBuilder::buildUriFromRoute()
41 |      *
42 |      * @throws RouteNotFoundException
43 |      */
44 |     public static function getBackendModuleUrl(
45 |         array $uriParameters = [],
46 |         string $module = 'web_site_crawler'
47 |     ): UriInterface {
48 |         $pageId = $GLOBALS['TYPO3_REQUEST']->getParsedBody()['id'] ?? null;
49 |         // Only consult the query parameters when the parsed body did not provide an id.
50 |         $pageId ??= $GLOBALS['TYPO3_REQUEST']->getQueryParams()['id'] ?? null;
51 |         if ($pageId) {
52 |             $uriParameters['id'] = $pageId;
53 |         }
54 |
55 |         return GeneralUtility::makeInstance(UriBuilder::class)->buildUriFromRoute($module, $uriParameters);
56 |     }
57 | }
58 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ### Contributing
2 |
3 | When you have a PR, please run the following checks first.
4 |
5 | * `composer test:all`
6 | * Requires a mysql-database, you can boot one with `docker-compose` from the `.Docker`-directory
7 | * `composer cs-fix`
8 | * Ensures that coding standards are respected
9 | * `composer analyse`
10 | * Will run PHPStan and do a static code analysis. This is not fully set up in the build yet, but please try to avoid adding new violations. ;)
11 |
12 | ### Writing documentation
13 |
14 | You can render the documentation in this extension with the command
15 |
16 | ```
17 | make docs
18 | ```
19 |
20 | #### Devbox
21 |
22 | If you don't have a setup already, where you can do development, bugfixing etc. for the crawler, don't worry.
23 |
24 | We have included a [ddev](https://www.ddev.com) devbox to help the development.
25 |
26 | ##### Prerequisites
27 |
28 | * [DDEV](https://www.ddev.com)
29 | * Docker
30 |
31 | ##### How to use the devbox?
32 |
33 | ```shell script
34 | $ git clone git@github.com:tomasnorre/crawler.git
35 | $ cd crawler/.devbox
36 | $ ddev start
37 | ```
38 |
39 | Username/password: `admin`/`password`
40 |
41 | And start working.
42 |
43 | **INFO**
44 | xdebug is disabled by default, to speed up the devbox when xdebug isn't needed.
45 |
46 | This can be activated with `ddev xdebug on`.
47 |
48 | #### Running tests without local development environment
49 | If you don't have `php` and/or `composer` installed on your host machine,
50 | you can run the tests from within the `ddev` docker container.
51 |
52 | To do that, go into the `.devbox` folder and run `ddev ssh`.
53 | From there you need to switch folder into `/public/typo3conf/ext/crawler`
54 | and run `composer` commands from there (see above).
55 |
--------------------------------------------------------------------------------
/Documentation/Scheduler/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _scheduler:
4 |
5 | =========
6 | Scheduler
7 | =========
8 |
9 |
10 | .. toctree::
11 | :maxdepth: 5
12 | :titlesonly:
13 | :glob:
14 |
15 |
16 | As seen in :ref:`executing-the-queue-label` you can execute the queue in
17 | multiple ways, but it's no fun doing that manually all the time.
18 |
19 | With the Crawler you have the possibility to add Scheduler Tasks to be executed
20 | at a given time. The Crawler commands are implemented with the Symfony Console,
21 | and therefore they can be configured with the Core supported
22 | `Execute console commands (scheduler)` task.
23 |
24 | To set up crawler scheduler tasks:
25 |
26 | 1. Add a new Scheduler Task
27 | 2. Select the class :guilabel:`Execute console commands`
28 | 3. Select :guilabel:`Frequency` for the execution
29 | 4. Go to section :guilabel:`Schedulable Command. Save and reopen to define
30 | command arguments` at the bottom.
31 | 5. Select e.g. :guilabel:`crawler:buildQueue` (press save)
32 | 6. Select the options you want to execute the queue with, it's important to
33 | check the checkboxes and not only fill in the values.
34 |
35 | Now you can save and close, and your scheduler tasks will be running as
36 | configured.
37 |
38 | The configured task will look like this:
39 |
40 | .. figure:: /Images/backend_scheduler_record.png
41 | :alt: Task configuration for building the queue
42 |
43 | Task configuration for building the queue
44 |
45 | After saving and closing, you can see which command is executed. It takes the
46 | same parameters that you can use when running from the CLI,
47 | see :ref:`executing-the-queue-cli-label`.
48 |
49 | .. figure:: /Images/backend_scheduler_overview.png
50 | :alt: Task in the scheduled tasks overview
51 |
52 | Task in the scheduled tasks overview
53 |
--------------------------------------------------------------------------------
/Classes/Utility/PhpBinaryUtility.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23 | use AOE\Crawler\Exception\CommandNotFoundException;
24 | use AOE\Crawler\Exception\ExtensionSettingsException;
25 | use TYPO3\CMS\Core\Utility\CommandUtility;
26 | use TYPO3\CMS\Core\Utility\GeneralUtility;
27 |
28 | /**
29 | * @internal since v9.2.5
30 | */
31 | class PhpBinaryUtility
32 | {
33 |     /**
34 |      * Resolve the PHP binary used by the crawler from the extension configuration.
35 |      *
36 |      * A non-empty "phpPath" setting is returned as-is. Otherwise the "phpBinary" setting is
37 |      * looked up through CommandUtility::getCommand().
38 |      *
39 |      * @throws ExtensionSettingsException when the extension configuration is empty
40 |      * @throws CommandNotFoundException when "phpBinary" is missing, empty or cannot be found
41 |      */
42 |     public static function getPhpBinary(): string
43 |     {
44 |         $extensionSettings = GeneralUtility::makeInstance(
45 |             ExtensionConfigurationProvider::class
46 |         )->getExtensionConfiguration();
47 |
48 |         if (empty($extensionSettings)) {
49 |             throw new ExtensionSettingsException('ExtensionSettings are empty', 1_587_066_853);
50 |         }
51 |
52 |         if (!empty($extensionSettings['phpPath'])) {
53 |             return $extensionSettings['phpPath'];
54 |         }
55 |
56 |         // A partially filled configuration may lack "phpBinary"; treat that like a binary that
57 |         // cannot be found instead of passing null on to CommandUtility::getCommand().
58 |         $phpBinary = (string) ($extensionSettings['phpBinary'] ?? '');
59 |         $phpPath = $phpBinary === '' ? false : CommandUtility::getCommand($phpBinary);
60 |         if ($phpPath === false) {
61 |             throw new CommandNotFoundException(
62 |                 'The phpBinary: "' . $phpBinary . '" could not be found!',
63 |                 1_587_068_215
64 |             );
65 |         }
66 |
67 |         return $phpPath;
68 |     }
69 | }
70 |
--------------------------------------------------------------------------------
/composer-dependency-analyser.php:
--------------------------------------------------------------------------------
1 | addPathToScan(__DIR__ . '/Classes', isDev: false)
11 | //->addPathToExclude(__DIR__ . '/samples')
12 | ->disableComposerAutoloadPathScan() // disable automatic scan of autoload & autoload-dev paths from composer.json
13 | ->setFileExtensions(['php']); // applies only to directory scanning, not directly listed files
14 |
15 | //// Ignoring errors
16 | //->ignoreErrors([ErrorType::DEV_DEPENDENCY_IN_PROD])
17 | //->ignoreErrorsOnPath(__DIR__ . '/cache/DIC.php', [ErrorType::SHADOW_DEPENDENCY])
18 | //->ignoreErrorsOnPackage('symfony/polyfill-php73', [ErrorType::UNUSED_DEPENDENCY])
19 | //->ignoreErrorsOnPackageAndPath('symfony/console', __DIR__ . '/src/OptionalCommand.php', [ErrorType::SHADOW_DEPENDENCY])
20 | //->ignoreErrorsOnExtension('ext-intl', [ErrorType::SHADOW_DEPENDENCY])
21 | //->ignoreErrorsOnExtensionAndPath('ext-sqlite3', __DIR__ . '/tests', [ErrorType::SHADOW_DEPENDENCY])
22 |
23 | //// Ignoring unknown symbols
24 | //->ignoreUnknownClasses(['Memcached'])
25 | //->ignoreUnknownClassesRegex('~^DDTrace~')
26 | //->ignoreUnknownFunctions(['opcache_invalidate'])
27 | //->ignoreUnknownFunctionsRegex('~^opcache_~')
28 |
29 | //// Adjust analysis
30 | //->enableAnalysisOfUnusedDevDependencies() // dev packages are often used only in CI, so this is not enabled by default
31 | //->disableReportingUnmatchedIgnores() // do not report ignores that never matched any error
32 | //->disableExtensionsAnalysis() // do not analyse ext-* dependencies
33 |
34 | //// Use symbols from yaml/xml/neon files
35 | // - designed for DIC config files (see below)
36 | // - beware that those are not validated and do not even trigger unknown class error
37 | //->addForceUsedSymbols($classesExtractedFromNeonJsonYamlXmlEtc);
38 |
--------------------------------------------------------------------------------
/Classes/Process/Cleaner/OldProcessCleaner.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Domain\Repository\ProcessRepository;
23 | use AOE\Crawler\Domain\Repository\QueueRepository;
24 | use AOE\Crawler\Process\ProcessManagerInterface;
25 |
26 | /**
27 | * @internal since v12.0.10
28 | */
class OldProcessCleaner
{
    public function __construct(
        private readonly ProcessRepository $processRepository,
        private readonly QueueRepository $queueRepository,
        private readonly ProcessManagerInterface $processManager
    ) {
    }

    /**
     * Cleans up the rows returned by
     * ProcessRepository::getActiveProcessesOlderThanOneHour().
     *
     * For every row with a system process id greater than 1 the system
     * process is killed (if the process manager reports that it exists),
     * the process record is removed and the queue entries are detached
     * from that process id.
     *
     * @throws \UnexpectedValueException when the repository does not return an array
     */
    public function clean(): void
    {
        $staleProcesses = $this->processRepository->getActiveProcessesOlderThanOneHour();

        if (is_array($staleProcesses) === false) {
            throw new \UnexpectedValueException('Expected array, got ' . gettype($staleProcesses));
        }

        foreach ($staleProcesses as $row) {
            $pid = (int) $row['system_process_id'];
            $crawlerProcessId = $row['process_id'];

            // Rows with a PID of 1 or lower are left untouched.
            // NOTE(review): presumably these carry no usable PID - confirm.
            if ($pid > 1) {
                if ($this->processManager->processExists($pid)) {
                    $this->processManager->killProcess($pid);
                }

                $this->processRepository->removeByProcessId($crawlerProcessId);
                $this->queueRepository->unsetQueueProcessId($crawlerProcessId);
            }
        }
    }
}
63 |
--------------------------------------------------------------------------------
/Classes/Service/BackendModuleScriptUrlService.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Psr\Http\Message\ServerRequestInterface;
23 | use TYPO3\CMS\Backend\Routing\UriBuilder;
24 | use TYPO3\CMS\Core\Utility\GeneralUtility;
25 |
class BackendModuleScriptUrlService
{
    /**
     * Builds the URL of the current backend module route for a form element.
     *
     * The result is the route URI (with the page uid as "id"), followed by
     * $queryString, the remaining query parameters and finally
     * "&<elementName>=${value}". The "${value}" part is emitted literally
     * (single-quoted PHP string).
     * NOTE(review): presumably it is substituted client-side - confirm.
     *
     * @param ServerRequestInterface $request request carrying the "route" attribute
     * @param string $elementName name of the parameter controlled by the element
     * @param int $pageUid page uid passed as "id" to the URI builder
     * @param array $queryParameters current query parameters; $elementName is dropped from them
     * @param string $queryString raw query string fragment appended as-is
     */
    public function buildScriptUrl(
        ServerRequestInterface $request,
        string $elementName,
        int $pageUid,
        array $queryParameters,
        string $queryString = ''
    ): string {
        $mainParams = [
            'id' => $pageUid,
        ];
        $uriBuilder = GeneralUtility::makeInstance(UriBuilder::class);
        $route = $request->getAttribute('route');
        $scriptUrl = (string) $uriBuilder->buildUriFromRoute($route->getOption('_identifier'), $mainParams);

        return $scriptUrl
            . $queryString
            . $this->getAdditionalQueryParams($elementName, $queryParameters)
            . '&' . $elementName . '=${value}';
    }

    /**
     * Build query string with affected checkbox/dropdown value removed.
     *
     * Keys and values are URL-encoded (RFC 3986) and nested arrays are
     * expanded to bracket notation, so values containing "&", "=", spaces
     * or sub-arrays no longer corrupt the generated URL.
     *
     * @return string empty string, or the encoded parameters prefixed with "&"
     */
    private function getAdditionalQueryParams(string $keyToBeRemoved, array $queryParameters): string
    {
        unset($queryParameters[$keyToBeRemoved]);

        // The separator is passed explicitly so the result does not depend on
        // the arg_separator.output ini setting.
        $encoded = http_build_query($queryParameters, '', '&', PHP_QUERY_RFC3986);

        return $encoded === '' ? '' : '&' . $encoded;
    }
}
61 |
--------------------------------------------------------------------------------
/Classes/Utility/TcaUtility.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Package\PackageManager;
23 | use TYPO3\CMS\Core\Utility\GeneralUtility;
24 |
25 | /**
26 | * @internal since v9.2.5
27 | */
class TcaUtility
{
    /**
     * Appends one select item per registered crawler processing instruction.
     * This function is called as an itemsProcFunc in
     * tx_crawler_configuration.processing_instruction_filter
     *
     * The instructions are read from
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'];
     * when that entry is not an array, $configuration is returned untouched.
     *
     * @return array the configuration with the additional entries in 'items'
     */
    public function getProcessingInstructions(array $configuration)
    {
        $instructions = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] ?? null;
        if (!is_array($instructions)) {
            return $configuration;
        }

        foreach ($instructions as $extensionKey => $instruction) {
            $label = $instruction['value'] . ' [' . $instruction['key'] . ']';
            $configuration['items'][] = [
                'label' => $label,
                'value' => $instruction['key'],
                'icon' => $this->getExtensionIcon($extensionKey),
            ];
        }

        return $configuration;
    }

    /**
     * Returns package path + package icon of the given extension, or an
     * empty string when the package reports no icon.
     */
    private function getExtensionIcon(string $extensionKey): string
    {
        $package = GeneralUtility::makeInstance(PackageManager::class)->getPackage($extensionKey);

        return $package->getPackageIcon()
            ? $package->getPackagePath() . $package->getPackageIcon()
            : '';
    }
}
61 |
--------------------------------------------------------------------------------
/Documentation/Configuration/Examples/News/_NewsDetailEventListener.php:
--------------------------------------------------------------------------------
1 | getAssignedValues();
14 | $newsItem = $assignedValues['newsItem'];
15 | $demand = $assignedValues['demand'];
16 | $settings = $assignedValues['settings'];
17 |
18 | if ($newsItem !== null) {
19 | $demandedCategories = $demand->getCategories();
20 | $itemCategories = $newsItem->getCategories()->toArray();
21 | $itemCategoryIds = \array_map(function ($category) {
22 | return (string) $category->getUid();
23 | }, $itemCategories);
24 |
25 | if (count($demandedCategories) > 0 && !$this::itemMatchesCategoryDemand(
26 | $settings['categoryConjunction'],
27 | $itemCategoryIds,
28 | $demandedCategories
29 | )) {
30 | $assignedValues['newsItem'] = null;
31 | $event->setAssignedValues($assignedValues);
32 | }
33 | }
34 | }
35 |
/**
 * Decides whether an item's categories satisfy the demanded categories
 * under the given conjunction.
 *
 * - AND:    every demanded category is among the item's categories
 * - OR:     at least one demanded category matches
 * - NOTAND: fewer than all demanded categories match
 * - NOTOR:  none of the demanded categories match
 * Any other conjunction is treated as a match.
 */
protected static function itemMatchesCategoryDemand(
    string $categoryConjunction,
    array $itemCategoryIds,
    array $demandedCategories
): bool {
    $demandedCount = \count($demandedCategories);
    $matchCount = \count(\array_intersect($itemCategoryIds, $demandedCategories));

    return match ($categoryConjunction) {
        'AND' => $matchCount === $demandedCount,
        'OR' => $matchCount > 0,
        'NOTAND' => $matchCount < $demandedCount,
        'NOTOR' => $matchCount === 0,
        default => true,
    };
}
57 | }
58 |
--------------------------------------------------------------------------------
/Classes/Value/QueueRow.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal
24 | */
class QueueRow
{
    /** Page title as HTML. */
    public string $pageTitleHTML = '';

    /** Message attached to this row. */
    public string $message = '';

    public string $configurationKey = '';

    public string $parameterConfig = '';

    public string $valuesExpanded = '';

    public string $urls = '';

    public array $options = [];

    public string $parameters = '';

    /**
     * @param string $pageTitle plain page title of the row (public property)
     */
    public function __construct(
        public string $pageTitle = ''
    ) {
    }

    // Plain setters: each one assigns the property of the same name.

    public function setPageTitleHTML(string $pageTitleHTML): void
    {
        $this->pageTitleHTML = $pageTitleHTML;
    }

    public function setMessage(string $message): void
    {
        $this->message = $message;
    }

    public function setConfigurationKey(string $configurationKey): void
    {
        $this->configurationKey = $configurationKey;
    }

    public function setParameterConfig(string $parameterConfig): void
    {
        $this->parameterConfig = $parameterConfig;
    }

    public function setValuesExpanded(string $valuesExpanded): void
    {
        $this->valuesExpanded = $valuesExpanded;
    }

    public function setUrls(string $urls): void
    {
        $this->urls = $urls;
    }

    public function setOptions(array $options): void
    {
        $this->options = $options;
    }

    public function setParameters(string $parameters): void
    {
        $this->parameters = $parameters;
    }
}
81 |
--------------------------------------------------------------------------------
/Documentation/Features/PriorityCrawling/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _priority-crawling:
4 |
5 | =================
6 | Priority Crawling
7 | =================
8 |
9 | .. versionadded:: 9.1.0
10 |
11 | Some websites have quite a large number of pages. Some pages are logically more
12 | important than others e.g. the start-, support-, product-, you name it-pages.
13 | These important pages are also the pages where we want to have the best caching
14 | and performance, as they will most likely be the pages with the most changes and
15 | the most traffic.
16 |
17 | With TYPO3 10 LTS the `sysext/seo` introduced among other things, the
18 | `sitemap_priority`, which is used to generate an SEO optimised sitemap.xml
19 | where page priorities are listed as well. Their priorities will most likely be higher the
20 | more important the page is for you and the end-user.
21 |
22 | This logic is something that we can benefit from in the Crawler as well. A
23 | Website with let us say 10.000 pages, will have different importance depending on
24 | the page you are at. Therefore we have changed the functionality of the crawler,
25 | to take the value of this field, range from 0.0 to 1.0, into consideration when
26 | processing the crawler queue. This means that if you have a page with high priority
27 | for your sitemap, it will also be crawled first when a new crawler process is
28 | added.
29 |
30 | This ensures that we will always crawl the pages that have the highest importance to
31 | you and your end-user based on your sitemap priority. We choose to
32 | reuse this field, to not have editors doing work that is more or less similar twice.
33 |
34 | If you don't want to use this functionality, it's ok. You can just ignore the
35 | options that `sysext/seo` gives you. All pages will then get the default priority
36 | of 0.5, which does not influence the processing order, as every page will have the
37 | same priority.
38 |
39 | The existing :guilabel:`SEO` tab will be used to set priorities when editing
40 | pages.
41 |
42 | .. image:: /Images/backend_crawler_seo_v10.png
43 |
44 | .. figure:: /Images/backend_crawler_seo_priority_v10.png
45 | :alt: The SEO tab will contain the sitemap_priority field
46 |
47 | The SEO tab will contain the sitemap_priority field
48 |
--------------------------------------------------------------------------------
/Classes/Hooks/DataHandlerHook.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Domain\Repository\QueueRepository;
23 | use AOE\Crawler\Service\QueueService;
24 | use TYPO3\CMS\Core\DataHandling\DataHandler;
25 | use TYPO3\CMS\Core\Domain\Repository\PageRepository;
26 | use TYPO3\CMS\Core\Utility\GeneralUtility;
27 |
28 | /**
29 | * @internal since v9.2.5
30 | */
class DataHandlerHook
{
    /**
     * Adds every flushed page to the crawler queue.
     *
     * Each entry of $parameters['pageIdArray'] is cast to int. Ids below 1,
     * ids for which the page repository returns no page, and pages already
     * in the queue are skipped. A missing or non-iterable 'pageIdArray' is
     * treated as "nothing to do" instead of raising PHP warnings.
     *
     * @param array $parameters hook parameters; only 'pageIdArray' is read
     * @param DataHandler $dataHandler unused by this method
     *
     * @noRector \Rector\DeadCode\Rector\ClassMethod\RemoveUnusedParameterRector
     */
    public function addFlushedPagesToCrawlerQueue(array $parameters, DataHandler $dataHandler): void
    {
        $pageIdsToBeFlushedFromCache = $parameters['pageIdArray'] ?? [];
        if (!is_iterable($pageIdsToBeFlushedFromCache)) {
            return;
        }

        foreach ($pageIdsToBeFlushedFromCache as $pageId) {
            $pageId = (int) $pageId;
            if ($pageId < 1 || empty($this->getPageRepository()->getPage($pageId))) {
                continue;
            }
            if ($this->getQueueRepository()->isPageInQueue($pageId)) {
                continue;
            }
            $this->getQueueService()->addPageToQueue($pageId);
        }
    }

    public function getQueueRepository(): QueueRepository
    {
        return GeneralUtility::makeInstance(QueueRepository::class);
    }

    public function getQueueService(): QueueService
    {
        return GeneralUtility::makeInstance(QueueService::class);
    }

    public function getPageRepository(): PageRepository
    {
        return GeneralUtility::makeInstance(PageRepository::class);
    }
}
69 |
--------------------------------------------------------------------------------
/Classes/Controller/Ajax/ProcessStatusController.php:
--------------------------------------------------------------------------------
1 |
7 | *
8 | * This file is part of the TYPO3 Crawler Extension.
9 | *
10 | * It is free software; you can redistribute it and/or modify it under
11 | * the terms of the GNU General Public License, either version 2
12 | * of the License, or any later version.
13 | *
14 | * For the full copyright and license information, please read the
15 | * LICENSE.txt file that was distributed with this source code.
16 | *
17 | * The TYPO3 project - inspiring people to share!
18 | */
19 |
20 | namespace AOE\Crawler\Controller\Ajax;
21 |
22 | use AOE\Crawler\Domain\Repository\ProcessRepository;
23 | use Psr\Http\Message\ResponseInterface;
24 | use Psr\Http\Message\ServerRequestInterface;
25 | use TYPO3\CMS\Core\Http\Response;
26 |
27 | /**
28 | * @internal since v12.0.10
29 | */
class ProcessStatusController
{
    public function __construct(
        private readonly ProcessRepository $processRepository,
    ) {
    }

    /**
     * Returns the status of one crawler process as JSON.
     *
     * Expects a JSON request body of the form {"id": "<process id>"}.
     *
     * - 400 when the body carries no usable (string or integer) id
     * - 404 when the repository finds no process for the id
     * - otherwise a JSON object with the keys status, processedItems,
     *   runtime and processId
     *
     * @throws \RuntimeException when the response payload cannot be JSON-encoded
     */
    public function getProcessStatus(ServerRequestInterface $request): ResponseInterface
    {
        // Casting the stream to string reads it from the beginning, whereas
        // getContents() only returns what is left after the current position.
        $data = json_decode((string) $request->getBody(), true);
        $id = is_array($data) ? ($data['id'] ?? null) : null;

        $response = new Response();

        // Arrays, booleans etc. are not valid ids and must not reach the
        // string concatenation or the repository below.
        if (!is_string($id) && !is_int($id)) {
            return $response->withStatus(400, 'No process ID provided');
        }
        $id = (string) $id;

        $process = $this->processRepository->findByProcessId($id);
        if ($process === null) {
            // Control characters are stripped because the reason phrase
            // becomes part of the HTTP status line.
            $printableId = (string) preg_replace('/[\x00-\x1F\x7F]/', '', $id);
            return $response->withStatus(404, 'Process with ID: ' . $printableId . ' not found');
        }

        $content = json_encode(
            [
                'status' => $process->getProgress(),
                'processedItems' => $process->getAmountOfItemsProcessed(),
                'runtime' => $process->getRuntime(),
                'processId' => $process->getProcessId(),
            ]
        );
        if ($content === false) {
            throw new \RuntimeException('Failed to encode JSON response', 1760971184);
        }
        $response->getBody()->write($content);

        return $response->withHeader('Content-Type', 'application/json; charset=utf-8');
    }
}
69 |
--------------------------------------------------------------------------------
/Documentation/Introduction/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _introduction:
4 |
5 | ============
6 | Introduction
7 | ============
8 |
9 | .. _introduction-what:
10 |
11 | What does it do?
12 | ================
13 |
14 | The TYPO3 Crawler is an extension which provides possibilities, from both
15 | the TYPO3 backend and the CLI, that help you with your cache and e.g. your
16 | search index.
17 |
18 | The Crawler implements several PSR-14 events, that you can use to "hook" into
19 | if you have certain requirements for your site at the given time.
20 |
21 | See more :ref:`psr14-modify-skip-page-event`.
22 |
23 | It features an API that other extensions can plug into. Example of this
24 | is "indexed\_search" which uses crawler to index content defined by
25 | its Indexing Configurations. Other extensions supporting it are
26 | "staticpub" (publishing to static pages) or "cachemgm" (allows
27 | recaching of pages).
28 |
29 | The URL requests are specially designed to request TYPO3 frontends
30 | with special processing instructions. Each request sends a TYPO3-specific
31 | header in the GET request which identifies a special action.
32 | For instance the action requested could be to publish the URL to a
33 | static file or it could be to index its content - or re-cache the
34 | page. These processing instructions are also defined by third-party
35 | extensions (and indexed search is one of them). In this way a
36 | processing instruction can instruct the frontend to perform an action
37 | (like indexing, publishing etc.) which cannot be done with a request
38 | from outside.
39 |
40 | .. _introduction-screenshots:
41 |
42 | Screenshots
43 | ===========
44 |
45 | The extension provides a backend module which displays the queue and log and
46 | allows execution and status check of the "cronscript" from the backend for
47 | testing purposes.
48 |
49 | .. figure:: /Images/backend_processlist.png
50 |
51 | CLI status display
52 |
53 | CLI = Command Line Interface = shell script = cron script
54 |
55 | .. figure:: /Images/backend_crawlerlog.png
56 |
57 | Crawler queue (before processing) / log (after processing)
58 |
59 | .. figure:: /Images/backend_pendingurls.png
60 |
61 | Interface for submitting a batch of URLs to be crawled
62 |
63 | The parameter combinations are programmable through Page TSconfig or
64 | configuration records.
65 |
--------------------------------------------------------------------------------
/Documentation/Configuration/Examples/News/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _example-configuration-news:
4 |
5 | ========
6 | EXT:news
7 | ========
8 |
9 | The news extension is one of the most used extensions in TYPO3 CMS. This
10 | configuration assumes a page tree looking similar to this:
11 |
12 | .. figure:: /Images/ext_news_pagetree.png
13 | :alt: Example Pagetree of EXT:news setup
14 |
15 | Example Pagetree of EXT:news setup
16 |
17 | If you want to have a Crawler Configuration that matches this, you can add
18 | the following to the :guilabel:`PageTS` for PageId `56`.
19 |
20 | .. literalinclude:: _page.tsconfig
21 | :caption: packages/my_extension/Configuration/Sets/MySet/page.tsconfig
22 |
23 | Now you can add the News detail-view pages to the crawler queue and have them in
24 | the cache and the `indexed_search` index if you are using that.
25 |
26 | .. _example-configuration-news-category:
27 |
28 | Respecting Categories in News
29 | =============================
30 |
31 | On some installations news is configured in such a way, that news of category A
32 | have their detail view on one page and news of category B have their detail view on
33 | another page. In this case it would still be possible to view news of category A on
34 | the detail page for category B (example.com/detail-page-for-category-B/news-of-category-A).
35 | That means that each news article would be crawled twice - once on the detail page
36 | for category A and once on the detail page for category B. It is possible to use a
37 | PSR-14 event with news to prevent this.
38 |
39 | On both detail pages include this typoscript setup:
40 |
41 | .. literalinclude:: _setup.typoscript
42 | :caption: packages/my_extension/Configuration/Sets/MySet/setup.typoscript
43 |
44 | and register an event listener in your site package.
45 |
46 | .. literalinclude:: _services.yaml
47 | :caption: packages/my_extension/Configuration/Services.yaml
48 |
49 | .. literalinclude:: _NewsDetailEventListener.php
50 | :caption: packages/my_extension/Classes/EventListeners/NewsDetailEventListener.php
51 |
52 | .. warning::
53 |
54 | Note that this does more than just prevent articles from being indexed twice. It
55 | actually prevents articles from being displayed on a page that is supposed to show
56 | only articles of a certain category!
57 |
--------------------------------------------------------------------------------
/Resources/Public/JavaScript/ProcessStatus.js:
--------------------------------------------------------------------------------
/**
 * Polls the crawler process status endpoint every three seconds and updates
 * the progress bars (elements with class "crawlerprocessprogress-bar") and
 * the "processedItems" / "runtime" cells of their table rows.
 */
(function () {
  const ajaxKey = 'crawler_process_status';
  const pollIntervalMs = 3000;
  // Guard against a missing TYPO3 global so the script does not throw on load.
  const ajaxUrl = typeof TYPO3 !== 'undefined' ? TYPO3.settings?.ajaxUrls?.[ajaxKey] : undefined;
  // True while a polling round is still waiting for responses.
  let pollInFlight = false;

  /**
   * Requests the status of one process and applies it to the page.
   * Failures are logged and do not interrupt the polling.
   *
   * @param {string} id element id of the progress bar, sent as the process id
   */
  async function fetchStatus(id) {
    if (!ajaxUrl) {
      console.error('Missing TYPO3 AJAX URL for crawler_process_status');
      return;
    }
    try {
      const resp = await fetch(ajaxUrl, {
        method: 'POST',
        credentials: 'same-origin',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({id})
      });
      if (!resp.ok) {
        throw new Error(`HTTP error ${resp.status}`);
      }
      const data = await resp.json();
      updateProgress(id, data);
    } catch (err) {
      console.error('Error fetching status', err);
    }
  }

  /**
   * Writes the received status into the progress bar and its table row.
   * Once the status reaches 100 the bar is excluded from further polling
   * and the page's "Refresh" link is clicked, if present.
   *
   * @param {string} id element id of the progress bar
   * @param {{status: *, processedItems: *, runtime: *}} data decoded response
   */
  function updateProgress(id, data) {
    const bar = document.getElementById(id);
    if (!bar) {
      // The bar may have been removed while the request was in flight.
      return;
    }
    const status = `${data.status}%`;
    bar.style.width = status;
    // textContent: the value is plain text and must not be parsed as HTML.
    bar.textContent = status;
    updateTableCellByClass(id, 'processedItems', `${data.processedItems}`);
    updateTableCellByClass(id, 'runtime', `${data.runtime}`);

    if (Number(data.status) >= 100) {
      bar.classList.remove('crawlerprocessprogress-bar');
      // Trigger a refresh of the page to show updated status
      const refreshLink = document.querySelector('a[title="Refresh"]');
      if (refreshLink) {
        refreshLink.click();
      }
    }
  }

  /**
   * Sets the text of the td with the given class in the table row that
   * contains the element; does nothing when any of them is missing.
   */
  function updateTableCellByClass(elementId, cellClass, newValue) {
    const el = document.getElementById(elementId);
    if (!el) return;

    const row = el.closest('tr');
    if (!row) return;

    const cell = row.querySelector(`td.${cellClass}`);
    if (cell) {
      cell.textContent = newValue;
    }
  }

  /**
   * Runs one polling round over all progress bars still carrying the
   * polling class. A round is skipped while the previous one is pending,
   * so slow responses cannot pile up overlapping requests.
   */
  async function getElementsToUpdate() {
    if (pollInFlight) {
      return;
    }
    pollInFlight = true;
    try {
      const progressBars = document.getElementsByClassName('crawlerprocessprogress-bar');
      const promises = Array.from(progressBars).map(bar => fetchStatus(bar.id));
      await Promise.all(promises);
    } finally {
      pollInFlight = false;
    }
  }

  setInterval(getElementsToUpdate, pollIntervalMs);
})();
62 |
--------------------------------------------------------------------------------
/Documentation/ExecutingTheQueue/ExecutingQueueWithCron-job/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _with-crown:
4 |
5 | =============================
6 | Executing queue with cron-job
7 | =============================
8 |
9 | A "cron-job" refers to a script that runs on the server with time
10 | intervals.
11 |
12 | For this to become reality you must ideally have a cron-job set up.
13 | This assumes you are running on Unix architecture of some sort. The
14 | crontab is often edited by :bash:`crontab -e` and you should insert a line
15 | like this:
16 |
17 | .. code-block:: plaintext
18 |
19 | * * * * * vendor/bin/typo3 crawler:buildQueue > /dev/null
20 |
21 | This will run the script every minute. You should try to run the
22 | script on the command line first to make sure it runs without any
23 | errors. If it doesn't output anything it was successful.
24 |
25 | You will need to have a user called `_cli_` and you must have PHP installed
26 | as a CGI script as well in :path:`/usr/bin/`.
27 |
28 | The user `_cli_` is created by the framework on demand if it does not exist
29 | at the first command line call.
30 |
31 | Make sure that the user `_cli_` has admin-rights.
32 |
33 | In the :guilabel:`CLI status` menu of the :guilabel:`Site Crawler` info module
34 | you can see the status:
35 |
36 | .. figure:: /Images/backend_processlist.png
37 | :alt: Status page in the backend
38 |
39 | Status page in the backend
40 |
41 | This is how it looks just after you ran the script. (You can also see
42 | the full path to the script in the bottom - this is the path to the
43 | script as you should use it on the command line / in the crontab)
44 |
45 | If the cron-script stalls there is a default delay of 1 hour before a
46 | new process will announce the old one dead and run a new one. If a
47 | cron-script takes more than 1 minute and thereby overlaps the next
48 | process, the next process will NOT start if it sees that the "lock-file"
49 | exists (unless that hour has passed).
50 |
51 | The reason why it works like this is to make sure that overlapping
52 | calls to the crawler CLI script will not run parallel processes. So
53 | the second call will just exit if it finds in the status file that the
54 | process is already running. But of course a crashed script will fail
55 | to set the status to "end" and hence this situation can occur.
56 |
--------------------------------------------------------------------------------
/Classes/Utility/MessageUtility.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use TYPO3\CMS\Core\Messaging\FlashMessage;
23 | use TYPO3\CMS\Core\Messaging\FlashMessageService;
24 | use TYPO3\CMS\Core\Type\ContextualFeedbackSeverity;
25 | use TYPO3\CMS\Core\Utility\GeneralUtility;
26 |
27 | /**
28 | * @internal since v9.2.5
29 | */
30 | class MessageUtility
31 | {
32 | /**
33 | * Add notice message to the user interface.
34 | */
35 | public static function addNoticeMessage(string $message): void
36 | {
37 | self::addMessage($message, ContextualFeedbackSeverity::NOTICE);
38 | }
39 |
40 | /**
41 | * Add error message to the user interface.
42 | */
43 | public static function addErrorMessage(string $message): void
44 | {
45 | self::addMessage($message, ContextualFeedbackSeverity::ERROR);
46 | }
47 |
48 | /**
49 | * Add error message to the user interface.
50 | */
51 | public static function addWarningMessage(string $message): void
52 | {
53 | self::addMessage($message, ContextualFeedbackSeverity::WARNING);
54 | }
55 |
56 | /**
57 | * This method is used to add a message to the internal queue
58 | *
59 | * @param string $message the message itself
60 | * @param ContextualFeedbackSeverity $severity message level (0 = success (default), -1 = info, -2 = notice, 1 = warning, 2 = error)
61 | */
62 | private static function addMessage(
63 | string $message,
64 | ContextualFeedbackSeverity $severity = ContextualFeedbackSeverity::OK
65 | ): void {
66 | $message = GeneralUtility::makeInstance(FlashMessage::class, $message, '', $severity);
67 |
68 | $flashMessageService = GeneralUtility::makeInstance(FlashMessageService::class);
69 | $flashMessageService->getMessageQueueByIdentifier()->addMessage($message);
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/Classes/Process/Cleaner/OrphanProcessCleaner.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Domain\Repository\ProcessRepository;
23 | use AOE\Crawler\Domain\Repository\QueueRepository;
24 | use AOE\Crawler\Process\ProcessManagerInterface;
25 | use TYPO3\CMS\Core\Utility\GeneralUtility;
26 |
27 | /**
28 | * @internal since v12.0.10
29 | */
30 | class OrphanProcessCleaner
31 | {
32 | public function __construct(
33 | private readonly ProcessRepository $processRepository,
34 | private readonly QueueRepository $queueRepository,
35 | private readonly ProcessManagerInterface $processManager
36 | ) {
37 | }
38 |
39 | public function clean(): void
40 | {
41 | $results = $this->processRepository->getActiveOrphanProcesses();
42 |
43 | foreach ($results as $result) {
44 | $systemProcessId = (int) $result['system_process_id'];
45 | $processId = $result['process_id'];
46 |
47 | if ($systemProcessId <= 1) {
48 | continue;
49 | }
50 |
51 | $dispatcherProcesses = $this->processManager->findDispatcherProcesses();
52 | if (empty($dispatcherProcesses)) {
53 | $this->remove($processId);
54 | return;
55 | }
56 |
57 | $exists = false;
58 | foreach ($dispatcherProcesses as $process) {
59 | $parts = GeneralUtility::trimExplode(' ', $process, true);
60 | if ($systemProcessId === (int) ($parts[1] ?? 0)) {
61 | $exists = true;
62 | break;
63 | }
64 | }
65 |
66 | if (!$exists) {
67 | $this->remove($processId);
68 | }
69 | }
70 | }
71 |
72 | private function remove(string $processId): void
73 | {
74 | $this->processRepository->removeByProcessId($processId);
75 | $this->queueRepository->unsetQueueProcessId($processId);
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/Classes/Controller/Backend/Helper/RequestHelper.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Psr\Http\Message\ServerRequestInterface;
23 |
24 | /**
25 | * @internal since 12.0.10
26 | */
final class RequestHelper
{
    /**
     * Reads $key from the request and casts it to int.
     *
     * Returns $default when the key is absent or when its value is not a
     * scalar (e.g. an array, which a plain int cast would turn into 0 or 1).
     */
    public static function getIntFromRequest(ServerRequestInterface $request, string $key, int $default = 0): int
    {
        $value = self::findValue($request, $key) ?? $default;

        return is_scalar($value) ? (int) $value : $default;
    }

    /**
     * Returns true when $key is present with a non-empty value (PHP empty()
     * semantics, so "0" and "" count as false).
     */
    public static function getBoolFromRequest(ServerRequestInterface $request, string $key): bool
    {
        $value = self::findValue($request, $key);

        return !empty($value);
    }

    /**
     * Reads $key from the request as string; returns $default when the key
     * is absent or its value is not a scalar.
     */
    public static function getStringFromRequest(
        ServerRequestInterface $request,
        string $key,
        string $default = ''
    ): string {
        $value = self::findValue($request, $key) ?? $default;

        return is_scalar($value) ? (string) $value : $default;
    }

    /**
     * Reads $key from the request as array. A string value is JSON-decoded
     * first; anything that is not an array afterwards yields [].
     */
    public static function getArrayFromRequest(ServerRequestInterface $request, string $key): array
    {
        $value = self::findValue($request, $key);

        if (is_string($value)) {
            $value = json_decode($value, true);
        }

        return is_array($value) ? $value : [];
    }

    /**
     * Looks up $key in the parsed body first (only when it is an array) and
     * falls back to the query parameters.
     *
     * @return mixed the raw value, or null when the key is not set in either
     */
    private static function findValue(ServerRequestInterface $request, string $key): mixed
    {
        $body = $request->getParsedBody();
        $query = $request->getQueryParams();

        return (is_array($body) ? ($body[$key] ?? null) : null)
            ?? ($query[$key] ?? null);
    }
}
83 |
--------------------------------------------------------------------------------
/Classes/Controller/Backend/Helper/ResultHandler.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | /**
23 | * @internal since v9.2.5
24 | */
/**
 * Static helpers that read the JSON-encoded result data of a crawler queue
 * row and turn it into log text, a status string or the frontend variables.
 *
 * All methods tolerate malformed or unexpectedly shaped JSON and fall back to
 * a neutral result instead of raising a TypeError.
 *
 * @internal since v9.2.5
 */
class ResultHandler
{
    /**
     * Extract the log information from the current row and retrieve it as formatted string.
     *
     * @param array $resultRow Queue row; its 'result_data' entry is expected to be a JSON
     *                         object whose 'content' entry is itself a JSON string
     * @return string The 'log' lines joined by line feeds, or '' when no log can be found
     */
    public static function getResultLog(array $resultRow): string
    {
        $requestContent = self::decodeJson($resultRow['result_data'] ?? null);
        // Anything but a JSON object/array (e.g. a bare number) carries no content.
        if (!is_array($requestContent) || !array_key_exists('content', $requestContent)) {
            return '';
        }

        $requestResult = self::decodeJson($requestContent['content']);
        $log = is_array($requestResult) ? ($requestResult['log'] ?? null) : null;

        return is_array($log) ? implode(chr(10), $log) : '';
    }

    /**
     * Builds a human-readable status from already decoded result data.
     *
     * @param array|bool $requestContent Decoded 'result_data' of a queue row
     * @return string '-' for empty input, 'OK' when no error log is present, the error log
     *                lines joined by "\n" otherwise, or a fixed explanatory message when the
     *                content is missing or cannot be decoded
     */
    public static function getResStatus(array|bool $requestContent): string
    {
        if (empty($requestContent)) {
            return '-';
        }
        if (!is_array($requestContent) || !array_key_exists('content', $requestContent)) {
            return 'Content index does not exists in requestContent array';
        }

        $requestResult = self::decodeJson($requestContent['content']);
        if (!is_array($requestResult)) {
            return 'Error - no info, sorry!';
        }

        $errorLog = $requestResult['errorlog'] ?? null;
        if (empty($errorLog)) {
            return 'OK';
        }

        // The cast lets a single string message pass through implode() unchanged.
        return implode("\n", (array) $errorLog);
    }

    /**
     * Find Fe vars
     *
     * @param array $resultData Decoded 'result_data'; its 'content' entry is a JSON string
     * @return array The 'vars' entry of the decoded content, or [] when it is missing or not an array
     */
    public static function getResFeVars(array $resultData): array
    {
        $requestResult = self::decodeJson($resultData['content'] ?? null);
        $vars = is_array($requestResult) ? ($requestResult['vars'] ?? null) : null;

        return is_array($vars) ? $vars : [];
    }

    /**
     * Decodes a JSON string into its associative representation.
     *
     * Values that cannot be represented as a string (null, arrays, plain
     * objects) are not decoded and yield null, just like invalid JSON does.
     */
    private static function decodeJson(mixed $json): mixed
    {
        if (!is_scalar($json) && !$json instanceof \Stringable) {
            return null;
        }

        return json_decode((string) $json, true);
    }
}
82 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TYPO3 Crawler
2 | [](https://packagist.org/packages/tomasnorre/crawler)
3 | [](https://packagist.org/packages/tomasnorre/crawler)
4 | [](https://packagist.org/packages/tomasnorre/crawler)
5 | 
6 | [](https://coveralls.io/github/tomasnorre/crawler)
7 | [](https://dashboard.stryker-mutator.io/reports/github.com/tomasnorre/crawler/main)
8 | 
9 |
10 | TYPO3 Crawler crawls the TYPO3 page tree. It is used for cache warm-up, indexing, publishing applications, etc.
11 |
12 |
13 | You can include the crawler in your TYPO3 project with composer or from the [TYPO3 Extension Repository](https://extensions.typo3.org/extension/crawler)
14 |
15 | ```shell script
16 | composer require tomasnorre/crawler
17 | ```
18 |
19 | **Crawler processes**
20 |
21 | 
22 |
23 | ## Versions and Support
24 |
25 | | Release | TYPO3 | PHP | Fixes will contain
26 | |---------|-----------|---------|---|
27 | | 12.x.y | 12.4-13.4 | 8.1-8.4 |Features, Bugfixes, Security Updates, Since 12.0.6 TYPO3 13.4, Since 12.0.7 PHP 8.4
28 | | 11.x.y | 10.4-11.5 | 7.4-8.1 |Security Updates, Since 11.0.3 PHP 8.1
29 | | 10.x.y | 9.5-11.0 | 7.2-7.4 |Security Updates
30 | | 9.x.y | 9.5-11.0 | 7.2-7.4 |As this version has same requirements as 10.x.y, there will be no further releases of this version, please update instead.
31 | | 8.x.y | | | Releases do not exist
32 | | 7.x.y | | | Releases do not exist
33 | | 6.x.y | 7.6-8.7 | 5.6-7.3 | Security Updates
34 |
35 | ### Documentation
36 | Please read the [documentation](https://docs.typo3.org/p/tomasnorre/crawler/main/en-us/)
37 |
38 | To render the documentation locally, please use the official TYPO3 Documentation rendering Docker Tool.
39 |
40 |
41 | ### Contributions
42 |
43 | Please see [CONTRIBUTING.md](https://github.com/tomasnorre/crawler/blob/main/CONTRIBUTING.md)
44 |
45 | ### Honorable Previous Maintainers
46 |
47 | * Kasper Skaarhoj
48 | * Daniel Poetzinger
49 | * Fabrizio Branca
50 | * Tolleiv Nietsch
51 | * Timo Schmidt
52 | * Michael Klapper
53 | * Stefan Rotsch
54 |
--------------------------------------------------------------------------------
/Classes/Domain/Repository/ConfigurationRepository.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use Doctrine\DBAL\ArrayParameterType;
23 | use TYPO3\CMS\Backend\Utility\BackendUtility;
24 | use TYPO3\CMS\Core\Database\ConnectionPool;
25 | use TYPO3\CMS\Core\Database\Query\QueryBuilder;
26 | use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
27 | use TYPO3\CMS\Core\Database\Query\Restriction\HiddenRestriction;
28 | use TYPO3\CMS\Core\Utility\GeneralUtility;
29 | use TYPO3\CMS\Extbase\Persistence\Repository;
30 |
31 | /**
32 | * @internal since v9.2.5
33 | */
/**
 * Read access to crawler configuration records (table tx_crawler_configuration).
 *
 * @internal since v9.2.5
 */
class ConfigurationRepository extends Repository
{
    final public const TABLE_NAME = 'tx_crawler_configuration';

    /**
     * Fetches all crawler configuration records stored on the given pages.
     *
     * When $parentIds is empty, the page ids are taken from the backend
     * rootline of $pageId; otherwise $parentIds is used as given. Deleted and
     * hidden records are excluded, and the result is ordered by name.
     *
     * @param int $pageId Page whose rootline is used when no parent ids are passed
     * @param array $parentIds Page ids to use instead of the rootline
     * @return array Matching rows as associative arrays
     */
    public function getCrawlerConfigurationRecordsFromRootLine(int $pageId, array $parentIds = []): array
    {
        $pageIdsInRootLine = $parentIds;
        if ($pageIdsInRootLine === []) {
            // No explicit ids given: collect the uid of every page in the rootline.
            $pageIdsInRootLine = array_values(array_map(
                static fn ($pageInRootLine): int => (int) $pageInRootLine['uid'],
                BackendUtility::BEgetRootLine($pageId)
            ));
        }

        $queryBuilder = $this->createQueryBuilder();

        // Start from a clean slate and re-add only the two restrictions that matter here.
        $queryBuilder
            ->getRestrictions()->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(HiddenRestriction::class));

        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIdsInRootLine, ArrayParameterType::INTEGER)
        );

        $result = $queryBuilder
            ->select('*')
            ->from(self::TABLE_NAME)
            ->where($pidConstraint)
            ->orderBy('name')
            ->executeQuery();

        return $result->fetchAllAssociative();
    }

    /**
     * Creates a query builder bound to the crawler configuration table.
     */
    protected function createQueryBuilder(): QueryBuilder
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable(self::TABLE_NAME);
    }
}
78 |
--------------------------------------------------------------------------------
/Configuration/Backend/Modules.php:
--------------------------------------------------------------------------------
1 |
7 | *
8 | * This file is part of the TYPO3 Crawler Extension.
9 | *
10 | * It is free software; you can redistribute it and/or modify it under
11 | * the terms of the GNU General Public License, either version 2
12 | * of the License, or any later version.
13 | *
14 | * For the full copyright and license information, please read the
15 | * LICENSE.txt file that was distributed with this source code.
16 | *
17 | * The TYPO3 project - inspiring people to share!
18 | */
19 |
20 | use AOE\Crawler\Controller\Backend\BackendModuleCrawlerLogController;
21 | use AOE\Crawler\Controller\Backend\BackendModuleCrawlerProcessController;
22 | use AOE\Crawler\Controller\Backend\BackendModuleStartCrawlingController;
23 |
// Backend module registration for the crawler extension: one parent module
// below "web" plus three sub-modules (start, process, log). Each entry maps
// its default route to the handleRequest method of a backend controller.
return [
    // Parent module, positioned after "web_info". Its default route targets the
    // same controller as the "process" sub-module below.
    'web_site_crawler' => [
        'parent' => 'web',
        'position' => [
            'after' => 'web_info',
        ],
        'access' => 'user',
        'workspaces' => 'live',
        'path' => '/module/page/crawler',
        'labels' => 'LLL:EXT:crawler/Resources/Private/Language/Backend.xlf',
        'extensionName' => 'Crawler',
        'iconIdentifier' => 'tx-crawler-icon',
        'routes' => [
            '_default' => [
                'target' => BackendModuleCrawlerProcessController::class . '::handleRequest',
            ],
        ],
    ],
    // Sub-module "Start": handled by BackendModuleStartCrawlingController.
    'web_site_crawler_start' => [
        'parent' => 'web_site_crawler',
        'access' => 'user',
        'path' => '/module/page/crawler/start',
        'iconIdentifier' => 'crawler-start',
        'labels' => [
            'title' => 'Start',
        ],
        'routes' => [
            '_default' => [
                'target' => BackendModuleStartCrawlingController::class . '::handleRequest',
            ],
        ],
    ],
    // Sub-module "Process": handled by BackendModuleCrawlerProcessController.
    'web_site_crawler_process' => [
        'parent' => 'web_site_crawler',
        'access' => 'user',
        'path' => '/module/page/crawler/process',
        'iconIdentifier' => 'crawler-process',
        'labels' => [
            'title' => 'Process',
        ],
        'routes' => [
            '_default' => [
                'target' => BackendModuleCrawlerProcessController::class . '::handleRequest',
            ],
        ],
    ],
    // Sub-module "Log": handled by BackendModuleCrawlerLogController.
    'web_site_crawler_log' => [
        'parent' => 'web_site_crawler',
        'access' => 'user',
        'path' => '/module/page/crawler/log',
        'iconIdentifier' => 'crawler-log',
        'labels' => [
            'title' => 'Log',
        ],
        'routes' => [
            '_default' => [
                'target' => BackendModuleCrawlerLogController::class . '::handleRequest',
            ],
        ],
    ],
];
85 |
--------------------------------------------------------------------------------
/Classes/Service/PageService.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23 | use AOE\Crawler\Event\ModifySkipPageEvent;
24 | use TYPO3\CMS\Core\Domain\Repository\PageRepository;
25 | use TYPO3\CMS\Core\EventDispatcher\EventDispatcher;
26 | use TYPO3\CMS\Core\Utility\GeneralUtility;
27 |
28 | /**
29 | * @internal since v9.2.5
30 | */
/**
 * Decides whether a page row is eligible for crawling.
 *
 * @internal since v9.2.5
 */
class PageService
{
    private readonly EventDispatcher $eventDispatcher;

    /**
     * @param EventDispatcher|null $eventDispatcher Dispatcher to use; when omitted, one is
     *                                              obtained via GeneralUtility::makeInstance()
     */
    public function __construct(?EventDispatcher $eventDispatcher = null)
    {
        $this->eventDispatcher = $eventDispatcher ?? GeneralUtility::makeInstance(EventDispatcher::class);
    }

    /**
     * Check if the given page should be crawled
     *
     * The checks run in this order: hidden page (unless the extension setting
     * 'crawlHiddenPages' is enabled), built-in disallowed doktypes, doktypes
     * excluded via $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'],
     * and finally the ModifySkipPageEvent listeners.
     *
     * @param array $pageRow Page record; 'hidden' and 'doktype' are read
     * @return false|string false if the page should be crawled (not excluded), otherwise the
     *                      message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow): false|string
    {
        $extensionSettings = GeneralUtility::makeInstance(
            ExtensionConfigurationProvider::class
        )->getExtensionConfiguration();

        // if page is hidden
        if (!($extensionSettings['crawlHiddenPages'] ?? false) && ($pageRow['hidden'] ?? false)) {
            return 'Because page is hidden';
        }

        // Normalise once: the row value may arrive as a numeric string, which a
        // strict comparison against the integer DOKTYPE_* constants never matches.
        $doktype = (int) ($pageRow['doktype'] ?? 0);

        if (in_array($doktype, $this->getDisallowedDokTypes(), true)) {
            return sprintf('Because doktype "%d" is not allowed', $doktype);
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, (string) $doktype)) {
                return sprintf(
                    'Doktype "%d" was excluded by excludeDoktype configuration key "%s"',
                    $doktype,
                    $key
                );
            }
        }

        // Give listeners the final say.
        $event = $this->eventDispatcher->dispatch(new ModifySkipPageEvent($pageRow));
        return $event->isSkipped();
    }

    /**
     * Doktypes that are never crawled: link, shortcut, spacer, sysfolder and
     * backend user section.
     *
     * @return int[]
     */
    private function getDisallowedDokTypes(): array
    {
        return [
            PageRepository::DOKTYPE_LINK,
            PageRepository::DOKTYPE_SHORTCUT,
            PageRepository::DOKTYPE_SPACER,
            PageRepository::DOKTYPE_SYSFOLDER,
            PageRepository::DOKTYPE_BE_USER_SECTION,
        ];
    }
}
85 |
--------------------------------------------------------------------------------
/Classes/ContextMenu/ItemProvider.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Domain\Repository\ConfigurationRepository;
23 | use TYPO3\CMS\Backend\ContextMenu\ItemProviders\AbstractProvider;
24 | use TYPO3\CMS\Backend\Utility\BackendUtility;
25 | use TYPO3\CMS\Core\Utility\GeneralUtility;
26 |
27 | /**
28 | * Provides a ContextMenu item
29 | * @internal since v9.2.5
30 | */
/**
 * Context menu provider contributing a single "crawler" entry for crawler
 * configuration records.
 *
 * @internal since v9.2.5
 */
class ItemProvider extends AbstractProvider
{
    /**
     * Definition of the one menu item this provider contributes.
     *
     * @var array
     */
    protected $itemsConfiguration = [
        'crawler' => [
            'type' => 'item',
            'label' => 'LLL:EXT:crawler/Resources/Private/Language/Backend.xlf:contextMenu.label',
            'iconIdentifier' => 'tx-crawler',
            'callbackAction' => 'crawler',
        ],
    ];

    /**
     * The provider is responsible only when the clicked record belongs to the
     * crawler configuration table.
     */
    #[\Override]
    public function canHandle(): bool
    {
        return ConfigurationRepository::TABLE_NAME === $this->table;
    }

    /**
     * Priority of this provider; it has to stay below the RecordProvider's.
     */
    #[\Override]
    public function getPriority(): int
    {
        return 50;
    }

    /**
     * Appends the prepared crawler item; entries already present in $items win
     * on key collisions.
     */
    #[\Override]
    public function addItems(array $items): array
    {
        return $items + $this->prepareItems($this->itemsConfiguration);
    }

    /**
     * Builds the data attributes that open the "start crawling" module with
     * the clicked configuration preselected.
     *
     * Returns an empty array when the configuration record cannot be loaded.
     */
    #[\Override]
    protected function getAdditionalAttributes(string $itemName): array
    {
        $record = BackendUtility::getRecordWSOL($this->table, (int) $this->identifier);
        if ($record === null) {
            return [];
        }

        // Fall back to a placeholder only when the record has no 'name' key at all.
        $configurationName = array_key_exists('name', $record)
            ? $record['name']
            : 'No Name found in configuration';

        $queryString = implode('&', [
            'SET[function]=AOE\Crawler\Backend\BackendModule',
            'SET[crawlaction]=start',
            'configurationSelection[]=' . $configurationName,
        ]);

        return [
            'data-dispatch-action' => 'TYPO3.ModuleMenu.showModule',
            'data-dispatch-args-list' => 'web_site_crawler_start,&' . GeneralUtility::quoteJSvalue(
                '&' . $queryString
            ),
        ];
    }
}
97 |
--------------------------------------------------------------------------------
/Documentation/Configuration/ConfigurationRecords/Index.rst:
--------------------------------------------------------------------------------
1 | .. include:: /Includes.rst.txt
2 |
3 | .. _backend-configuration-record:
4 |
5 | =====================
6 | Configuration records
7 | =====================
8 |
9 | Formerly configuration was done by using pageTS (see below). This is
10 | still possible (fully backwards compatible) but not recommended.
11 | Instead of writing pageTS simply create a configuration record (table:
12 | ``tx_crawler_configuration``) and put it on the topmost page of the
13 | pagetree you want to affect with this configuration.
14 |
15 | The fields in these records are related to the pageTS keys described
16 | below.
17 |
18 | .. _backend-configuration-record-fields:
19 |
20 | Fields and their pageTS equivalents
21 | ===================================
22 |
23 | .. _backend-configuration-record-general:
24 |
25 | General
26 | -------
27 |
28 | .. figure:: /Images/backend_configurationrecord_general.png
29 | :alt: Backend configuration record: General
30 |
31 | Backend configuration record: General
32 |
33 | Name
34 | Corresponds to the "key" part in the pageTS setup e.g.
35 | :typoscript:`tx_crawler.crawlerCfg.paramSets.myConfigurationKeyName`
36 |
37 | Protocol for crawling
38 | Force HTTP, HTTPS or keep the configured protocol
39 |
40 | Processing instruction filter
41 | List of processing instructions. See also:
42 | :ref:`paramSets.[key].procInstrFilter `
43 |
44 | Base URL
45 | Set baseUrl (most likely the same as the entry point configured in your
46 | site configuration)
47 |
48 | Pids only
49 | List of Page Ids to limit this configuration to. See also:
50 | :ref:`paramSets.[key].pidsOnly `
51 |
52 | Exclude pages
53 | Comma separated list of page ids which should not be crawled.
54 | You can exclude pages recursively by adding `uid`+`depth`, e.g. 6+3.
55 | This ensures that the page with uid 6 and all pages up to 3 levels below it
56 | will not be crawled.
57 |
58 | Configuration
59 | Parameter configuration. The values of GET variables are defined using a
60 | special syntax. See also: :ref:`paramSets.[key]
61 | `
62 |
63 | Processing instruction parameters
64 | Options for processing instructions. Will be defined in the respective third
65 | party modules. See also: :ref:`paramSets.[key].procInstrParams
66 | `
67 |
68 | Crawl with FE user groups
69 | User groups to set for the request. See also:
70 | :ref:`paramSets.[key].userGroups ` and the hint in :ref:`create-crawler-configuration`
71 |
72 | .. _backend-configuration-record-access:
73 |
74 | Access
75 | ------
76 |
77 | .. figure:: /Images/backend_configurationrecord_access.png
78 | :alt: Backend configuration record: Access
79 |
80 | Backend configuration record: Access
81 |
82 | Hide
83 | If activated the configuration record is not taken into account.
84 |
85 | Restrict access to
86 | Restricts access to this configuration record to selected backend user
87 | groups. Empty means no restriction is set.
88 |
--------------------------------------------------------------------------------
/ext_tables.sql:
--------------------------------------------------------------------------------
#
# Table structure for table 'tx_crawler_queue'
#
# Primary key: qid (auto_increment).
# Single-column indexes exist on page_id, set_id, exec_time, scheduled,
# process_id, parameters_hash and configuration_hash; the composite index
# "cleanup" covers (exec_time, scheduled).
#
CREATE TABLE tx_crawler_queue
(
    qid int(11) DEFAULT '0' NOT NULL auto_increment,
    page_id int(11) DEFAULT '0' NOT NULL,
    parameters text NOT NULL,
    parameters_hash varchar(50) DEFAULT '' NOT NULL,
    configuration_hash varchar(50) DEFAULT '' NOT NULL,
    scheduled int(11) DEFAULT '0' NOT NULL,
    exec_time int(11) DEFAULT '0' NOT NULL,
    set_id int(11) DEFAULT '0' NOT NULL,
    result_data longtext NOT NULL,
    process_scheduled int(11) DEFAULT '0' NOT NULL,
    process_id varchar(50) DEFAULT '' NOT NULL,
    process_id_completed varchar(50) DEFAULT '' NOT NULL,
    configuration varchar(250) DEFAULT '' NOT NULL,

    PRIMARY KEY (qid),
    KEY page_id (page_id),
    KEY set_id (set_id),
    KEY exec_time (exec_time),
    KEY scheduled (scheduled),
    KEY process_id (process_id),
    KEY parameters_hash (parameters_hash),
    KEY configuration_hash (configuration_hash),
    KEY cleanup (exec_time,scheduled)
) ENGINE=InnoDB;
30 |
#
# Table structure for table 'tx_crawler_process'
#
# No primary key is declared here. Indexes: "update_key" on (active, deleted)
# and "process_id" on (process_id).
#
CREATE TABLE tx_crawler_process
(
    process_id varchar(50) DEFAULT '' NOT NULL,
    active smallint(6) DEFAULT '0',
    ttl int(11) DEFAULT '0' NOT NULL,
    assigned_items_count int(11) DEFAULT '0' NOT NULL,
    deleted tinyint(4) unsigned DEFAULT '0' NOT NULL,
    system_process_id int(11) DEFAULT '0' NOT NULL,

    KEY update_key (active,deleted),
    KEY process_id (process_id)
) ENGINE=InnoDB;
46 |
#
# Table structure for table 'tx_crawler_configuration'
#
# Only crawler-specific columns are listed; this statement declares no keys
# and no uid/pid columns.
# NOTE(review): those are presumably supplied elsewhere (e.g. derived by TYPO3
# from the table's TCA) - confirm.
#
CREATE TABLE tx_crawler_configuration
(
    name tinytext NOT NULL,
    force_ssl tinyint(4) DEFAULT '0' NOT NULL,
    processing_instruction_filter varchar(200) DEFAULT '' NOT NULL,
    processing_instruction_parameters_ts varchar(200) DEFAULT '' NOT NULL,
    configuration text NOT NULL,
    base_url tinytext NOT NULL,
    pidsonly blob,
    begroups varchar(100) DEFAULT '0' NOT NULL,
    fegroups varchar(100) DEFAULT '0' NOT NULL,
    exclude text NOT NULL

) ENGINE=InnoDB;
64 |
#
# Table structure for table 'pages'
# This column is added to reuse the information from typo3/cms-seo.
# As there is no dependency on typo3/cms-seo, it is declared here as well to
# ensure that database queries using it do not break.
#
CREATE TABLE pages
(
    sitemap_priority decimal(2, 1) DEFAULT '0.5' NOT NULL
);
75 |
--------------------------------------------------------------------------------
/Classes/Service/QueueService.php:
--------------------------------------------------------------------------------
1 |
9 | *
10 | * This file is part of the TYPO3 Crawler Extension.
11 | *
12 | * It is free software; you can redistribute it and/or modify it under
13 | * the terms of the GNU General Public License, either version 2
14 | * of the License, or any later version.
15 | *
16 | * For the full copyright and license information, please read the
17 | * LICENSE.txt file that was distributed with this source code.
18 | *
19 | * The TYPO3 project - inspiring people to share!
20 | */
21 |
22 | use AOE\Crawler\Controller\CrawlerController;
23 | use TYPO3\CMS\Core\Domain\Repository\PageRepository;
24 | use TYPO3\CMS\Core\Utility\GeneralUtility;
25 |
26 | /**
27 | * @internal since v9.2.5
28 | */
/**
 * Adds single pages to the crawler queue on behalf of the CrawlerController.
 *
 * @internal since v9.2.5
 */
class QueueService
{
    /**
     * Makes sure the controller carries a positive set id; when it does not,
     * one is derived from the current microtime.
     */
    public function __construct(
        private readonly CrawlerController $crawlerController
    ) {
        if ($this->crawlerController->setID <= 0) {
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }
    }

    /**
     * Queues all URLs the crawler configurations produce for one page.
     *
     * @param int $pageUid Uid of the page to queue
     * @param int $time Passed through to CrawlerController::urlListFromUrlArray()
     *                  (presumably the scheduled time - verify against the controller)
     */
    public function addPageToQueue(int $pageUid, int $time = 0): void
    {
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($pageUid, true);
        if (empty($pageData)) {
            // No page row was returned (e.g. the page does not exist any more),
            // so there is nothing that could be queued.
            return;
        }

        $configurations = $this->crawlerController->getUrlsForPageRow($pageData);
        // Currently this is only used from the DataHandlerHook, and we don't know of any allowed/disallowed
        // configurations when clearing the cache, therefore all configurations are allowed in this case.
        // The filter call returns the incoming configurations unchanged for an empty allow-list; it is kept
        // for visibility and later implementation as it does no harm.
        $allowedConfigurations = [];
        $configurations = ConfigurationService::removeDisallowedConfigurations($allowedConfigurations, $configurations);
        $downloadUrls = [];
        $duplicateTrack = [];

        // The registered processing instructions do not change between iterations.
        $processingInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        foreach ($configurations as $configuration) {
            // enable inserting of entries
            $this->crawlerController->registerQueueEntriesInternallyOnly = false;
            $this->crawlerController->urlListFromUrlArray(
                $configuration,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $processingInstructionKeys
            );

            // reset the queue because the entries have been written to the db
            unset($this->crawlerController->queueEntries);
        }
    }

    /**
     * Reads the registered processingInstructions of the crawler
     *
     * @return array Map of each registered instruction's 'key' to its 'value', taken from
     *               $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']
     */
    private function getCrawlerProcInstructions(): array
    {
        $registeredInstructions = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] ?? null;
        if (empty($registeredInstructions)) {
            return [];
        }

        $crawlerProcInstructions = [];
        foreach ($registeredInstructions as $configuration) {
            $crawlerProcInstructions[$configuration['key']] = $configuration['value'];
        }

        return $crawlerProcInstructions;
    }
}
87 |
--------------------------------------------------------------------------------
/Classes/Domain/Model/ProcessCollection.php:
--------------------------------------------------------------------------------
1 |
14 | *
15 | * All rights reserved
16 | *
17 | * This script is part of the TYPO3 project. The TYPO3 project is
18 | * free software; you can redistribute it and/or modify
19 | * it under the terms of the GNU General Public License as published by
20 | * the Free Software Foundation; either version 3 of the License, or
21 | * (at your option) any later version.
22 | *
23 | * The GNU General Public License can be found at
24 | * http://www.gnu.org/copyleft/gpl.html.
25 | *
26 | * This script is distributed in the hope that it will be useful,
27 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 | * GNU General Public License for more details.
30 | *
31 | * This copyright notice MUST APPEAR in all copies of the script!
32 | ***************************************************************/
33 |
34 | /**
35 | * @internal since v9.2.5
36 | */
/**
 * ArrayObject that accepts only Process instances through offsetSet() and
 * append().
 *
 * @internal since v9.2.5
 */
class ProcessCollection extends \ArrayObject
{
    /**
     * Returns the element stored under $key.
     *
     * @throws NoIndexFoundException when nothing is stored under $key
     */
    #[\Override]
    public function offsetGet(mixed $key): Process
    {
        if (parent::offsetExists($key)) {
            return parent::offsetGet($key);
        }

        throw new NoIndexFoundException(
            sprintf(
                'Index "%s" for \AOE\Crawler\Domain\Model\Process are not available',
                var_export($key, true)
            ),
            1_593_714_823
        );
    }

    /**
     * Stores $value under $key.
     *
     * @param Process $value
     * @throws \InvalidArgumentException when $value is not a Process
     */
    #[\Override]
    public function offsetSet(mixed $key, $value): void
    {
        $this->ensureProcess($value, 1_593_714_822);

        parent::offsetSet($key, $value);
    }

    /**
     * Adds $value at the end of the collection.
     *
     * @param Process $value
     * @throws \InvalidArgumentException when $value is not a Process
     */
    #[\Override]
    public function append($value): void
    {
        $this->ensureProcess($value, 1_593_714_821);

        parent::append($value);
    }

    /**
     * Collects the process id of every element, in iteration order.
     */
    public function getProcessIds(): array
    {
        $processIds = [];
        foreach ($this->getIterator() as $process) {
            $processIds[] = $process->getProcessId();
        }

        return $processIds;
    }

    /**
     * Rejects everything that is not a Process, using the caller-specific exception code.
     *
     * @throws \InvalidArgumentException
     */
    private function ensureProcess(mixed $candidate, int $exceptionCode): void
    {
        if ($candidate instanceof Process) {
            return;
        }

        throw new \InvalidArgumentException(
            'Wrong parameter type given, "\AOE\Crawler\Domain\Model\Process" expected!',
            $exceptionCode
        );
    }
}
104 |
--------------------------------------------------------------------------------
/ext_conf_template.txt:
--------------------------------------------------------------------------------
1 | #########
2 | ## Settings
3 | #########
4 |
5 | # cat=Settings; type=string; label=Frontend website base path: Base path of the website frontend (e.g. if you call http://mydomain.com/cms/index.php in the browser the base path is "/cms/"). Leave empty to use the value of config.absRefPrefix instead.
6 | frontendBasePath=/
7 |
8 | # cat=Settings; type=boolean; label= Crawl hidden pages: Crawl hidden pages (By default they won't be crawled)
9 | crawlHiddenPages=0
10 |
11 | # cat=Settings; type=boolean; label= Make direct requests: If checked the crawler will make direct requests by including the index.php file instead of getting the page content via http(s)
12 | makeDirectRequests=0
13 |
14 | #########
15 | ## Queue
16 | #########
17 |
18 | # cat=Queue; type=int [1- 86400]; label= Maximal number of URLs, which can be added to the queue at one time
19 | maxCompileUrls=10000
20 |
21 | # cat=Queue; type=boolean; label= Enabled timeslot for duplication check: When this option is active, items will not be queued twice for the past if their scheduled time is the current time +-100 seconds.
22 | enableTimeslot=1
23 |
24 | #########
25 | ## Processing
26 | #########
27 |
28 | # cat=Processing; type=int [0-10000]; label= Sleep time between requests: Time in microseconds the crawler should sleep between requesting urls: low = faster / high = less stress for the server
29 | sleepTime = 1000
30 |
31 | # cat=Processing; type=int [0-100]; label= Sleep time after finishing: Time in seconds the crawler should sleep before finishing
32 | sleepAfterFinish=10
33 |
34 | # cat=Processing; type=int [1-10000]; label= Entries per run: How many queue entries should be processed in a run
35 | countInARun=100
36 |
37 | # cat=Processing; type=int [1-99]; label= Maximum processes
38 | processLimit=1
39 |
40 | # cat=Processing; type=int [1- 86400]; label= Maximal process runtime: in seconds - only necessary if processLimit > 1
41 | processMaxRunTime=300
42 |
43 | #########
44 | ## Cleanup
45 | #########
46 |
47 | # cat=Cleanup; type=boolean; label=Clean up old queue entries: If checked the older queue entries will be deleted when adding new crawler configurations from CLI.
48 | cleanUpOldQueueEntries=1
49 |
50 | # cat=Cleanup; type=int [1- 99]; label=Processed Age: If Clean up old queue entries is checked, then processed entries older than X days are deleted.
51 | cleanUpProcessedAge=2
52 |
53 | # cat=Cleanup; type=int [1- 99]; label=Scheduled Age: If Clean up old queue entries is checked, then scheduled entries older than X days are deleted.
54 | cleanUpScheduledAge=7
55 |
56 | # cat=Cleanup; type=int [1-365]; label= Delete processed items: Delete processed items from the queue after n days (0 will keep the entries forever - the database may grow very large over time!)
57 | purgeQueueDays=14
58 |
59 | #########
60 | ## System
61 | #########
62 |
63 | # cat=System; type=string; label= Name of the php binary (e.g. PHP72-LATEST-CLI ), default is php
64 | phpBinary=php
65 |
66 | # cat=System; type=string; label= PHP Path: Local path to php binary file (e.g. "/usr/bin/php"), you should ONLY use this when the resolved php-binary isn't the correct one. You can check that in the Info -> Site Crawling -> Crawling Process -> CLI-Path
67 | phpPath=
68 |
69 | #########
70 | ## Debug
71 | #########
72 |
73 | # cat=Debug; type=boolean; label= Debug: Print multi-process processing information - prints whether a process was really executed and which status it has
74 | processDebug=0
75 |
76 | # cat=Debug; type=boolean; label= Make Multiprocess- processing be verbose while running
77 | processVerbose=0
78 |
--------------------------------------------------------------------------------