├── .DS_Store ├── CHANGELOG.md ├── README.md ├── checker-cheerio ├── .actor │ └── actor.json ├── .eslintrc ├── .gitignore ├── Dockerfile ├── INPUT_SCHEMA.json ├── README.md ├── package-lock.json ├── package.json ├── src │ ├── checkers.ts │ ├── handleFailedRequest.ts │ ├── handlePage.ts │ ├── main.ts │ ├── typedefs.ts │ └── utils.ts └── tsconfig.json ├── checker-playwright ├── .eslintrc ├── .gitignore ├── Dockerfile ├── INPUT_SCHEMA.json ├── README.md ├── apify.json ├── package-lock.json ├── package.json ├── src │ ├── checkers.ts │ ├── handleFailedRequest.ts │ ├── handlePage.ts │ ├── main.ts │ ├── typedefs.ts │ └── utils.ts └── tsconfig.json ├── checker-puppeteer ├── .gitignore ├── Dockerfile ├── INPUT_SCHEMA.json ├── README.md ├── apify.json ├── package-lock.json ├── package.json ├── src │ ├── checkers.ts │ ├── handleFailedRequest.ts │ ├── handlePage.ts │ ├── main.ts │ ├── typedefs.ts │ └── utils.ts └── tsconfig.json └── starter ├── .DS_Store ├── .actor └── actor.json ├── .eslintrc ├── .gitignore ├── Dockerfile ├── INPUT_SCHEMA.json ├── README.md ├── package-lock.json ├── package.json ├── src ├── configs.ts ├── constants.ts ├── main.ts ├── startRunAndPool.ts └── typedefs.ts └── tsconfig.json /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify-projects/store-website-checker/4600159968d7289e023c071ad72c22bc5f3e4570/.DS_Store -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 2021-07-29 2 | 3 | *Features* 4 | 5 | - Pushing metadata about each page to dataset 6 | - Added recognition of Amazon's `hCaptcha` 7 | - `success` and `wasSuccess` metrics added to output. Success is measured by the status being less than 400 and no captcha being present 8 | 9 | *Changes* 10 | 11 | - Removed `useGoogleBotHeaders` option (we don't want to impersonate Google anyway) 12 | - Updated `apify` from `0.18.1` to `1.3.1` 13 | - `saveSnapshots` is `true` by default 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Website Checker 2 | 3 | Website Checker is a simple actor that allows you to scan any website for performance and blocking using various scraping methods such as Cheerio, Puppeteer, and Playwright. 4 | 5 | ### Features 6 | 7 | The actor provides these useful features out of the box: 8 | 9 | - Collects response status codes 10 | - Recognizes the most common captchas 11 | - Saves HTML snapshots and screenshots (if Puppeteer or Playwright is chosen) 12 | - Enables choosing between the Cheerio (plain HTTP) and Puppeteer/Playwright (browser) scrapers 13 | - Enables choosing different browsers for Playwright - Chrome, Firefox and Webkit (Safari) 14 | - Enables re-scraping start URLs or enqueueing with a familiar link selector + pseudo URLs system 15 | - Handles different failure states like timeouts and network errors 16 | - Enables basic proxy and browser configuration 17 | 18 | ### How to use 19 | 20 | The most common use-case is to do a quick check on how aggressively the target site is blocking. In that case, just supply a start URL, ideally a category or product page. You can either set `replicateStartUrls` or add enqueueing with `linkSelector` + `pseudoUrls`; both are good options for testing different proxies.
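For example, a quick check of a single category page with link enqueueing could use an input like this (the field names below follow the checker input schemas included in this repository; the master actor's input page linked under Input has the authoritative names, and the URLs here are purely illustrative):

```
{
    "urlsToCheck": [{ "url": "https://www.example.com/some-category" }],
    "proxyConfiguration": { "useApifyProxy": true },
    "linkSelector": "a[href]",
    "pseudoUrls": [{ "purl": "https://www.example.com[.*]/product/[.*]" }],
    "repeatChecksOnProvidedUrls": 2,
    "maxNumberOfPagesCheckedPerDomain": 100
}
```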
21 | 22 | You can pick any combination of run options, and the checker will spawn a runner actor for every combination of scraping tool & proxy, then combine the results into a single output. 23 | 24 | In the end, you will get simple statistics about the blocking rate. It is recommended to check a few screenshots just to make sure the actor correctly recognized the page status. You can get to the detailed output (per URL) via the KV store or the dataset (the KV output is sorted by response status, while the dataset keeps the scraping order). 25 | 26 | #### Multiple URLs and configurations 27 | Website Checker doesn't have any limit on how many websites and configs you can check. For each website, it will run each config. You just need to set a reasonable `maxConcurrentDomainsChecked` so that all parallel runs fit into your total memory (4 GB for Cheerio and 8 GB for Puppeteer/Playwright checks). 28 | 29 | ### Input 30 | 31 | Please follow the [actor's input page](https://apify.com/lukaskrivka/website-checker/input-schema) for a detailed explanation. Most input fields have reasonable defaults. 32 | 33 | ### Example output 34 | 35 | #### Simple output 36 | 37 | ``` 38 | { 39 | "timeouted": 0, 40 | "failedToLoadOther": 9, 41 | "accessDenied": 0, 42 | "recaptcha": 0, 43 | "distilCaptcha": 24, 44 | "hCaptcha": 0, 45 | "statusCodes": { 46 | "200": 3, 47 | "401": 2, 48 | "403": 5, 49 | "405": 24 50 | }, 51 | "success": 3, 52 | "total": 43 53 | } 54 | ``` 55 | 56 | #### Detailed output with URLs, screenshots and HTML links 57 | 58 | 59 | ### Changelog 60 | 61 | Check the history of changes in the [CHANGELOG](https://github.com/metalwarrior665/actor-website-checker/blob/master/CHANGELOG.md) 62 | -------------------------------------------------------------------------------- /checker-cheerio/.actor/actor.json: -------------------------------------------------------------------------------- 1 | { 2 | "actorSpecification": 1, 3 | "name": "checker-cheerio", 4 | "version": "0.0", 5 | "buildTag": "latest" 6 | } 7 | -------------------------------------------------------------------------------- /checker-cheerio/.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "env": { 4 | "browser": true, 5 | "es2020": true, 6 | "node": true 7 | }, 8 | "extends": [ 9 | "@apify/eslint-config-ts" 10 | ], 11 | "parserOptions": { 12 | "project": "./tsconfig.json", 13 | "ecmaVersion": 2020 14 | }, 15 | "ignorePatterns": [ 16 | "node_modules", 17 | "dist", 18 | "**/*.d.ts" 19 | ] 20 | } -------------------------------------------------------------------------------- /checker-cheerio/.gitignore: -------------------------------------------------------------------------------- 1 | # This file tells Git which files shouldn't be added to source control 2 | 3 | .DS_Store 4 | .idea 5 | dist 6 | node_modules 7 | apify_storage 8 | storage 9 | # Added by Apify CLI 10 | .venv 11 | -------------------------------------------------------------------------------- /checker-cheerio/Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://crawlee.dev/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 
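# Note: this is a two-stage build. The builder stage below installs dev dependencies and compiles the TypeScript sources; the final stage then starts from a clean base image and installs production dependencies only, keeping the image small.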
4 | FROM apify/actor-node:16 AS builder 5 | 6 | # Copy just package.json and package-lock.json 7 | # to speed up the build using Docker layer cache. 8 | COPY package*.json ./ 9 | 10 | # Install all dependencies. Don't audit to speed up the installation. 11 | RUN npm install --include=dev --audit=false 12 | 13 | # Next, copy the source files using the user set 14 | # in the base image. 15 | COPY . ./ 16 | 17 | # Install all dependencies and build the project. 18 | # Don't audit to speed up the installation. 19 | RUN npm run build 20 | 21 | # Create final image 22 | FROM apify/actor-node:16 23 | 24 | # Copy only built JS files from builder image 25 | COPY --from=builder /usr/src/app/dist ./dist 26 | 27 | # Copy just package.json and package-lock.json 28 | # to speed up the build using Docker layer cache. 29 | COPY package*.json ./ 30 | 31 | # Install NPM packages, skip optional and development dependencies to 32 | # keep the image small. Avoid logging too much and print the dependency 33 | # tree for debugging 34 | RUN npm --quiet set progress=false \ 35 | && npm install --omit=dev --omit=optional \ 36 | && echo "Installed NPM packages:" \ 37 | && (npm list --omit=dev --all || true) \ 38 | && echo "Node.js version:" \ 39 | && node --version \ 40 | && echo "NPM version:" \ 41 | && npm --version \ 42 | && rm -r ~/.npm 43 | 44 | # Next, copy the remaining files and directories with the source code. 45 | # Since we do this after NPM install, quick build will be really fast 46 | # for most source file changes. 47 | COPY . ./ 48 | 49 | 50 | # Run the image. 51 | CMD npm run start:prod --silent -------------------------------------------------------------------------------- /checker-cheerio/INPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Web Checker", 3 | "description": "The web checker actor loads URLs to check and checks for common captchas, status codes returned from crawling, as well as calculates the price a user may pay. TODO: Needs to be more descriptive!!", 4 | "type": "object", 5 | "schemaVersion": 1, 6 | "properties": { 7 | "urlsToCheck": { 8 | "title": "URLs to check", 9 | "type": "array", 10 | "description": "A static list of URLs to check for captchas. To be able to add new URLs on the fly, enable the Use request queue option.

For details, see Start URLs in README.", 11 | "sectionCaption": "Checker Options", 12 | "sectionDescription": "Options that will be passed to the checkers", 13 | "editor": "requestListSources", 14 | "prefill": [ 15 | { 16 | "url": "https://www.amazon.com/b?ie=UTF8&node=11392907011" 17 | } 18 | ] 19 | }, 20 | "proxyConfiguration": { 21 | "title": "Proxy Configuration", 22 | "type": "object", 23 | "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.

For details, see Proxy configuration in README.", 24 | "default": {}, 25 | "editor": "proxy", 26 | "prefill": { 27 | "useApifyProxy": false 28 | } 29 | }, 30 | "saveSnapshot": { 31 | "title": "Enabled", 32 | "type": "boolean", 33 | "description": "Will save HTML for Cheerio and HTML + screenshot for Puppeteer/Playwright", 34 | "editor": "checkbox", 35 | "groupCaption": "Save Snapshots" 36 | }, 37 | "linkSelector": { 38 | "title": "Link Selector", 39 | "type": "string", 40 | "description": "A CSS selector saying which links on the page (<a> elements with href attribute) shall be followed and added to the request queue. This setting only applies if Use request queue is enabled. To filter the links added to the queue, use the Pseudo-URLs setting.

If Link selector is empty, the page links are ignored.

For details, see Link selector in README.", 41 | "sectionCaption": "Crawler Options", 42 | "sectionDescription": "Specific options that are relevant for crawlers", 43 | "editor": "textfield", 44 | "prefill": "a[href]", 45 | "minLength": 1 46 | }, 47 | "pseudoUrls": { 48 | "title": "Pseudo-URLs", 49 | "type": "array", 50 | "description": "Specifies what kind of URLs found by Link selector should be added to the request queue. A pseudo-URL is a URL with regular expressions enclosed in [] brackets, e.g. http://www.example.com/[.*]. This setting only applies if the Use request queue option is enabled.

If Pseudo-URLs are omitted, the actor enqueues all links matched by the Link selector.

For details, see Pseudo-URLs in README.", 51 | "default": [], 52 | "editor": "pseudoUrls", 53 | "prefill": [ 54 | { 55 | "purl": "https://www.amazon.com[.*]/dp/[.*]" 56 | } 57 | ] 58 | }, 59 | "repeatChecksOnProvidedUrls": { 60 | "title": "Repeat checks on provided URLs", 61 | "type": "integer", 62 | "description": "Will access each URL multiple times. Useful to test the same URL or bypass blocking of the first page.", 63 | "editor": "number" 64 | }, 65 | "maxNumberOfPagesCheckedPerDomain": { 66 | "title": "Max number of pages checked per domain", 67 | "type": "integer", 68 | "description": "The maximum number of pages that the checker will load. The checker will stop when this limit is reached. It's always a good idea to set this limit in order to prevent excess platform usage for misconfigured scrapers. Note that the actual number of pages loaded might be slightly higher than this value.

If set to 0, there is no limit.", 69 | "default": 100, 70 | "editor": "number" 71 | }, 72 | "maxConcurrentPagesCheckedPerDomain": { 73 | "title": "Maximum concurrent pages checked per domain", 74 | "type": "integer", 75 | "description": "Specifies the maximum number of pages that can be processed by the checker in parallel for one domain. The checker automatically increases and decreases concurrency based on available system resources. This option enables you to set an upper limit, for example to reduce the load on a target website.", 76 | "default": 50, 77 | "editor": "number", 78 | "minimum": 1 79 | }, 80 | "maxConcurrentDomainsChecked": { 81 | "title": "Maximum number of concurrent domains checked", 82 | "type": "integer", 83 | "description": "Specifies the maximum number of domains that should be checked at a time. This setting is relevant when passing in more than one URL to check.", 84 | "default": 5, 85 | "editor": "number", 86 | "minimum": 1, 87 | "maximum": 10 88 | }, 89 | "retireBrowserInstanceAfterRequestCount": { 90 | "title": "Retire browser instance after request count", 91 | "type": "integer", 92 | "description": "How often will the browser itself rotate. Pick a higher number for smaller consumption, pick a lower number to rotate (test) more proxies.", 93 | "default": 10, 94 | "editor": "number", 95 | "minimum": 1 96 | } 97 | }, 98 | "required": ["urlsToCheck"] 99 | } 100 | -------------------------------------------------------------------------------- /checker-cheerio/README.md: -------------------------------------------------------------------------------- 1 | # Website Checker Runner with Cheerio 2 | 3 | Checks the provided website using Cheerio. This is a low-level runner; most likely you want to use the high-level master actor: https://apify.com/lukaskrivka/website-checker 4 | -------------------------------------------------------------------------------- /checker-cheerio/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "crawlee-cheerio-typescript", 3 | "version": "0.0.1", 4 | "type": "module", 5 | "description": "This is a boilerplate of an Apify actor.", 6 | "engines": { 7 | "node": ">=16.0.0" 8 | }, 9 | "dependencies": { 10 | "apify": "^3.1.0", 11 | "crawlee": "^3.1", 12 | "cheerio": "^1.0.0-rc.10" 13 | }, 14 | "devDependencies": { 15 | "@apify/eslint-config-ts": "^0.2.3", 16 | "@apify/tsconfig": "^0.1.0", 17 | "@typescript-eslint/eslint-plugin": "^5.32.0", 18 | "@typescript-eslint/parser": "^5.32.0", 19 | "eslint": "^8.20.0", 20 | "ts-node": "^10.9.1", 21 | "typescript": "^4.8" 22 | }, 23 | "scripts": { 24 | "start": "npm run start:dev", 25 | "start:prod": "node dist/main.js", 26 | "start:dev": "ts-node-esm -T src/main.ts", 27 | "build": "tsc", 28 | "lint": "eslint ./src --ext .ts", 29 | "lint:fix": "eslint ./src --ext .ts --fix", 30 | "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" 31 | }, 32 | "author": "It's not you it's me", 33 | "license": "ISC" 34 | } 35 | -------------------------------------------------------------------------------- /checker-cheerio/src/checkers.ts: -------------------------------------------------------------------------------- 1 | import type { CheerioAPI } from 'cheerio'; 2 | 3 | export function distilCaptcha($: CheerioAPI): boolean { 4 | return $('#distilCaptchaForm').length > 0 5 | || $('[action*="distil_r_captcha.html"]').length > 0; 6 | } 7 | 8 | export function recaptcha($: CheerioAPI): boolean { 9 | return $('#recaptcha').length > 0 10 |
|| $('iframe[src*="/recaptcha/"]').length > 0; 11 | } 12 | 13 | export function hCaptcha($: CheerioAPI): boolean { 14 | return $('[action="/errors/validateCaptcha"]').length > 0; 15 | } 16 | 17 | export function accessDenied($: CheerioAPI): boolean { 18 | return $('title').text().includes('Access Denied'); 19 | } 20 | 21 | export function testHtml($: CheerioAPI) { 22 | return { 23 | accessDenied: accessDenied($), 24 | distilCaptcha: distilCaptcha($), 25 | recaptcha: recaptcha($), 26 | hCaptcha: hCaptcha($), 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /checker-cheerio/src/handleFailedRequest.ts: -------------------------------------------------------------------------------- 1 | import { log } from 'crawlee'; 2 | 3 | import type { CheerioCrawlingContext } from 'crawlee'; 4 | 5 | import type { ActorCheckDetailedOutput } from './typedefs.js'; 6 | 7 | export async function handleFailedRequest(state: ActorCheckDetailedOutput, { request }: CheerioCrawlingContext) { 8 | state.totalPages.push({ url: request.url }); 9 | 10 | const [error] = request.errorMessages; 11 | log.warning(`Request failed --- ${request.url}\n${error}`); 12 | 13 | if (error.includes('request timed out')) { 14 | state.timedOut.push({ url: request.url }); 15 | } else { 16 | state.failedToLoadOther.push({ url: request.url }); 17 | } 18 | 19 | // CheerioCrawler obscures status code >=500 to a string message so we have to parse it 20 | const maybeStatusCheerio = error.match(/(\d\d\d) - Internal Server Error/); 21 | if (maybeStatusCheerio) { 22 | const statusCode = Number(maybeStatusCheerio[1]); 23 | state.statusCodes[statusCode] ??= []; 24 | state.statusCodes[statusCode].push({ url: request.url }); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /checker-cheerio/src/handlePage.ts: -------------------------------------------------------------------------------- 1 | import { Actor } from 'apify'; 2 | 3 | import type { RequestQueue } from 'apify'; 4 | import { PseudoUrl, RequestOptions } from 'crawlee'; 5 | 6 | import { testHtml } from './checkers.js'; 7 | 8 | import type { CheerioActorInput, ActorCheckDetailedOutput, CheerioCheckerHandlePageInputs } from './typedefs.js'; 9 | 10 | export async function handlePage( 11 | input: CheerioActorInput, 12 | requestQueue: RequestQueue, 13 | state: ActorCheckDetailedOutput, 14 | { request, $, body, response, crawler, json }: CheerioCheckerHandlePageInputs, 15 | ) { 16 | /** @type {string | undefined} */ 17 | let htmlUrl; 18 | 19 | if (input.saveSnapshot) { 20 | const key = `SNAPSHOT-${Math.random().toString()}`; 21 | if (json) { 22 | await Actor.setValue(key, json); 23 | } else { 24 | await Actor.setValue(`${key}.html`, body, { contentType: 'text/html' }); 25 | } 26 | htmlUrl = `https://api.apify.com/v2/key-value-stores/${Actor.getEnv().defaultKeyValueStoreId}/records/${key}.html?disableRedirect=true`; 27 | } 28 | 29 | state.totalPages.push({ url: request.url, htmlUrl }); 30 | 31 | const { statusCode } = response; 32 | 33 | state.statusCodes[statusCode!] 
??= []; 34 | state.statusCodes[statusCode!].push({ url: request.url, htmlUrl }); 35 | 36 | const captchas: string[] = []; 37 | // We don't have $ for JSON responses, nor can we recognize captchas from them 38 | if ($) { 39 | const testResult = testHtml($); 40 | 41 | for (const testResultEntry of Object.entries(testResult)) { 42 | const wasFound = testResultEntry[1]; 43 | const testCase = testResultEntry[0] as 'accessDenied' | 'distilCaptcha' | 'recaptcha' | 'hCaptcha'; 44 | if (wasFound) { 45 | captchas.push(testCase); 46 | 47 | state[testCase].push({ url: request.url, htmlUrl }); 48 | } 49 | } 50 | } 51 | 52 | const wasSuccess = statusCode! < 400 && captchas.length === 0; 53 | if (wasSuccess) { 54 | state.success.push({ url: request.url, htmlUrl }); 55 | } 56 | 57 | await Actor.pushData({ 58 | url: request.url, 59 | htmlUrl, 60 | statusCode, 61 | captchas, 62 | wasSuccess, 63 | }); 64 | 65 | const pageOrigin = new URL(request.url).origin; 66 | 67 | if (input.linkSelector && !!$) { 68 | const info = await requestQueue.getInfo(); 69 | 70 | const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount; 71 | if (maxUrlsToEnqueue > 0) { 72 | const toEnqueue: RequestOptions[] = []; 73 | $(input.linkSelector).each((_, el) => { 74 | const rawHref = $(el).attr('href'); 75 | if (!rawHref) { 76 | return; 77 | } 78 | const href = new URL(rawHref, pageOrigin).toString(); 79 | for (const pseudoUrlInput of input.pseudoUrls) { 80 | if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) { 81 | const newUrl = new URL(href, request.loadedUrl).toString(); 82 | toEnqueue.push({ 83 | url: newUrl, 84 | headers: pseudoUrlInput.headers, 85 | method: pseudoUrlInput.method as 'GET' | 'POST', 86 | payload: pseudoUrlInput.payload, 87 | userData: pseudoUrlInput.userData, 88 | }); 89 | } 90 | } 91 | }); 92 | console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`); 93 | await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue)); 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /checker-cheerio/src/main.ts: -------------------------------------------------------------------------------- 1 | import { Actor } from 'apify'; 2 | import { log, CheerioCrawler } from 'crawlee'; 3 | import type { RequestOptions } from 'crawlee'; 4 | 5 | import { inspect } from 'util'; 6 | import { handleFailedRequest } from './handleFailedRequest.js'; 7 | import { handlePage } from './handlePage.js'; 8 | import { convertDetailedOutputToSimplified } from './utils.js'; 9 | import type { CheerioActorInput, ActorCheckDetailedOutput } from './typedefs.js'; 10 | 11 | Actor.main(async () => { 12 | const input = await Actor.getInput() as CheerioActorInput; 13 | 14 | // Log the input 15 | 16 | log.info('Input provided:'); 17 | log.debug(inspect(input, false, 4)); 18 | 19 | log.info( 20 | [ 21 | 'Running a Cheerio Checker. 
Cheerio downloads only initial HTML.', 22 | 'If you need to render JavaScript or wait on a page for data to load, enable Puppeteer or Playwright as Checker Type in the Frontend.', 23 | ].join('\n'), 24 | ); 25 | 26 | const { 27 | maxConcurrentPagesCheckedPerDomain, 28 | maxNumberOfPagesCheckedPerDomain, 29 | proxyConfiguration, 30 | urlsToCheck, 31 | repeatChecksOnProvidedUrls, 32 | navigationTimeoutSecs, 33 | } = input; 34 | 35 | const proxy = await Actor.createProxyConfiguration({ 36 | groups: proxyConfiguration.apifyProxyGroups, 37 | countryCode: proxyConfiguration.apifyProxyCountry, 38 | }); 39 | 40 | const requestQueue = await Actor.openRequestQueue(); 41 | 42 | const [urlData] = urlsToCheck; 43 | await requestQueue.addRequest(urlData as RequestOptions); 44 | for (let _ = 0; _ < (repeatChecksOnProvidedUrls ?? 0); _++) { 45 | await requestQueue.addRequest({ 46 | ...urlData as RequestOptions, 47 | uniqueKey: Math.random().toString(), 48 | }); 49 | } 50 | 51 | const env = Actor.getEnv(); 52 | 53 | const state: ActorCheckDetailedOutput = { 54 | url: urlData.url, 55 | checkerType: 'cheerio', 56 | simplifiedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`, 57 | detailedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`, 58 | totalPages: [], 59 | timedOut: [], 60 | failedToLoadOther: [], 61 | accessDenied: [], 62 | success: [], 63 | statusCodes: {}, 64 | recaptcha: [], 65 | distilCaptcha: [], 66 | hCaptcha: [], 67 | }; 68 | 69 | const crawler = new CheerioCrawler({ 70 | maxRequestRetries: 0, 71 | navigationTimeoutSecs, 72 | maxRequestsPerCrawl: maxNumberOfPagesCheckedPerDomain, 73 | maxConcurrency: maxConcurrentPagesCheckedPerDomain, 74 | requestQueue, 75 | requestHandler: (pageInputs) => handlePage(input, requestQueue, state, pageInputs), 76 | failedRequestHandler: (requestInput) => handleFailedRequest(state, requestInput), 77 | proxyConfiguration: proxy, 78 | useSessionPool: false, 79 | additionalMimeTypes: ['application/xml'], 80 | }); 81 | 82 | // TODO: Consider making this an option in the CheerioCrawler instead of needing to override a function 83 | // We don't want the crawler to throw errors on bad statuses 84 | Reflect.set(crawler, '_throwOnBlockedRequest', () => { 85 | // Do nothing 86 | }); 87 | 88 | await crawler.run(); 89 | 90 | await Actor.setValue('OUTPUT', convertDetailedOutputToSimplified(state)); 91 | await Actor.setValue('DETAILED-OUTPUT', state); 92 | log.info('Checker finished.'); 93 | log.info( 94 | `Simplified output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`, 95 | ); 96 | log.info( 97 | `Detailed output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`, 98 | ); 99 | log.info(`Preview dataset: https://api.apify.com/v2/datasets/${env.defaultDatasetId}/items?clean=true&format=html`); 100 | }); 101 | -------------------------------------------------------------------------------- /checker-cheerio/src/typedefs.ts: -------------------------------------------------------------------------------- 1 | import type { CheerioCrawlingContext } from 'crawlee'; 2 | 3 | type KeysNotRequired = 4 | | 'checkers.cheerio' 5 | | 'checkers.puppeteer' 6 | | 'checkers.playwright' 7 | | 'puppeteer.headfull' 8 | | 'puppeteer.useChrome' 9 | | 'puppeteer.waitFor' 10 | | 'playwright.chrome' 11 | | 'playwright.firefox' 12 
| | 'playwright.webkit' 13 | | 'maxConcurrentDomainsChecked'; 14 | 15 | export type CheerioActorInput = Omit<ActorInputData, KeysNotRequired>; 16 | 17 | export type CheerioCheckerHandlePageInputs = CheerioCrawlingContext 18 | 19 | export interface PseudoUrlInputCustom { 20 | purl: string; 21 | method?: string; 22 | payload?: string; 23 | userData?: Record<string, unknown>; 24 | headers?: Record<string, string>; 25 | } 26 | 27 | export interface UrlInput { 28 | url: string; 29 | method?: string; 30 | payload?: string; 31 | userData?: Record<string, unknown>; 32 | headers?: Record<string, string>; 33 | } 34 | 35 | export interface ProxyConfiguration { 36 | useApifyProxy: boolean; 37 | apifyProxyGroups?: string[]; 38 | apifyProxyCountry?: string; 39 | } 40 | 41 | export interface ActorInputData { 42 | // Crawlers to use 43 | 'checkers.cheerio'?: boolean; 44 | 'checkers.puppeteer'?: boolean; 45 | 'checkers.playwright'?: boolean; 46 | 47 | // Pass these to crawlers 48 | 49 | // save snapshots 50 | saveSnapshot?: boolean; 51 | 52 | // General options 53 | urlsToCheck: UrlInput[]; 54 | proxyConfiguration: ProxyConfiguration; 55 | linkSelector?: string; 56 | pseudoUrls: PseudoUrlInputCustom[]; 57 | repeatChecksOnProvidedUrls?: number; 58 | maxNumberOfPagesCheckedPerDomain: number; 59 | maxConcurrentPagesCheckedPerDomain: number; 60 | maxConcurrentDomainsChecked: number; 61 | retireBrowserInstanceAfterRequestCount: number; 62 | navigationTimeoutSecs: number; 63 | 64 | // Pass only to puppeteer 65 | 'puppeteer.headfull'?: boolean; 66 | 'puppeteer.useChrome'?: boolean; 67 | 'puppeteer.waitFor'?: string; 68 | 69 | // Pass only to playwright 70 | 'playwright.chrome'?: boolean; 71 | 'playwright.firefox'?: boolean; 72 | 'playwright.webkit'?: boolean; 73 | 'playwright.headfull'?: boolean; 74 | 'playwright.useChrome'?: boolean; 75 | 'playwright.waitFor'?: string; 76 | } 77 | 78 | export interface PreparedActorConfig { 79 | actorId: string; 80 | proxyUsed?: string; 81 | url: string; 82 | input: ActorInputData; 83 | params: { 84 | memory: number; 85 | timeout: number; 86 | }; 87 | // This data is set when the config is run 88 | runId?: string; 89 | } 90 | 91 | export interface CreateActorRunConfig { 92 | checkerId: string; 93 | input: ActorInputData; 94 | urlData: UrlInput; 95 | playwrightBrowser?: 'chrome' | 'firefox' | 'webkit'; 96 | } 97 | 98 | // --- OUTPUT --- 99 | 100 | export interface ActorCheckDetailedOutput { 101 | // Set by waitForRunToFinishAndPushData 102 | proxyUsed?: string; 103 | checkerType: 'cheerio' | 'puppeteer' | 'playwright'; 104 | playwrightBrowser?: 'chrome' | 'firefox' | 'webkit'; 105 | computeUnitsUsedForThisCheck?: number; 106 | // (totalPages.length / computeUnitsUsedForThisCheck) yields the amount of pages checkable per compute unit 107 | pagesPerComputeUnit?: number; 108 | 109 | // URLs 110 | url: string; 111 | simplifiedOutput: string; 112 | detailedOutput: string; 113 | 114 | // Page data 115 | totalPages: UrlCheckResult[]; 116 | timedOut: UrlCheckResult[]; 117 | failedToLoadOther: UrlCheckResult[]; 118 | accessDenied: UrlCheckResult[]; 119 | success: UrlCheckResult[]; 120 | 121 | // Status codes 122 | statusCodes: Record<number, UrlCheckResult[]>; 123 | 124 | // Captcha time 125 | recaptcha: UrlCheckResult[]; 126 | distilCaptcha: UrlCheckResult[]; 127 | hCaptcha: UrlCheckResult[]; 128 | } 129 | 130 | export interface UrlCheckResult { 131 | url: string; 132 | screenshotUrl?: string; 133 | htmlUrl?: string; 134 | } 135 | 136 | export type ActorCheckSimplifiedOutput = { 137 | [K in keyof ActorCheckDetailedOutput]: 138 | ActorCheckDetailedOutput[K] extends Array<UrlCheckResult> 139 | ?
number 140 | : ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] } 141 | ? Record<number, number> 142 | : ActorCheckDetailedOutput[K]; 143 | }; 144 | -------------------------------------------------------------------------------- /checker-cheerio/src/utils.ts: -------------------------------------------------------------------------------- 1 | import { Dictionary } from 'crawlee'; 2 | import type { ActorCheckDetailedOutput, ActorCheckSimplifiedOutput } from './typedefs.js'; 3 | 4 | export function convertDetailedOutputToSimplified(data: ActorCheckDetailedOutput): ActorCheckSimplifiedOutput { 5 | const obj: Dictionary = {}; 6 | 7 | for (const [key, value] of Object.entries(data)) { 8 | if (Array.isArray(value)) { 9 | obj[key] = value.length; 10 | } else if (typeof value === 'object') { 11 | if (!obj[key]) { 12 | obj[key] = {}; 13 | } 14 | const nestedObject: Dictionary = obj[key]; 15 | 16 | for (const [statusCode, statusValue] of Object.entries(value)) { 17 | nestedObject[statusCode] = (statusValue as any).length; 18 | } 19 | } else { 20 | obj[key] = value; 21 | } 22 | } 23 | 24 | // @ts-expect-error We are merging the objects 25 | return obj; 26 | } 27 | -------------------------------------------------------------------------------- /checker-cheerio/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@apify/tsconfig", 3 | "compilerOptions": { 4 | "module": "ES2022", 5 | "target": "ES2022", 6 | "outDir": "dist", 7 | "noUnusedLocals": false, 8 | "lib": ["DOM"], 9 | "skipLibCheck": true 10 | }, 11 | "include": [ 12 | "./src/**/*" 13 | ] 14 | } -------------------------------------------------------------------------------- /checker-playwright/.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "env": { 4 | "browser": true, 5 | "es2020": true, 6 | "node": true 7 | }, 8 | "extends": [ 9 | "@apify/eslint-config-ts" 10 | ], 11 | "parserOptions": { 12 | "project": "./tsconfig.json", 13 | "ecmaVersion": 2020 14 | }, 15 | "ignorePatterns": [ 16 | "node_modules", 17 | "dist", 18 | "**/*.d.ts" 19 | ] 20 | } -------------------------------------------------------------------------------- /checker-playwright/.gitignore: -------------------------------------------------------------------------------- 1 | # This file tells Git which files shouldn't be added to source control 2 | 3 | .DS_Store 4 | .idea 5 | dist 6 | node_modules 7 | apify_storage 8 | storage -------------------------------------------------------------------------------- /checker-playwright/Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://crawlee.dev/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node-playwright-chrome:16 AS builder 5 | 6 | # Copy just package.json and package-lock.json 7 | # to speed up the build using Docker layer cache. 8 | COPY --chown=myuser package*.json ./ 9 | 10 | # Install all dependencies. Don't audit to speed up the installation. 11 | RUN npm install --include=dev --audit=false 12 | 13 | # Next, copy the source files using the user set 14 | # in the base image. 15 | COPY --chown=myuser . ./ 16 | 17 | # Install all dependencies and build the project. 18 | # Don't audit to speed up the installation. 
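# (The dependencies were already installed in the step above, so this command only compiles the TypeScript sources into dist/.)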
19 | RUN npm run build 20 | 21 | # Create final image 22 | FROM apify/actor-node-playwright-chrome:16 23 | 24 | # Copy only built JS files from builder image 25 | COPY --from=builder --chown=myuser /home/myuser/dist ./dist 26 | 27 | # Copy just package.json and package-lock.json 28 | # to speed up the build using Docker layer cache. 29 | COPY --chown=myuser package*.json ./ 30 | 31 | # Install NPM packages, skip optional and development dependencies to 32 | # keep the image small. Avoid logging too much and print the dependency 33 | # tree for debugging 34 | RUN npm --quiet set progress=false \ 35 | && npm install --omit=dev --omit=optional \ 36 | && echo "Installed NPM packages:" \ 37 | && (npm list --omit=dev --all || true) \ 38 | && echo "Node.js version:" \ 39 | && node --version \ 40 | && echo "NPM version:" \ 41 | && npm --version \ 42 | && rm -r ~/.npm 43 | 44 | # Next, copy the remaining files and directories with the source code. 45 | # Since we do this after NPM install, quick build will be really fast 46 | # for most source file changes. 47 | COPY --chown=myuser . ./ 48 | 49 | 50 | # Run the image. If you know you won't need headful browsers, 51 | # you can remove the XVFB start script for a micro perf gain. 52 | CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent -------------------------------------------------------------------------------- /checker-playwright/INPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Web Checker", 3 | "description": "The web checker actor loads URLs to check and checks for common captchas, status codes returned from crawling, as well as calculates the price a user may pay. TODO: Needs to be more descriptive!!", 4 | "type": "object", 5 | "schemaVersion": 1, 6 | "properties": { 7 | "urlsToCheck": { 8 | "title": "URLs to check", 9 | "type": "array", 10 | "description": "A static list of URLs to check for captchas. To be able to add new URLs on the fly, enable the Use request queue option.

For details, see Start URLs in README.", 11 | "sectionCaption": "Checker Options", 12 | "sectionDescription": "Options that will be passed to the checkers", 13 | "editor": "requestListSources", 14 | "prefill": [ 15 | { 16 | "url": "https://www.amazon.com/b?ie=UTF8&node=11392907011" 17 | } 18 | ] 19 | }, 20 | "proxyConfiguration": { 21 | "title": "Proxy Configuration", 22 | "type": "object", 23 | "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.

For details, see Proxy configuration in README.", 24 | "default": {}, 25 | "editor": "proxy", 26 | "prefill": { 27 | "useApifyProxy": false 28 | } 29 | }, 30 | "saveSnapshot": { 31 | "title": "Enabled", 32 | "type": "boolean", 33 | "description": "Will save HTML for Cheerio and HTML + screenshot for Puppeteer/Playwright", 34 | "editor": "checkbox", 35 | "groupCaption": "Save Snapshots" 36 | }, 37 | "linkSelector": { 38 | "title": "Link Selector", 39 | "type": "string", 40 | "description": "A CSS selector saying which links on the page (<a> elements with href attribute) shall be followed and added to the request queue. This setting only applies if Use request queue is enabled. To filter the links added to the queue, use the Pseudo-URLs setting.

If Link selector is empty, the page links are ignored.

For details, see Link selector in README.", 41 | "sectionCaption": "Crawler Options", 42 | "sectionDescription": "Specific options that are relevant for crawlers", 43 | "editor": "textfield", 44 | "prefill": "a[href]", 45 | "minLength": 1 46 | }, 47 | "pseudoUrls": { 48 | "title": "Pseudo-URLs", 49 | "type": "array", 50 | "description": "Specifies what kind of URLs found by Link selector should be added to the request queue. A pseudo-URL is a URL with regular expressions enclosed in [] brackets, e.g. http://www.example.com/[.*]. This setting only applies if the Use request queue option is enabled.

If Pseudo-URLs are omitted, the actor enqueues all links matched by the Link selector.

For details, see Pseudo-URLs in README.", 51 | "default": [], 52 | "editor": "pseudoUrls", 53 | "prefill": [ 54 | { 55 | "purl": "https://www.amazon.com[.*]/dp/[.*]" 56 | } 57 | ] 58 | }, 59 | "repeatChecksOnProvidedUrls": { 60 | "title": "Repeat checks on provided URLs", 61 | "type": "integer", 62 | "description": "Will access each URL multiple times. Useful to test the same URL or bypass blocking of the first page.", 63 | "editor": "number" 64 | }, 65 | "maxNumberOfPagesCheckedPerDomain": { 66 | "title": "Max number of pages checked per domain", 67 | "type": "integer", 68 | "description": "The maximum number of pages that the checker will load. The checker will stop when this limit is reached. It's always a good idea to set this limit in order to prevent excess platform usage for misconfigured scrapers. Note that the actual number of pages loaded might be slightly higher than this value.

If set to 0, there is no limit.", 69 | "default": 100, 70 | "editor": "number" 71 | }, 72 | "maxConcurrentPagesCheckedPerDomain": { 73 | "title": "Maximum concurrent pages checked per domain", 74 | "type": "integer", 75 | "description": "Specifies the maximum number of pages that can be processed by the checker in parallel for one domain. The checker automatically increases and decreases concurrency based on available system resources. This option enables you to set an upper limit, for example to reduce the load on a target website.", 76 | "default": 50, 77 | "editor": "number", 78 | "minimum": 1 79 | }, 80 | "maxConcurrentDomainsChecked": { 81 | "title": "Maximum number of concurrent domains checked", 82 | "type": "integer", 83 | "description": "Specifies the maximum number of domains that should be checked at a time. This setting is relevant when passing in more than one URL to check.", 84 | "default": 5, 85 | "editor": "number", 86 | "minimum": 1, 87 | "maximum": 10 88 | }, 89 | "retireBrowserInstanceAfterRequestCount": { 90 | "title": "Retire browser instance after request count", 91 | "type": "integer", 92 | "description": "How often will the browser itself rotate. Pick a higher number for smaller consumption, pick a lower number to rotate (test) more proxies.", 93 | "default": 10, 94 | "editor": "number", 95 | "minimum": 1 96 | }, 97 | "playwright.chrome": { 98 | "title": "Chrome", 99 | "type": "boolean", 100 | "description": "Use Chrome when checking", 101 | "default": true, 102 | "sectionCaption": "Playwright options", 103 | "sectionDescription": "Options passed to playwright when checking", 104 | "editor": "checkbox", 105 | "groupCaption": "Browser type", 106 | "groupDescription": "Which type of browser should the checker use" 107 | }, 108 | "playwright.firefox": { 109 | "title": "Firefox", 110 | "type": "boolean", 111 | "description": "Use Firefox when checking", 112 | "editor": "checkbox" 113 | }, 114 | "playwright.webkit": { 115 | "title": "Safari (Webkit)", 116 | "type": "boolean", 117 | "description": "Use Safari when checking", 118 | "editor": "checkbox" 119 | }, 120 | "playwright.useChrome": { 121 | "title": "Use Chrome instead of Chromium", 122 | "type": "boolean", 123 | "description": "Only works for Playwright type! Be careful that Chrome is not guaranteed to work with Playwright.", 124 | "editor": "checkbox" 125 | }, 126 | "playwright.headfull": { 127 | "title": "Headfull browser (XVFB)", 128 | "type": "boolean", 129 | "description": "If the browser should be headfull or not", 130 | "editor": "checkbox" 131 | }, 132 | "playwright.waitFor": { 133 | "title": "Wait for", 134 | "type": "string", 135 | "description": "Only works for playwright type. Will wait on each page. You can provide number in ms or a selector.", 136 | "editor": "textfield" 137 | } 138 | }, 139 | "required": ["urlsToCheck"] 140 | } 141 | -------------------------------------------------------------------------------- /checker-playwright/README.md: -------------------------------------------------------------------------------- 1 | # Website Checker Runner with Playwright 2 | 3 | Checks the provided website using Playwright. 
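For a quick start, an input like the following is enough (the field names come from this runner's INPUT_SCHEMA.json; the URL is purely illustrative):

```
{
    "urlsToCheck": [{ "url": "https://www.example.com/" }],
    "proxyConfiguration": { "useApifyProxy": true },
    "playwright.chrome": true,
    "saveSnapshot": true
}
```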
This is a low-level runner; most likely you want to use the high-level master actor: https://apify.com/lukaskrivka/website-checker 4 | -------------------------------------------------------------------------------- /checker-playwright/apify.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "website-checker-playwright", 3 | "version": "0.0.0", 4 | "buildTag": "latest", 5 | "env": null, 6 | "template": "basic" 7 | } 8 | -------------------------------------------------------------------------------- /checker-playwright/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "crawlee-puppeteer-typescript", 3 | "version": "0.0.1", 4 | "type": "module", 5 | "description": "This is an example of an Apify actor.", 6 | "engines": { 7 | "node": ">=16.0.0" 8 | }, 9 | "dependencies": { 10 | "apify": "^3.0.0", 11 | "crawlee": "^3.0.0", 12 | "playwright": "*", 13 | "cheerio": "^1.0.0-rc.10" 14 | }, 15 | "devDependencies": { 16 | "@apify/eslint-config-ts": "^0.2.3", 17 | "@apify/tsconfig": "^0.1.0", 18 | "@typescript-eslint/eslint-plugin": "^5.32.0", 19 | "@typescript-eslint/parser": "^5.32.0", 20 | "eslint": "^8.20.0", 21 | "ts-node": "^10.9.1", 22 | "typescript": "4.7.4" 23 | }, 24 | "scripts": { 25 | "start": "npm run start:dev", 26 | "start:prod": "node dist/main.js", 27 | "start:dev": "ts-node-esm -T src/main.ts", 28 | "build": "tsc", 29 | "lint": "eslint ./src --ext .ts", 30 | "lint:fix": "eslint ./src --ext .ts --fix", 31 | "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" 32 | }, 33 | "author": "It's not you it's me", 34 | "license": "ISC" 35 | } 36 | -------------------------------------------------------------------------------- /checker-playwright/src/checkers.ts: -------------------------------------------------------------------------------- 1 | import type { CheerioAPI } from 'cheerio'; 2 | 3 | export function distilCaptcha($: CheerioAPI): boolean { 4 | return $('#distilCaptchaForm').length > 0 5 | || $('[action*="distil_r_captcha.html"]').length > 0; 6 | } 7 | 8 | export function recaptcha($: CheerioAPI): boolean { 9 | return $('#recaptcha').length > 0 10 | || $('iframe[src*="/recaptcha/"]').length > 0; 11 | } 12 | 13 | export function hCaptcha($: CheerioAPI): boolean { 14 | return $('[action="/errors/validateCaptcha"]').length > 0; 15 | } 16 | 17 | export function accessDenied($: CheerioAPI): boolean { 18 | return $('title').text().includes('Access Denied'); 19 | } 20 | 21 | export function testHtml($: CheerioAPI) { 22 | return { 23 | accessDenied: accessDenied($), 24 | distilCaptcha: distilCaptcha($), 25 | recaptcha: recaptcha($), 26 | hCaptcha: hCaptcha($), 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /checker-playwright/src/handleFailedRequest.ts: -------------------------------------------------------------------------------- 1 | import { log } from 'crawlee'; 2 | 3 | import type { PlaywrightCrawlingContext } from 'crawlee'; 4 | 5 | import type { ActorCheckDetailedOutput } from './typedefs.js'; 6 | 7 | export async function handleFailedRequest(state: ActorCheckDetailedOutput, { request }: PlaywrightCrawlingContext) { 8 | state.totalPages.push({ url: request.url }); 9 | 10 | const [error] = request.errorMessages; 11 | log.warning(`Request failed --- ${request.url}\n${error}`); 12 | 13 | if (error.includes('request timed out')) { 14 | state.timedOut.push({ url: request.url }); 15 | } else { 16
| state.failedToLoadOther.push({ url: request.url }); 17 | } 18 | 19 | // CheerioCrawler obscures status code >=500 to a string message so we have to parse it 20 | const maybeStatusCheerio = error.match(/(\d\d\d) - Internal Server Error/); 21 | if (maybeStatusCheerio) { 22 | const statusCode = Number(maybeStatusCheerio[1]); 23 | state.statusCodes[statusCode] ??= []; 24 | state.statusCodes[statusCode].push({ url: request.url }); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /checker-playwright/src/handlePage.ts: -------------------------------------------------------------------------------- 1 | import { Actor } from 'apify'; 2 | import Cheerio from 'cheerio'; 3 | 4 | import { PseudoUrl } from 'crawlee'; 5 | import type { RequestQueue } from 'apify'; 6 | import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee'; 7 | 8 | import { testHtml } from './checkers.js'; 9 | 10 | import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs.js'; 11 | 12 | const env = Actor.getEnv(); 13 | 14 | export async function handlePage( 15 | input: PlaywrightActorInput, 16 | requestQueue: RequestQueue, 17 | state: ActorCheckDetailedOutput, 18 | { request, response, page, crawler }: PlaywrightCrawlingContext, 19 | ): Promise<void> { 20 | let htmlUrl; 21 | let screenshotUrl; 22 | 23 | const waitFor = input['playwright.waitFor']; 24 | 25 | if (waitFor) { 26 | // We wait for a number in ms or a selector 27 | const maybeNumber = Number(waitFor); 28 | if (maybeNumber || maybeNumber === 0) { 29 | await page.waitForTimeout(maybeNumber); 30 | } else { 31 | await page.waitForSelector(waitFor); 32 | } 33 | } 34 | 35 | const html = await page.content(); 36 | 37 | if (input.saveSnapshot) { 38 | const key = `SNAPSHOT-${Math.random().toString()}`; 39 | const screenshot = await page.screenshot({ fullPage: true }); 40 | 41 | // TODO: Create a utils.playwright.saveSnapshot, like we have for puppeteer 42 | await Actor.setValue(`${key}.html`, html, { contentType: 'text/html' }); 43 | await Actor.setValue(`${key}.png`, screenshot, { contentType: 'image/png' }); 44 | 45 | screenshotUrl = `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/${key}.png?disableRedirect=true`; 46 | htmlUrl = `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/${key}.html?disableRedirect=true`; 47 | } 48 | 49 | state.totalPages.push({ url: request.url, htmlUrl, screenshotUrl }); 50 | 51 | const statusCode = response!.status(); 52 | 53 | state.statusCodes[statusCode] ??= []; 54 | state.statusCodes[statusCode].push({ url: request.url, htmlUrl, screenshotUrl }); 55 | 56 | const $ = Cheerio.load(html); 57 | 58 | const captchas: string[] = []; 59 | const testResult = testHtml($); 60 | 61 | for (const testResultEntry of Object.entries(testResult)) { 62 | const wasFound = testResultEntry[1]; 63 | const testCase = testResultEntry[0] as 'accessDenied' | 'distilCaptcha' | 'recaptcha' | 'hCaptcha'; 64 | if (wasFound) { 65 | captchas.push(testCase); 66 | 67 | state[testCase].push({ url: request.url, htmlUrl }); 68 | } 69 | } 70 | 71 | const wasSuccess = statusCode < 400 && captchas.length === 0; 72 | if (wasSuccess) { 73 | state.success.push({ url: request.url, htmlUrl, screenshotUrl }); 74 | } 75 | 76 | await Actor.pushData({ 77 | url: request.url, 78 | htmlUrl, 79 | screenshotUrl, 80 | statusCode, 81 | captchas, 82 | wasSuccess, 83 | }); 84 | 85 | const pageOrigin = new URL(request.url).origin; 86 | 87 | if (input.linkSelector &&
!!$) { 88 | const info = await requestQueue.getInfo(); 89 | 90 | const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount; 91 | if (maxUrlsToEnqueue > 0) { 92 | const toEnqueue: RequestOptions[] = []; 93 | $(input.linkSelector).each((_, el) => { 94 | const rawHref = $(el).attr('href'); 95 | if (!rawHref) { 96 | return; 97 | } 98 | const href = new URL(rawHref, pageOrigin).toString(); 99 | for (const pseudoUrlInput of input.pseudoUrls) { 100 | if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) { 101 | const newUrl = new URL(href, request.loadedUrl).toString(); 102 | toEnqueue.push({ 103 | url: newUrl, 104 | headers: pseudoUrlInput.headers, 105 | method: pseudoUrlInput.method as 'GET' | 'POST', 106 | payload: pseudoUrlInput.payload, 107 | userData: pseudoUrlInput.userData, 108 | }); 109 | } 110 | } 111 | }); 112 | console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`); 113 | await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue)); 114 | } 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /checker-playwright/src/main.ts: -------------------------------------------------------------------------------- 1 | import { Actor } from 'apify'; 2 | import { log, PlaywrightCrawler, RequestOptions } from 'crawlee'; 3 | import { chromium, firefox, webkit } from 'playwright'; 4 | import { inspect } from 'util'; 5 | 6 | import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs'; 7 | 8 | import { handleFailedRequest } from './handleFailedRequest.js'; 9 | import { handlePage } from './handlePage.js'; 10 | import { convertDetailedOutputToSimplified } from './utils.js'; 11 | 12 | const env = Actor.getEnv(); 13 | 14 | Actor.main(async () => { 15 | const input = await Actor.getInput() as PlaywrightActorInput; 16 | 17 | log.info('Input provided:'); 18 | log.debug(inspect(input, false, 4)); 19 | 20 | log.info('Running a Playwright Checker.'); 21 | 22 | const { 23 | maxConcurrentPagesCheckedPerDomain, 24 | maxNumberOfPagesCheckedPerDomain, 25 | proxyConfiguration, 26 | urlsToCheck, 27 | repeatChecksOnProvidedUrls, 28 | retireBrowserInstanceAfterRequestCount, 29 | 'playwright.useChrome': useChrome, 30 | 'playwright.headfull': headfull, 31 | 'playwright.chrome': playwrightChromeLauncher, 32 | 'playwright.firefox': playwrightFirefoxLauncher, 33 | 'playwright.webkit': playwrightWebkitLauncher, 34 | } = input; 35 | 36 | let launcher; 37 | 38 | if (playwrightChromeLauncher) { 39 | launcher = chromium; 40 | } else if (playwrightFirefoxLauncher) { 41 | launcher = firefox; 42 | } else if (playwrightWebkitLauncher) { 43 | launcher = webkit; 44 | } 45 | 46 | const proxy = await Actor.createProxyConfiguration({ 47 | groups: proxyConfiguration.apifyProxyGroups, 48 | countryCode: proxyConfiguration.apifyProxyCountry, 49 | }); 50 | 51 | const requestQueue = await Actor.openRequestQueue(); 52 | 53 | const [urlData] = urlsToCheck; 54 | await requestQueue.addRequest(urlData as RequestOptions); 55 | for (let _ = 0; _ < (repeatChecksOnProvidedUrls ?? 
0); _++) { 56 | await requestQueue.addRequest({ 57 | ...urlData, 58 | uniqueKey: Math.random().toString(), 59 | } as RequestOptions); 60 | } 61 | 62 | const state: ActorCheckDetailedOutput = { 63 | url: urlData.url, 64 | checkerType: 'playwright', 65 | simplifiedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`, 66 | detailedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`, 67 | totalPages: [], 68 | timedOut: [], 69 | failedToLoadOther: [], 70 | accessDenied: [], 71 | success: [], 72 | statusCodes: {}, 73 | recaptcha: [], 74 | distilCaptcha: [], 75 | hCaptcha: [], 76 | }; 77 | 78 | const crawler = new PlaywrightCrawler({ 79 | maxRequestRetries: 0, 80 | maxRequestsPerCrawl: maxNumberOfPagesCheckedPerDomain, 81 | maxConcurrency: maxConcurrentPagesCheckedPerDomain, 82 | requestQueue, 83 | requestHandler: (pageInputs) => handlePage(input, requestQueue, state, pageInputs), 84 | failedRequestHandler: (requestInput) => handleFailedRequest(state, requestInput), 85 | proxyConfiguration: proxy, 86 | useSessionPool: false, 87 | launchContext: { 88 | useChrome, 89 | launchOptions: { 90 | headless: headfull ? undefined : true, 91 | }, 92 | launcher, 93 | }, 94 | browserPoolOptions: { 95 | retireBrowserAfterPageCount: retireBrowserInstanceAfterRequestCount, 96 | }, 97 | }); 98 | 99 | await crawler.run(); 100 | 101 | await Actor.setValue('OUTPUT', convertDetailedOutputToSimplified(state)); 102 | await Actor.setValue('DETAILED-OUTPUT', state); 103 | log.info('Checker finished.'); 104 | log.info( 105 | `Simplified output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`, 106 | ); 107 | log.info( 108 | `Detailed output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`, 109 | ); 110 | log.info(`Preview dataset: https://api.apify.com/v2/datasets/${env.defaultDatasetId}/items?clean=true&format=html`); 111 | }); 112 | -------------------------------------------------------------------------------- /checker-playwright/src/typedefs.ts: -------------------------------------------------------------------------------- 1 | type KeysNotRequired = 2 | | 'checkers.cheerio' 3 | | 'checkers.puppeteer' 4 | | 'checkers.playwright' 5 | | 'puppeteer.headfull' 6 | | 'puppeteer.useChrome' 7 | | 'puppeteer.waitFor' 8 | | 'maxConcurrentDomainsChecked'; 9 | 10 | export type PlaywrightActorInput = Omit<ActorInputData, KeysNotRequired>; 11 | 12 | export interface PseudoUrlInputCustom { 13 | purl: string; 14 | method?: string; 15 | payload?: string; 16 | userData?: Record<string, unknown>; 17 | headers?: Record<string, string>; 18 | } 19 | 20 | export interface UrlInput { 21 | url: string; 22 | method?: string; 23 | payload?: string; 24 | userData?: Record<string, unknown>; 25 | headers?: Record<string, string>; 26 | } 27 | 28 | export interface ProxyConfiguration { 29 | useApifyProxy: boolean; 30 | apifyProxyGroups?: string[]; 31 | apifyProxyCountry?: string; 32 | } 33 | 34 | export interface ActorInputData { 35 | // Crawlers to use 36 | 'checkers.cheerio'?: boolean; 37 | 'checkers.puppeteer'?: boolean; 38 | 'checkers.playwright'?: boolean; 39 | 40 | // Pass these to crawlers 41 | 42 | // save snapshots 43 | saveSnapshot?: boolean; 44 | 45 | // General options 46 | urlsToCheck: UrlInput[]; 47 | proxyConfiguration: ProxyConfiguration; 48 | linkSelector?: string; 49 | pseudoUrls: PseudoUrlInputCustom[]; 50 | repeatChecksOnProvidedUrls?: number; 51 |
maxNumberOfPagesCheckedPerDomain: number; 52 | maxConcurrentPagesCheckedPerDomain: number; 53 | maxConcurrentDomainsChecked: number; 54 | retireBrowserInstanceAfterRequestCount: number; 55 | 56 | // Pass only to puppeteer 57 | 'puppeteer.headfull'?: boolean; 58 | 'puppeteer.useChrome'?: boolean; 59 | 'puppeteer.waitFor'?: string; 60 | 61 | // Pass only to playwright 62 | 'playwright.chrome'?: boolean; 63 | 'playwright.firefox'?: boolean; 64 | 'playwright.webkit'?: boolean; 65 | 'playwright.headfull'?: boolean; 66 | 'playwright.useChrome'?: boolean; 67 | 'playwright.waitFor'?: string; 68 | } 69 | 70 | export interface PreparedActorConfig { 71 | actorId: string; 72 | proxyUsed?: string; 73 | url: string; 74 | input: ActorInputData; 75 | params: { 76 | memory: number; 77 | timeout: number; 78 | }; 79 | // This data is set when the config is run 80 | runId?: string; 81 | } 82 | 83 | export interface CreateActorRunConfig { 84 | checkerId: string; 85 | input: ActorInputData; 86 | urlData: UrlInput; 87 | playwrightBrowser?: 'chrome' | 'firefox' | 'webkit'; 88 | } 89 | 90 | // --- OUTPUT --- 91 | 92 | export interface ActorCheckDetailedOutput { 93 | // Set by waitForRunToFinishAndPushData 94 | proxyUsed?: string; 95 | checkerType: 'cheerio' | 'puppeteer' | 'playwright'; 96 | playwrightBrowser?: 'chrome' | 'firefox' | 'webkit'; 97 | computeUnitsUsedForThisCheck?: number; 98 | // (totalPages.length / computeUnitsUsedForThisCheck) yields the amount of pages checkable per compute unit 99 | pagesPerComputeUnit?: number; 100 | 101 | // URLs 102 | url: string; 103 | simplifiedOutput: string; 104 | detailedOutput: string; 105 | 106 | // Page data 107 | totalPages: UrlCheckResult[]; 108 | timedOut: UrlCheckResult[]; 109 | failedToLoadOther: UrlCheckResult[]; 110 | accessDenied: UrlCheckResult[]; 111 | success: UrlCheckResult[]; 112 | 113 | // Status codes 114 | statusCodes: Record<number, UrlCheckResult[]>; 115 | 116 | // Captcha time 117 | recaptcha: UrlCheckResult[]; 118 | distilCaptcha: UrlCheckResult[]; 119 | hCaptcha: UrlCheckResult[]; 120 | } 121 | 122 | export interface UrlCheckResult { 123 | url: string; 124 | screenshotUrl?: string; 125 | htmlUrl?: string; 126 | } 127 | 128 | export type ActorCheckSimplifiedOutput = { 129 | [K in keyof ActorCheckDetailedOutput]: 130 | ActorCheckDetailedOutput[K] extends Array<UrlCheckResult> 131 | ? number 132 | : ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] } 133 | ?
Record<number, number> 134 | : ActorCheckDetailedOutput[K]; 135 | }; 136 | -------------------------------------------------------------------------------- /checker-playwright/src/utils.ts: -------------------------------------------------------------------------------- 1 | import { Dictionary } from 'crawlee'; 2 | import type { ActorCheckDetailedOutput, ActorCheckSimplifiedOutput } from './typedefs.js'; 3 | 4 | export function convertDetailedOutputToSimplified(data: ActorCheckDetailedOutput): ActorCheckSimplifiedOutput { 5 | const obj: Dictionary = {}; 6 | 7 | for (const [key, value] of Object.entries(data)) { 8 | if (Array.isArray(value)) { 9 | obj[key] = value.length; 10 | } else if (typeof value === 'object') { 11 | if (!obj[key]) { 12 | obj[key] = {}; 13 | } 14 | const nestedObject: Dictionary = obj[key]; 15 | 16 | for (const [statusCode, statusValue] of Object.entries(value)) { 17 | nestedObject[statusCode] = (statusValue as any).length; 18 | } 19 | } else { 20 | obj[key] = value; 21 | } 22 | } 23 | 24 | // @ts-expect-error We are merging the objects 25 | return obj; 26 | } 27 | -------------------------------------------------------------------------------- /checker-playwright/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@apify/tsconfig", 3 | "compilerOptions": { 4 | "module": "ES2022", 5 | "target": "ES2022", 6 | "outDir": "dist", 7 | "noUnusedLocals": false, 8 | "lib": ["DOM"] 9 | }, 10 | "include": [ 11 | "./src/**/*" 12 | ] 13 | } -------------------------------------------------------------------------------- /checker-puppeteer/.gitignore: -------------------------------------------------------------------------------- 1 | # This file tells Git which files shouldn't be added to source control 2 | 3 | .DS_Store 4 | .idea 5 | dist 6 | node_modules 7 | apify_storage 8 | storage -------------------------------------------------------------------------------- /checker-puppeteer/Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://crawlee.dev/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node-puppeteer-chrome:16 AS builder 5 | 6 | # Copy just package.json and package-lock.json 7 | # to speed up the build using Docker layer cache. 8 | COPY --chown=myuser package*.json ./ 9 | 10 | # Install all dependencies. Don't audit to speed up the installation. 11 | RUN npm install --include=dev --audit=false 12 | 13 | # Next, copy the source files using the user set 14 | # in the base image. 15 | COPY --chown=myuser . ./ 16 | 17 | # Install all dependencies and build the project. 18 | # Don't audit to speed up the installation. 19 | RUN npm run build 20 | 21 | # Create final image 22 | FROM apify/actor-node-puppeteer-chrome:16 23 | 24 | # Copy only built JS files from builder image 25 | COPY --from=builder --chown=myuser /home/myuser/dist ./dist 26 | 27 | # Copy just package.json and package-lock.json 28 | # to speed up the build using Docker layer cache. 29 | COPY --chown=myuser package*.json ./ 30 | 31 | # Install NPM packages, skip optional and development dependencies to 32 | # keep the image small.
33 | # tree for debugging
34 | RUN npm --quiet set progress=false \
35 |     && npm install --omit=dev --omit=optional \
36 |     && echo "Installed NPM packages:" \
37 |     && (npm list --omit=dev --all || true) \
38 |     && echo "Node.js version:" \
39 |     && node --version \
40 |     && echo "NPM version:" \
41 |     && npm --version \
42 |     && rm -r ~/.npm
43 | 
44 | # Next, copy the remaining files and directories with the source code.
45 | # Since we do this after NPM install, quick build will be really fast
46 | # for most source file changes.
47 | COPY --chown=myuser . ./
48 | 
49 | 
50 | # Run the image. If you know you won't need headful browsers,
51 | # you can remove the XVFB start script for a micro perf gain.
52 | CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
--------------------------------------------------------------------------------
/checker-puppeteer/INPUT_SCHEMA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "title": "Web Checker",
 3 |     "description": "The web checker actor loads URLs to check and checks for common captchas and status codes returned from crawling, and calculates the price a user may pay. TODO: Needs to be more descriptive!!",
 4 |     "type": "object",
 5 |     "schemaVersion": 1,
 6 |     "properties": {
 7 |         "urlsToCheck": {
 8 |             "title": "URLs to check",
 9 |             "type": "array",
10 |             "description": "A static list of URLs to check for captchas. To be able to add new URLs on the fly, enable the Use request queue option.

For details, see Start URLs in README.", 11 | "sectionCaption": "Checker Options", 12 | "sectionDescription": "Options that will be passed to the checkers", 13 | "editor": "requestListSources", 14 | "prefill": [ 15 | { 16 | "url": "https://www.amazon.com/b?ie=UTF8&node=11392907011" 17 | } 18 | ] 19 | }, 20 | "proxyConfiguration": { 21 | "title": "Proxy Configuration", 22 | "type": "object", 23 | "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.

For details, see Proxy configuration in README.", 24 | "default": {}, 25 | "editor": "proxy", 26 | "prefill": { 27 | "useApifyProxy": false 28 | } 29 | }, 30 | "saveSnapshot": { 31 | "title": "Enabled", 32 | "type": "boolean", 33 | "description": "Will save HTML for Cheerio and HTML + screenshot for Puppeteer/Playwright", 34 | "editor": "checkbox", 35 | "groupCaption": "Save Snapshots" 36 | }, 37 | "linkSelector": { 38 | "title": "Link Selector", 39 | "type": "string", 40 | "description": "A CSS selector saying which links on the page (<a> elements with href attribute) shall be followed and added to the request queue. This setting only applies if Use request queue is enabled. To filter the links added to the queue, use the Pseudo-URLs setting.

If Link selector is empty, the page links are ignored.

For details, see Link selector in README.", 41 | "sectionCaption": "Crawler Options", 42 | "sectionDescription": "Specific options that are relevant for crawlers", 43 | "editor": "textfield", 44 | "prefill": "a[href]", 45 | "minLength": 1 46 | }, 47 | "pseudoUrls": { 48 | "title": "Pseudo-URLs", 49 | "type": "array", 50 | "description": "Specifies what kind of URLs found by Link selector should be added to the request queue. A pseudo-URL is a URL with regular expressions enclosed in [] brackets, e.g. http://www.example.com/[.*]. This setting only applies if the Use request queue option is enabled.

If Pseudo-URLs are omitted, the actor enqueues all links matched by the Link selector.

For details, see Pseudo-URLs in README.", 51 | "default": [], 52 | "editor": "pseudoUrls", 53 | "prefill": [ 54 | { 55 | "purl": "https://www.amazon.com[.*]/dp/[.*]" 56 | } 57 | ] 58 | }, 59 | "repeatChecksOnProvidedUrls": { 60 | "title": "Repeat checks on provided URLs", 61 | "type": "integer", 62 | "description": "Will access each URL multiple times. Useful to test the same URL or bypass blocking of the first page.", 63 | "editor": "number" 64 | }, 65 | "maxNumberOfPagesCheckedPerDomain": { 66 | "title": "Max number of pages checked per domain", 67 | "type": "integer", 68 | "description": "The maximum number of pages that the checker will load. The checker will stop when this limit is reached. It's always a good idea to set this limit in order to prevent excess platform usage for misconfigured scrapers. Note that the actual number of pages loaded might be slightly higher than this value.

If set to 0, there is no limit.",
 69 |             "default": 100,
 70 |             "editor": "number"
 71 |         },
 72 |         "maxConcurrentPagesCheckedPerDomain": {
 73 |             "title": "Maximum concurrent pages checked per domain",
 74 |             "type": "integer",
 75 |             "description": "Specifies the maximum number of pages that can be processed by the checker in parallel for one domain. The checker automatically increases and decreases concurrency based on available system resources. This option enables you to set an upper limit, for example to reduce the load on a target website.",
 76 |             "default": 50,
 77 |             "editor": "number",
 78 |             "minimum": 1
 79 |         },
 80 |         "maxConcurrentDomainsChecked": {
 81 |             "title": "Maximum number of concurrent domains checked",
 82 |             "type": "integer",
 83 |             "description": "Specifies the maximum number of domains that should be checked at a time. This setting is relevant when passing in more than one URL to check.",
 84 |             "default": 5,
 85 |             "editor": "number",
 86 |             "minimum": 1,
 87 |             "maximum": 10
 88 |         },
 89 |         "retireBrowserInstanceAfterRequestCount": {
 90 |             "title": "Retire browser instance after request count",
 91 |             "type": "integer",
 92 |             "description": "How often the browser itself will rotate. Pick a higher number for lower consumption; pick a lower number to rotate (and therefore test) more proxies.",
 93 |             "default": 10,
 94 |             "editor": "number",
 95 |             "minimum": 1
 96 |         },
 97 |         "puppeteer.headfull": {
 98 |             "title": "Headfull browser (XVFB)",
 99 |             "type": "boolean",
100 |             "description": "Only works for the Puppeteer type!",
101 |             "sectionCaption": "Puppeteer Options",
102 |             "sectionDescription": "Options that are passed in to puppeteer when checking",
103 |             "editor": "checkbox"
104 |         },
105 |         "puppeteer.useChrome": {
106 |             "title": "Use Chrome",
107 |             "type": "boolean",
108 |             "description": "Only works for the Puppeteer type! Be careful that Chrome is not guaranteed to work with Puppeteer.",
109 |             "editor": "checkbox"
110 |         },
111 |         "puppeteer.waitFor": {
112 |             "title": "Wait for",
113 |             "type": "string",
114 |             "description": "Only works for the Puppeteer type. Will wait on each page. You can provide a number in ms or a CSS selector.",
115 |             "editor": "textfield"
116 |         }
117 |     },
118 |     "required": ["urlsToCheck"]
119 | }
120 | 
--------------------------------------------------------------------------------
/checker-puppeteer/README.md:
--------------------------------------------------------------------------------
1 | # Website Checker Runner with Puppeteer
2 | 
3 | Checks the provided website using Puppeteer. This is a low-level runner; most likely you want to use the high-level master actor instead: https://apify.com/lukaskrivka/website-checker
--------------------------------------------------------------------------------
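For orientation, a minimal input for this runner is sketched below as a TypeScript object. The field names follow the INPUT_SCHEMA.json above; the values are simply the schema's prefills and defaults, not recommendations:

```
// Illustrative input for the checker-puppeteer runner (values from the schema prefills).
const input = {
    urlsToCheck: [{ url: 'https://www.amazon.com/b?ie=UTF8&node=11392907011' }],
    proxyConfiguration: { useApifyProxy: false },
    saveSnapshot: true,
    linkSelector: 'a[href]',
    pseudoUrls: [{ purl: 'https://www.amazon.com[.*]/dp/[.*]' }],
    maxNumberOfPagesCheckedPerDomain: 100,
    maxConcurrentPagesCheckedPerDomain: 50,
    retireBrowserInstanceAfterRequestCount: 10,
    'puppeteer.waitFor': '2000',
};
```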
/checker-puppeteer/apify.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "checker-puppeteer",
3 |     "version": "0.0",
4 |     "buildTag": "latest",
5 |     "env": null
6 | }
7 | 
--------------------------------------------------------------------------------
/checker-puppeteer/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "crawlee-puppeteer-typescript",
 3 |     "version": "0.0.1",
 4 |     "type": "module",
 5 |     "description": "This is an example of an Apify actor.",
 6 |     "engines": {
 7 |         "node": ">=16.0.0"
 8 |     },
 9 |     "dependencies": {
10 |         "apify": "^3.0.0",
11 |         "crawlee": "^3.0.0",
12 |         "puppeteer": "*",
13 |         "cheerio": "^1.0.0-rc.10"
14 |     },
15 |     "devDependencies": {
16 |         "@apify/eslint-config-ts": "^0.2.3",
17 |         "@apify/tsconfig": "^0.1.0",
18 |         "@typescript-eslint/eslint-plugin": "^5.32.0",
19 |         "@typescript-eslint/parser": "^5.32.0",
20 |         "eslint": "^8.20.0",
21 |         "ts-node": "^10.9.1",
22 |         "typescript": "4.7.4"
23 |     },
24 |     "scripts": {
25 |         "start": "npm run start:dev",
26 |         "start:prod": "node dist/main.js",
27 |         "start:dev": "ts-node-esm -T src/main.ts",
28 |         "build": "tsc",
29 |         "lint": "eslint ./src --ext .ts",
30 |         "lint:fix": "eslint ./src --ext .ts --fix",
31 |         "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
32 |     },
33 |     "author": "It's not you it's me",
34 |     "license": "ISC"
35 | }
36 | 
--------------------------------------------------------------------------------
/checker-puppeteer/src/checkers.ts:
--------------------------------------------------------------------------------
 1 | import type { CheerioAPI } from 'cheerio';
 2 | 
 3 | export function distilCaptcha($: CheerioAPI): boolean {
 4 |     return $('#distilCaptchaForm').length > 0
 5 |         || $('[action*="distil_r_captcha.html"]').length > 0;
 6 | }
 7 | 
 8 | export function recaptcha($: CheerioAPI): boolean {
 9 |     return $('#recaptcha').length > 0
10 |         || $('iframe[src*="/recaptcha/"]').length > 0;
11 | }
12 | 
13 | export function hCaptcha($: CheerioAPI): boolean {
14 |     return $('[action="/errors/validateCaptcha"]').length > 0;
15 | }
16 | 
17 | export function accessDenied($: CheerioAPI): boolean {
18 |     return $('title').text().includes('Access Denied');
19 | }
20 | 
21 | export function testHtml($: CheerioAPI) {
22 |     return {
23 |         accessDenied: accessDenied($),
24 |         distilCaptcha: distilCaptcha($),
25 |         recaptcha: recaptcha($),
26 |         hCaptcha: hCaptcha($),
27 |     };
28 | }
--------------------------------------------------------------------------------
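A quick sketch of how these detectors behave. The page handler below feeds them page HTML through Cheerio; here the HTML is a contrived one-liner that only trips the `Access Denied` title check:

```
import Cheerio from 'cheerio';
import { testHtml } from './checkers.js';

// A page whose <title> trips the accessDenied detector; no captcha markers present.
const $ = Cheerio.load('<html><head><title>Access Denied</title></head><body></body></html>');

console.log(testHtml($));
// -> { accessDenied: true, distilCaptcha: false, recaptcha: false, hCaptcha: false }
```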
/checker-puppeteer/src/handleFailedRequest.ts:
--------------------------------------------------------------------------------
 1 | import { log } from 'crawlee';
 2 | 
 3 | import type { PuppeteerCrawlingContext } from 'crawlee';
 4 | 
 5 | import { ActorCheckDetailedOutput } from './typedefs.js';
 6 | 
 7 | export async function handleFailedRequest(state: ActorCheckDetailedOutput, { request }: PuppeteerCrawlingContext) {
 8 |     state.totalPages.push({ url: request.url });
 9 | 
10 |     const [error] = request.errorMessages;
11 |     log.warning(`Request failed --- ${request.url}\n${error}`);
12 | 
13 |     if (error.includes('request timed out')) {
14 |         state.timedOut.push({ url: request.url });
15 |     } else {
16 |         state.failedToLoadOther.push({ url: request.url });
17 |     }
18 | 
19 |     // The crawler obscures status codes >= 500 into a string error message (same handling as the Cheerio checker), so we parse the code back out
20 |     const maybeStatusCheerio = error.match(/(\d\d\d) - Internal Server Error/);
21 |     if (maybeStatusCheerio) {
22 |         const statusCode = Number(maybeStatusCheerio[1]);
23 |         state.statusCodes[statusCode] ??= [];
24 |         state.statusCodes[statusCode].push({ url: request.url });
25 |     }
26 | }
27 | 
--------------------------------------------------------------------------------
/checker-puppeteer/src/handlePage.ts:
--------------------------------------------------------------------------------
  1 | import { Actor } from 'apify';
  2 | import Cheerio from 'cheerio';
  3 | import { testHtml } from './checkers.js';
  4 | import { puppeteerUtils, PseudoUrl } from 'crawlee';
  5 | 
  6 | import type { RequestQueue } from 'apify';
  7 | import type { PuppeteerCrawlingContext, RequestOptions } from 'crawlee';
  8 | 
  9 | import type { ActorCheckDetailedOutput, PuppeteerActorInput } from './typedefs.js';
 10 | 
 11 | export async function handlePage(
 12 |     input: PuppeteerActorInput,
 13 |     requestQueue: RequestQueue,
 14 |     state: ActorCheckDetailedOutput,
 15 |     { request, response, page, crawler }: PuppeteerCrawlingContext
 16 | ): Promise<void> {
 17 |     let htmlUrl;
 18 |     let screenshotUrl;
 19 | 
 20 |     const waitFor = input['puppeteer.waitFor'];
 21 | 
 22 |     if (waitFor) {
 23 |         // We wait for a number in ms or a selector
 24 |         const maybeNumber = Number(waitFor);
 25 |         if (maybeNumber || maybeNumber === 0) {
 26 |             await page.waitForTimeout(maybeNumber);
 27 |         } else {
 28 |             await page.waitForSelector(waitFor);
 29 |         }
 30 |     }
 31 | 
 32 |     if (input.saveSnapshot) {
 33 |         const key = `SNAPSHOT-${Math.random().toString()}`;
 34 |         await puppeteerUtils.saveSnapshot(page, { key });
 35 |         screenshotUrl = `https://api.apify.com/v2/key-value-stores/${Actor.getEnv().defaultKeyValueStoreId}/records/${key}.jpg?disableRedirect=true`;
 36 |         htmlUrl = `https://api.apify.com/v2/key-value-stores/${Actor.getEnv().defaultKeyValueStoreId}/records/${key}.html?disableRedirect=true`;
 37 |     }
 38 | 
 39 |     state.totalPages.push({ url: request.url, htmlUrl, screenshotUrl });
 40 | 
 41 |     const statusCode = response!.status();
 42 | 
 43 |     state.statusCodes[statusCode] ??= [];
 44 |     state.statusCodes[statusCode].push({ url: request.url, htmlUrl, screenshotUrl });
 45 | 
 46 |     const html = await page.content();
 47 |     const $ = Cheerio.load(html);
 48 | 
 49 |     const captchas: string[] = [];
 50 |     const testResult = testHtml($);
 51 | 
 52 |     for (const testResultEntry of Object.entries(testResult)) {
 53 |         const wasFound = testResultEntry[1];
 54 |         const testCase = testResultEntry[0] as 'accessDenied' | 'distilCaptcha' | 'recaptcha' | 'hCaptcha';
 55 |         if (wasFound) {
 56 |             captchas.push(testCase);
 57 | 
 58 |             state[testCase].push({ url: request.url, htmlUrl });
 59 |         }
 60 |     }
 61 | 
 62 |     const wasSuccess = statusCode < 400 && captchas.length === 0;
 63 |     if (wasSuccess) {
 64 |         state.success.push({ url: request.url, htmlUrl, screenshotUrl });
 65 |     }
 66 | 
 67 |     await Actor.pushData({
 68 |         url: request.url,
 69 |         htmlUrl,
 70 |         screenshotUrl,
 71 |         statusCode,
 72 |         captchas,
 73 |         wasSuccess,
 74 |     });
 75 | 
 76 |     const pageOrigin = new URL(request.url).origin;
 77 | 
 78 |     if (input.linkSelector && !!$) {
 79 |         const info = await requestQueue.getInfo();
 80 | 
 81 |         const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
 82 |         if (maxUrlsToEnqueue > 0) {
 83 |             const toEnqueue: RequestOptions[] = [];
 84 |             $(input.linkSelector).each((_, el) => {
 85 |                 const rawHref = $(el).attr('href');
 86 |                 if (!rawHref) {
 87 |                     return;
 88 |                 }
 89 |                 const href = new URL(rawHref, pageOrigin).toString();
 90 |                 for (const pseudoUrlInput of input.pseudoUrls) {
 91 |                     if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
 92 |                         const newUrl = new URL(href, request.loadedUrl).toString();
 93 |                         toEnqueue.push({
 94 |                             url: newUrl,
 95 |                             headers: pseudoUrlInput.headers,
 96 |                             method: pseudoUrlInput.method as 'GET' | 'POST',
 97 |                             payload: pseudoUrlInput.payload,
 98 |                             userData: pseudoUrlInput.userData,
 99 |                         });
100 |                     }
101 |                 }
102 |             });
103 |             console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`);
104 |             await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue));
105 |         }
106 |     }
107 | }
108 | 
--------------------------------------------------------------------------------
/checker-puppeteer/src/main.ts:
--------------------------------------------------------------------------------
 1 | import { Actor } from 'apify';
 2 | import { log, PuppeteerCrawler, RequestOptions } from 'crawlee';
 3 | 
 4 | import type { ActorCheckDetailedOutput, PuppeteerActorInput } from './typedefs.js';
 5 | 
 6 | import { inspect } from 'util';
 7 | import { handleFailedRequest } from './handleFailedRequest.js';
 8 | import { handlePage } from './handlePage.js';
 9 | import { convertDetailedOutputToSimplified } from './utils.js';
10 | 
11 | Actor.main(async () => {
12 |     const input = await Actor.getInput() as PuppeteerActorInput;
13 | 
14 |     // Log the input
15 |     log.info('Input provided:');
16 |     log.debug(inspect(input, false, 4));
17 | 
18 |     log.info('Running a Puppeteer Checker.');
19 | 
20 |     const env = Actor.getEnv();
21 | 
22 |     const {
23 |         maxConcurrentPagesCheckedPerDomain,
24 |         maxNumberOfPagesCheckedPerDomain,
25 |         proxyConfiguration,
26 |         urlsToCheck,
27 |         repeatChecksOnProvidedUrls,
28 |         retireBrowserInstanceAfterRequestCount,
29 |         'puppeteer.useChrome': useChrome,
30 |         'puppeteer.headfull': headfull,
31 |     } = input;
32 | 
33 |     const proxy = await Actor.createProxyConfiguration({
34 |         groups: proxyConfiguration.apifyProxyGroups,
35 |         countryCode: proxyConfiguration.apifyProxyCountry,
36 |     });
37 | 
38 |     const requestQueue = await Actor.openRequestQueue();
39 | 
40 |     const [urlData] = urlsToCheck;
41 |     await requestQueue.addRequest(urlData as RequestOptions);
42 |     for (let _ = 0; _ < (repeatChecksOnProvidedUrls ?? 0); _++) {
43 |         await requestQueue.addRequest({
44 |             ...urlData,
45 |             uniqueKey: Math.random().toString(),
46 |         } as RequestOptions);
47 |     }
48 | 
49 |     const state: ActorCheckDetailedOutput = {
50 |         url: urlData.url,
51 |         checkerType: 'puppeteer',
52 |         simplifiedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`,
53 |         detailedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`,
54 |         totalPages: [],
55 |         timedOut: [],
56 |         failedToLoadOther: [],
57 |         accessDenied: [],
58 |         success: [],
59 |         statusCodes: {},
60 |         recaptcha: [],
61 |         distilCaptcha: [],
62 |         hCaptcha: [],
63 |     };
64 | 
65 |     const crawler = new PuppeteerCrawler({
66 |         maxRequestRetries: 0,
67 |         maxRequestsPerCrawl: maxNumberOfPagesCheckedPerDomain,
68 |         maxConcurrency: maxConcurrentPagesCheckedPerDomain,
69 |         requestQueue,
70 |         requestHandler: (pageInputs) => handlePage(input, requestQueue, state, pageInputs),
71 |         failedRequestHandler: (requestInput) => handleFailedRequest(state, requestInput),
72 |         proxyConfiguration: proxy,
73 |         useSessionPool: false,
74 |         launchContext: {
75 |             useChrome,
76 |             launchOptions: {
77 |                 headless: headfull ? undefined : true,
78 |             },
79 |         },
80 |         browserPoolOptions: {
81 |             retireBrowserAfterPageCount: retireBrowserInstanceAfterRequestCount,
82 |         },
83 |     });
84 | 
85 |     await crawler.run();
86 | 
87 |     await Actor.setValue('OUTPUT', convertDetailedOutputToSimplified(state));
88 |     await Actor.setValue('DETAILED-OUTPUT', state);
89 | 
90 |     log.info('Checker finished.');
91 |     log.info(
92 |         `Simplified output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`,
93 |     );
94 |     log.info(
95 |         `Detailed output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`,
96 |     );
97 |     log.info(`Preview dataset: https://api.apify.com/v2/datasets/${env.defaultDatasetId}/items?clean=true&format=html`);
98 | });
99 | 
--------------------------------------------------------------------------------
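One non-obvious detail in the loop above: the request queue deduplicates requests by `uniqueKey`, which is derived from the URL by default, so re-adding the same URL would normally be a no-op. The random `uniqueKey` is what makes `repeatChecksOnProvidedUrls` actually revisit an identical URL. A stripped-down sketch:

```
import { Actor } from 'apify';

const requestQueue = await Actor.openRequestQueue();

// Without a uniqueKey the second addRequest would be deduplicated away;
// the random key forces an extra, independent check of the same URL.
await requestQueue.addRequest({ url: 'https://example.com' });
await requestQueue.addRequest({ url: 'https://example.com', uniqueKey: Math.random().toString() });
```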
/checker-puppeteer/src/typedefs.ts:
--------------------------------------------------------------------------------
 1 | import type { CheerioCrawlingContext } from 'crawlee';
 2 | 
 3 | type KeysNotRequired =
 4 |     | 'checkers.cheerio'
 5 |     | 'checkers.puppeteer'
 6 |     | 'checkers.playwright'
 7 |     | 'playwright.chrome'
 8 |     | 'playwright.firefox'
 9 |     | 'playwright.webkit'
10 |     | 'maxConcurrentDomainsChecked';
11 | 
12 | export type PuppeteerActorInput = Omit<ActorInputData, KeysNotRequired>;
13 | 
14 | export type CheerioCheckerHandlePageInputs = CheerioCrawlingContext;
15 | 
16 | export interface PseudoUrlInputCustom {
17 |     purl: string;
18 |     method?: string;
19 |     payload?: string;
20 |     userData?: Record<string, unknown>;
21 |     headers?: Record<string, string>;
22 | }
23 | 
24 | export interface UrlInput {
25 |     url: string;
26 |     method?: string;
27 |     payload?: string;
28 |     userData?: Record<string, unknown>;
29 |     headers?: Record<string, string>;
30 | }
31 | 
32 | export interface ProxyConfiguration {
33 |     useApifyProxy: boolean;
34 |     apifyProxyGroups?: string[];
35 |     apifyProxyCountry?: string;
36 | }
37 | 
38 | export interface ActorInputData {
39 |     // Crawlers to use
40 |     'checkers.cheerio'?: boolean;
41 |     'checkers.puppeteer'?: boolean;
42 |     'checkers.playwright'?: boolean;
43 | 
44 |     // Pass these to crawlers
45 | 
46 |     // save snapshots
47 |     saveSnapshot?: boolean;
48 | 
49 |     // General options
50 |     urlsToCheck: UrlInput[];
51 |     proxyConfiguration: ProxyConfiguration;
 52 |     linkSelector?: string;
 53 |     pseudoUrls: PseudoUrlInputCustom[];
 54 |     repeatChecksOnProvidedUrls?: number;
 55 |     maxNumberOfPagesCheckedPerDomain: number;
 56 |     maxConcurrentPagesCheckedPerDomain: number;
 57 |     maxConcurrentDomainsChecked: number;
 58 |     retireBrowserInstanceAfterRequestCount: number;
 59 | 
 60 |     // Pass only to puppeteer
 61 |     'puppeteer.headfull'?: boolean;
 62 |     'puppeteer.useChrome'?: boolean;
 63 |     'puppeteer.waitFor'?: string;
 64 | 
 65 |     // Pass only to playwright
 66 |     'playwright.chrome'?: boolean;
 67 |     'playwright.firefox'?: boolean;
 68 |     'playwright.webkit'?: boolean;
 69 |     'playwright.headfull'?: boolean;
 70 |     'playwright.useChrome'?: boolean;
 71 |     'playwright.waitFor'?: string;
 72 | }
 73 | 
 74 | export interface PreparedActorConfig {
 75 |     actorId: string;
 76 |     proxyUsed?: string;
 77 |     url: string;
 78 |     input: ActorInputData;
 79 |     params: {
 80 |         memory: number;
 81 |         timeout: number;
 82 |     };
 83 |     // This data is set when the config is run
 84 |     runId?: string;
 85 | }
 86 | 
 87 | export interface CreateActorRunConfig {
 88 |     checkerId: string;
 89 |     input: ActorInputData;
 90 |     urlData: UrlInput;
 91 |     playwrightBrowser?: 'chrome' | 'firefox' | 'webkit';
 92 | }
 93 | 
 94 | // --- OUTPUT ---
 95 | 
 96 | export interface ActorCheckDetailedOutput {
 97 |     // Set by waitForRunToFinishAndPushData
 98 |     proxyUsed?: string;
 99 |     checkerType: 'cheerio' | 'puppeteer' | 'playwright';
100 |     playwrightBrowser?: 'chrome' | 'firefox' | 'webkit';
101 |     computeUnitsUsedForThisCheck?: number;
102 |     // (totalPages.length / computeUnitsUsedForThisCheck) yields the amount of pages checkable per compute unit
103 |     pagesPerComputeUnit?: number;
104 | 
105 |     // URLs
106 |     url: string;
107 |     simplifiedOutput: string;
108 |     detailedOutput: string;
109 | 
110 |     // Page data
111 |     totalPages: UrlCheckResult[];
112 |     timedOut: UrlCheckResult[];
113 |     failedToLoadOther: UrlCheckResult[];
114 |     accessDenied: UrlCheckResult[];
115 |     success: UrlCheckResult[];
116 | 
117 |     // Status codes
118 |     statusCodes: Record<number, UrlCheckResult[]>;
119 | 
120 |     // Captcha time
121 |     recaptcha: UrlCheckResult[];
122 |     distilCaptcha: UrlCheckResult[];
123 |     hCaptcha: UrlCheckResult[];
124 | }
125 | 
126 | export interface UrlCheckResult {
127 |     url: string;
128 |     screenshotUrl?: string;
129 |     htmlUrl?: string;
130 | }
131 | 
132 | export type ActorCheckSimplifiedOutput = {
133 |     [K in keyof ActorCheckDetailedOutput]:
134 |         ActorCheckDetailedOutput[K] extends Array<UrlCheckResult>
135 |             ? number
136 |             : ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] }
137 |                 ? Record<number, number>
138 |                 : ActorCheckDetailedOutput[K];
139 | };
--------------------------------------------------------------------------------
/checker-puppeteer/src/utils.ts:
--------------------------------------------------------------------------------
 1 | import { Dictionary } from 'crawlee';
 2 | import type { ActorCheckDetailedOutput, ActorCheckSimplifiedOutput } from './typedefs.js';
 3 | 
 4 | export function convertDetailedOutputToSimplified(data: ActorCheckDetailedOutput): ActorCheckSimplifiedOutput {
 5 |     const obj: Dictionary = {};
 6 | 
 7 |     for (const [key, value] of Object.entries(data)) {
 8 |         if (Array.isArray(value)) {
 9 |             obj[key] = value.length;
10 |         } else if (typeof value === 'object') {
11 |             if (!obj[key]) {
12 |                 obj[key] = {};
13 |             }
14 |             const nestedObject: Dictionary = obj[key];
15 | 
16 |             for (const [statusCode, statusValue] of Object.entries(value)) {
17 |                 nestedObject[statusCode] = (statusValue as any).length;
18 |             }
19 |         } else {
20 |             obj[key] = value;
21 |         }
22 |     }
23 | 
24 |     // @ts-expect-error We are merging the objects
25 |     return obj;
26 | }
--------------------------------------------------------------------------------
/checker-puppeteer/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "extends": "@apify/tsconfig",
 3 |     "compilerOptions": {
 4 |         "module": "ES2022",
 5 |         "target": "ES2022",
 6 |         "outDir": "dist",
 7 |         "noUnusedLocals": false,
 8 |         "lib": ["DOM"],
 9 |         "skipLibCheck": true
10 |     },
11 |     "include": [
12 |         "./src/**/*"
13 |     ]
14 | }
--------------------------------------------------------------------------------
/starter/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apify-projects/store-website-checker/4600159968d7289e023c071ad72c22bc5f3e4570/starter/.DS_Store
--------------------------------------------------------------------------------
/starter/.actor/actor.json:
--------------------------------------------------------------------------------
1 | {
2 |     "actorSpecification": 1,
3 |     "name": "starter",
4 |     "version": "0.0",
5 |     "buildTag": "latest"
6 | }
7 | 
--------------------------------------------------------------------------------
/starter/.eslintrc:
--------------------------------------------------------------------------------
 1 | {
 2 |     "root": true,
 3 |     "env": {
 4 |         "browser": true,
 5 |         "es2020": true,
 6 |         "node": true
 7 |     },
 8 |     "extends": [
 9 |         "@apify/eslint-config-ts"
10 |     ],
11 |     "parserOptions": {
12 |         "project": "./tsconfig.json",
13 |         "ecmaVersion": 2020
14 |     },
15 |     "ignorePatterns": [
16 |         "node_modules",
17 |         "dist",
18 |         "**/*.d.ts"
19 |     ]
20 | }
--------------------------------------------------------------------------------
/starter/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | apify_storage
3 | dist
4 | 
5 | storage
6 | # Added by Apify CLI
7 | .venv
--------------------------------------------------------------------------------
/starter/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Specify the base Docker image. You can read more about
 2 | # the available images at https://crawlee.dev/docs/guides/docker-images
 3 | # You can also use any other image from Docker Hub.
 4 | FROM apify/actor-node:16 AS builder
 5 | 
 6 | # Copy just package.json and package-lock.json
 7 | # to speed up the build using Docker layer cache.
 8 | COPY package*.json ./
 9 | 
10 | # Install all dependencies. Don't audit to speed up the installation.
11 | RUN npm install --include=dev --audit=false
12 | 
13 | # Next, copy the source files using the user set
14 | # in the base image.
15 | COPY . ./
16 | 
17 | # Install all dependencies and build the project.
18 | # Don't audit to speed up the installation.
19 | RUN npm run build
20 | 
21 | # Create final image
22 | FROM apify/actor-node:16
23 | 
24 | # Copy only built JS files from builder image
25 | COPY --from=builder /usr/src/app/dist ./dist
26 | 
27 | # Copy just package.json and package-lock.json
28 | # to speed up the build using Docker layer cache.
29 | COPY package*.json ./
30 | 
31 | # Install NPM packages, skip optional and development dependencies to
32 | # keep the image small. Avoid logging too much and print the dependency
33 | # tree for debugging
34 | RUN npm --quiet set progress=false \
35 |     && npm install --omit=dev --omit=optional \
36 |     && echo "Installed NPM packages:" \
37 |     && (npm list --omit=dev --all || true) \
38 |     && echo "Node.js version:" \
39 |     && node --version \
40 |     && echo "NPM version:" \
41 |     && npm --version \
42 |     && rm -r ~/.npm
43 | 
44 | # Next, copy the remaining files and directories with the source code.
45 | # Since we do this after NPM install, quick build will be really fast
46 | # for most source file changes.
47 | COPY . ./
48 | 
49 | 
50 | # Run the image.
51 | CMD npm run start:prod --silent
--------------------------------------------------------------------------------
/starter/INPUT_SCHEMA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "title": "Web Checker",
 3 |     "description": "The web checker actor loads URLs to check and checks for common captchas and status codes returned from crawling, and calculates the price a user may pay.",
 4 |     "type": "object",
 5 |     "schemaVersion": 1,
 6 |     "properties": {
 7 |         "urlsToCheck": {
 8 |             "title": "URLs to check",
 9 |             "type": "array",
10 |             "description": "A static list of URLs to check for captchas. To be able to add new URLs on the fly, enable the Use request queue option.

For details, see Start URLs in README.", 11 | "sectionCaption": "Checker Options", 12 | "sectionDescription": "Options that will be passed to the checkers", 13 | "editor": "requestListSources", 14 | "prefill": [ 15 | { 16 | "url": "https://www.amazon.com/b?ie=UTF8&node=11392907011" 17 | } 18 | ] 19 | }, 20 | "proxyConfiguration": { 21 | "title": "Proxy Configuration", 22 | "type": "object", 23 | "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.

For details, see Proxy configuration in README.", 24 | "default": {}, 25 | "editor": "proxy", 26 | "prefill": { 27 | "useApifyProxy": true, 28 | "apifyProxyGroups": [ 29 | "SHADER", 30 | "BUYPROXIES94952", 31 | "RESIDENTIAL" 32 | ] 33 | } 34 | }, 35 | "checkers.cheerio": { 36 | "title": "Cheerio", 37 | "type": "boolean", 38 | "description": "Crawl with Cheerio", 39 | "default": true, 40 | "editor": "checkbox", 41 | "groupCaption": "Crawlers to use", 42 | "groupDescription": "Select which crawler types should be used for checking these domains" 43 | }, 44 | "checkers.puppeteer": { 45 | "title": "Puppeteer", 46 | "type": "boolean", 47 | "description": "Crawl with Puppeteer", 48 | "default": true, 49 | "editor": "checkbox" 50 | }, 51 | "checkers.playwright": { 52 | "title": "Playwright", 53 | "type": "boolean", 54 | "description": "Crawl with Playwright", 55 | "editor": "checkbox", 56 | "default": true 57 | }, 58 | "saveSnapshot": { 59 | "title": "Enabled", 60 | "type": "boolean", 61 | "description": "Will save HTML for Cheerio and HTML + screenshot for Puppeteer/Playwright", 62 | "editor": "checkbox", 63 | "groupCaption": "Save Snapshots", 64 | "default": true 65 | }, 66 | "enqueueAllOnDomain": { 67 | "title": "Enqueue any URL on domain (no need for link selector or pseudo URLs)", 68 | "type": "boolean", 69 | "description": "Will enqueue any URLs on the domain", 70 | "default": true 71 | }, 72 | "linkSelector": { 73 | "title": "Link Selector", 74 | "type": "string", 75 | "description": "A CSS selector saying which links on the page (<a> elements with href attribute) shall be followed and added to the request queue. This setting only applies if Use request queue is enabled. To filter the links added to the queue, use the Pseudo-URLs setting.

If Link selector is empty, the page links are ignored.

For details, see Link selector in README.", 76 | "sectionCaption": "Crawler Options", 77 | "sectionDescription": "Specific options that are relevant for crawlers", 78 | "editor": "textfield" 79 | }, 80 | "pseudoUrls": { 81 | "title": "Pseudo-URLs", 82 | "type": "array", 83 | "description": "Specifies what kind of URLs found by Link selector should be added to the request queue. A pseudo-URL is a URL with regular expressions enclosed in [] brackets, e.g. http://www.example.com/[.*]. This setting only applies if the Use request queue option is enabled.

If Pseudo-URLs are omitted, the actor enqueues all links matched by the Link selector.

For details, see Pseudo-URLs in README.", 84 | "default": [], 85 | "editor": "pseudoUrls" 86 | }, 87 | "repeatChecksOnProvidedUrls": { 88 | "title": "Repeat checks on provided URLs", 89 | "type": "integer", 90 | "description": "Will access each URL multiple times. Useful to test the same URL or bypass blocking of the first page.", 91 | "editor": "number", 92 | "prefill": 10 93 | }, 94 | "maxNumberOfPagesCheckedPerDomain": { 95 | "title": "Max number of pages checked per domain", 96 | "type": "integer", 97 | "description": "The maximum number of pages that the checker will load. The checker will stop when this limit is reached. It's always a good idea to set this limit in order to prevent excess platform usage for misconfigured scrapers. Note that the actual number of pages loaded might be slightly higher than this value.

If set to 0, there is no limit.",
 98 |             "prefill": 1000,
 99 |             "editor": "number"
100 |         },
101 |         "maxConcurrentPagesCheckedPerDomain": {
102 |             "title": "Maximum concurrent pages checked per domain",
103 |             "type": "integer",
104 |             "description": "Specifies the maximum number of pages that can be processed by the checker in parallel for one domain. The checker automatically increases and decreases concurrency based on available system resources. This option enables you to set an upper limit, for example to reduce the load on a target website.",
105 |             "default": 500,
106 |             "editor": "number",
107 |             "minimum": 1
108 |         },
109 |         "maxConcurrentDomainsChecked": {
110 |             "title": "Maximum number of concurrent domains checked",
111 |             "type": "integer",
112 |             "description": "Specifies the maximum number of domains that should be checked at a time. This setting is relevant when passing in more than one URL to check.",
113 |             "default": 5,
114 |             "editor": "number",
115 |             "minimum": 1,
116 |             "maximum": 10
117 |         },
118 |         "retireBrowserInstanceAfterRequestCount": {
119 |             "title": "Retire browser instance after request count",
120 |             "type": "integer",
121 |             "description": "How often the browser itself will rotate. Pick a higher number for lower consumption; pick a lower number to rotate (and therefore test) more proxies.",
122 |             "default": 10,
123 |             "editor": "number",
124 |             "minimum": 1
125 |         },
126 |         "navigationTimeoutSecs": {
127 |             "title": "Navigation timeout (seconds)",
128 |             "type": "integer",
129 |             "description": "Specifies the maximum time in seconds the request will wait for the page to load. If the page is not loaded within this time, the browser will throw an error and the page will be marked as failed.",
130 |             "default": 60,
131 |             "minimum": 1
132 |         },
133 |         "puppeteer.headfull": {
134 |             "title": "Headfull browser (XVFB)",
135 |             "type": "boolean",
136 |             "description": "Only works for the Puppeteer type!",
137 |             "sectionCaption": "Puppeteer Options",
138 |             "sectionDescription": "Options that are passed in to puppeteer when checking",
139 |             "editor": "checkbox"
140 |         },
141 |         "puppeteer.useChrome": {
142 |             "title": "Use Chrome",
143 |             "type": "boolean",
144 |             "description": "Only works for the Puppeteer type! Be careful that Chrome is not guaranteed to work with Puppeteer.",
145 |             "editor": "checkbox"
146 |         },
147 |         "puppeteer.waitFor": {
148 |             "title": "Wait for",
149 |             "type": "string",
150 |             "description": "Only works for the Puppeteer type. Will wait on each page. You can provide a number in ms or a CSS selector.",
151 |             "editor": "textfield",
152 |             "default": "2000"
153 |         },
154 |         "puppeteer.memory": {
155 |             "title": "Memory",
156 |             "type": "integer",
157 |             "unit": "MB",
158 |             "default": 4096,
159 |             "minimum": 1024,
160 |             "maximum": 32768,
161 |             "description": "Must be a power of 2 between 128 and 32768."
162 |         },
163 |         "playwright.chrome": {
164 |             "title": "Chrome",
165 |             "type": "boolean",
166 |             "description": "Use Chrome when checking",
167 |             "default": false,
168 |             "sectionCaption": "Playwright options",
169 |             "sectionDescription": "Options passed to playwright when checking",
170 |             "editor": "checkbox",
171 |             "groupCaption": "Browser type",
172 |             "groupDescription": "Which type of browser should the checker use"
173 |         },
174 |         "playwright.firefox": {
175 |             "title": "Firefox",
176 |             "type": "boolean",
177 |             "description": "Use Firefox when checking",
178 |             "editor": "checkbox",
179 |             "default": true
180 |         },
181 |         "playwright.webkit": {
182 |             "title": "Safari (Webkit)",
183 |             "type": "boolean",
184 |             "description": "Use Safari when checking",
185 |             "editor": "checkbox"
186 |         },
187 |         "playwright.useChrome": {
188 |             "title": "Use Chrome instead of Chromium",
189 |             "type": "boolean",
190 |             "description": "Only works for the Playwright type! Be careful that Chrome is not guaranteed to work with Playwright.",
191 |             "editor": "checkbox"
192 |         },
193 |         "playwright.headfull": {
194 |             "title": "Headfull browser (XVFB)",
195 |             "type": "boolean",
196 |             "description": "Whether the browser should run headfull (with XVFB) or not",
197 |             "editor": "checkbox"
198 |         },
199 |         "playwright.waitFor": {
200 |             "title": "Wait for",
201 |             "type": "string",
202 |             "description": "Only works for the Playwright type. Will wait on each page. You can provide a number in ms or a CSS selector.",
203 |             "editor": "textfield",
204 |             "default": "2000"
205 |         },
206 |         "playwright.memory": {
207 |             "title": "Memory",
208 |             "type": "integer",
209 |             "unit": "MB",
210 |             "default": 4096,
211 |             "minimum": 1024,
212 |             "maximum": 32768,
213 |             "description": "Must be a power of 2 between 128 and 32768."
214 |         }
215 |     },
216 |     "required": ["urlsToCheck"]
217 | }
218 | 
--------------------------------------------------------------------------------
/starter/README.md:
--------------------------------------------------------------------------------
 1 | ## Website Checker
 2 | 
 3 | Website checker is a simple actor that allows you to scan any website for performance and blocking, using various scraping methods such as Cheerio, Puppeteer and Playwright.
 4 | 
 5 | ### Features
 6 | 
 7 | The actor provides these useful features out of the box:
 8 | 
 9 | - Collects response status codes
10 | - Recognizes the most common captchas
11 | - Saves HTML snapshots and screenshots (if Puppeteer or Playwright is chosen)
12 | - Enables choosing between Cheerio (plain HTTP) and Puppeteer/Playwright (browser) scrapers
13 | - Enables choosing different browsers for Playwright - Chrome, Firefox and Webkit (Safari)
14 | - Enables re-scraping start URLs or enqueueing with a familiar link selector + pseudo URLs system
15 | - Handles different failure states like timeouts and network errors
16 | - Enables basic proxy and browser configuration
17 | 
18 | ### How to use
19 | 
20 | The most common use-case is to do a quick check on how aggressively the target site is blocking. In that case, just supply a start URL, ideally a category or product page. You can either set `repeatChecksOnProvidedUrls` or add enqueueing with `linkSelector` + `pseudoUrls`; both are good options to test different proxies.
21 | 
22 | You can pick any combination of run options, and the checker will spawn a runner actor for every combination of scraping tool & proxies and then combine the results into a single output.
23 | 
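To make the fan-out concrete, here is a small sketch of the resulting run count. It mirrors the logic in `starter/src/configs.ts` further below (one runner run per URL, per enabled checker, and per proxy group, and additionally per browser for Playwright); the numbers are illustrative:

```
// 2 URLs, Cheerio + Puppeteer enabled, 2 Apify proxy groups configured
// (when no groups are set, a single 'auto' group is used instead):
const urls = 2;
const checkers = 2;      // 'checkers.cheerio' + 'checkers.puppeteer'
const proxyGroups = 2;   // e.g. ['RESIDENTIAL', 'SHADER']
console.log(urls * checkers * proxyGroups); // -> 8 runner runs feeding one combined output
```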
24 | In the end, you will get simple statistics about the blocking rate. It is recommended to check a few screenshots just to make sure the actor correctly recognized the page status. You can get to the detailed output (per URL) via the KV store or dataset (the KV output is sorted by response status, while the dataset simply preserves the scraping order).
25 | 
26 | #### Multiple URLs and configurations
27 | Website checker doesn't have any limitation on how many websites and configs you can check. For each website, it will run each config. You just need to set a reasonable `maxConcurrentDomainsChecked` so that all parallel runs fit into your total memory (4 GB for Cheerio and 8 GB for Puppeteer/Playwright checks).
28 | 
29 | ### Input
30 | 
31 | Please follow the [actor's input page](https://apify.com/lukaskrivka/website-checker/input-schema) for a detailed explanation. Most input fields have reasonable defaults.
32 | 
33 | ### Example output
34 | 
35 | #### Simple output
36 | 
37 | ```
38 | {
39 |     "timeouted": 0,
40 |     "failedToLoadOther": 9,
41 |     "accessDenied": 0,
42 |     "recaptcha": 0,
43 |     "distilCaptcha": 24,
44 |     "hCaptcha": 0,
45 |     "statusCodes": {
46 |         "200": 3,
47 |         "401": 2,
48 |         "403": 5,
49 |         "405": 24
50 |     },
51 |     "success": 3,
52 |     "total": 43
53 | }
54 | ```
55 | 
56 | ### Changelog
57 | 
58 | Check the history of changes in the [CHANGELOG](https://github.com/metalwarrior665/actor-website-checker/blob/master/CHANGELOG.md)
59 | 
--------------------------------------------------------------------------------
/starter/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "crawlee-cheerio-typescript",
 3 |     "version": "0.0.1",
 4 |     "type": "module",
 5 |     "description": "This is a boilerplate of an Apify actor.",
 6 |     "engines": {
 7 |         "node": ">=16.0.0"
 8 |     },
 9 |     "dependencies": {
10 |         "apify": "^3.0.0",
11 |         "crawlee": "^3.0.0"
12 |     },
13 |     "devDependencies": {
14 |         "@apify/eslint-config-ts": "^0.2.3",
15 |         "@apify/tsconfig": "^0.1.0",
16 |         "@typescript-eslint/eslint-plugin": "^5.32.0",
17 |         "@typescript-eslint/parser": "^5.32.0",
18 |         "eslint": "^8.20.0",
19 |         "ts-node": "^10.9.1",
20 |         "typescript": "4.7.4"
21 |     },
22 |     "scripts": {
23 |         "start": "npm run start:dev",
24 |         "start:prod": "node dist/main.js",
25 |         "start:dev": "ts-node-esm -T src/main.ts",
26 |         "build": "tsc",
27 |         "lint": "eslint ./src --ext .ts",
28 |         "lint:fix": "eslint ./src --ext .ts --fix",
29 |         "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
30 |     },
31 |     "author": "It's not you it's me",
32 |     "license": "ISC"
33 | }
--------------------------------------------------------------------------------
/starter/src/configs.ts:
--------------------------------------------------------------------------------
 1 | import { ACTOR_CHEERIO_CHECKER_NAME, ACTOR_PLAYWRIGHT_CHECKER_NAME, ACTOR_PUPPETEER_CHECKER_NAME } from './constants.js';
 2 | import type { PreparedActorConfig, ActorInputData, CreateActorRunConfig } from './typedefs.js';
 3 | 
 4 | export function convertInputToActorConfigs(input: ActorInputData): PreparedActorConfig[] {
 5 |     const configs: PreparedActorConfig[] = [];
 6 | 
 7 |     for (const urlData of input.urlsToCheck) {
 8 |         if (input['checkers.cheerio']) {
 9 |             configs.push(...createActorRunConfigForCrawler({ input, urlData, checkerId: ACTOR_CHEERIO_CHECKER_NAME }));
10 |         }
11 |         if (input['checkers.puppeteer']) {
12 |             configs.push(...createActorRunConfigForCrawler({ input, urlData, checkerId: ACTOR_PUPPETEER_CHECKER_NAME, memory: input['puppeteer.memory'] }));
13 |         }
14 |         if (input['checkers.playwright']) {
15 |             // Create a run
config for each playwright browser 16 | if (input['playwright.chrome']) { 17 | configs.push(...createActorRunConfigForCrawler({ 18 | input, 19 | urlData, 20 | checkerId: ACTOR_PLAYWRIGHT_CHECKER_NAME, 21 | playwrightBrowser: 'chrome', 22 | memory: input['playwright.memory'], 23 | })); 24 | } 25 | if (input['playwright.firefox']) { 26 | configs.push(...createActorRunConfigForCrawler({ 27 | input, 28 | urlData, 29 | checkerId: ACTOR_PLAYWRIGHT_CHECKER_NAME, 30 | playwrightBrowser: 'firefox', 31 | memory: input['playwright.memory'], 32 | })); 33 | } 34 | if (input['playwright.webkit']) { 35 | configs.push(...createActorRunConfigForCrawler({ 36 | input, 37 | urlData, 38 | checkerId: ACTOR_PLAYWRIGHT_CHECKER_NAME, 39 | playwrightBrowser: 'webkit', 40 | memory: input['playwright.memory'], 41 | })); 42 | } 43 | } 44 | } 45 | 46 | return configs; 47 | } 48 | 49 | function* createActorRunConfigForCrawler({ input, urlData, checkerId, playwrightBrowser, memory }: CreateActorRunConfig) { 50 | const proxyGroups = input.proxyConfiguration.apifyProxyGroups?.length 51 | ? input.proxyConfiguration.apifyProxyGroups 52 | : ['auto']; 53 | for (const group of proxyGroups) { 54 | const { url } = urlData; 55 | const config: PreparedActorConfig = { 56 | actorId: checkerId, 57 | proxyUsed: group === 'auto' ? undefined : group, 58 | url, 59 | input: { 60 | saveSnapshot: input.saveSnapshot, 61 | urlsToCheck: [urlData], 62 | proxyConfiguration: { 63 | useApifyProxy: input.proxyConfiguration.useApifyProxy, 64 | apifyProxyCountry: input.proxyConfiguration.apifyProxyCountry, 65 | apifyProxyGroups: group === 'auto' ? undefined : [group], 66 | }, 67 | linkSelector: input.enqueueAllOnDomain ? 'a[href]' : input.linkSelector, 68 | pseudoUrls: input.enqueueAllOnDomain 69 | ? [{ purl: `${new URL(url).origin}[.*]` }] 70 | : input.pseudoUrls, 71 | repeatChecksOnProvidedUrls: input.repeatChecksOnProvidedUrls, 72 | maxNumberOfPagesCheckedPerDomain: input.maxNumberOfPagesCheckedPerDomain, 73 | maxConcurrentPagesCheckedPerDomain: input.maxConcurrentPagesCheckedPerDomain, 74 | maxConcurrentDomainsChecked: input.maxConcurrentDomainsChecked, 75 | retireBrowserInstanceAfterRequestCount: input.retireBrowserInstanceAfterRequestCount, 76 | navigationTimeoutSecs: input.navigationTimeoutSecs, 77 | }, 78 | params: { 79 | memory: memory || (checkerId === ACTOR_CHEERIO_CHECKER_NAME ? 
4096 : 8192),
 80 |                 timeout: 24 * 3600,
 81 |             },
 82 |         };
 83 | 
 84 |         if (checkerId === ACTOR_PUPPETEER_CHECKER_NAME) {
 85 |             config.input['puppeteer.headfull'] = input['puppeteer.headfull'];
 86 |             config.input['puppeteer.useChrome'] = input['puppeteer.useChrome'];
 87 |             config.input['puppeteer.waitFor'] = input['puppeteer.waitFor'];
 88 |         } else if (checkerId === ACTOR_PLAYWRIGHT_CHECKER_NAME && playwrightBrowser) {
 89 |             config.input[`playwright.${playwrightBrowser}`] = input[`playwright.${playwrightBrowser}`];
 90 |             config.input['playwright.headfull'] = input[`playwright.headfull`];
 91 |             config.input['playwright.useChrome'] = input['playwright.useChrome'];
 92 |             config.input['playwright.waitFor'] = input['playwright.waitFor'];
 93 |         }
 94 | 
 95 |         yield config;
 96 |     }
 97 | }
 98 | 
--------------------------------------------------------------------------------
/starter/src/constants.ts:
--------------------------------------------------------------------------------
 1 | export const ACTOR_CHEERIO_CHECKER_NAME = 'lukaskrivka/website-checker-cheerio';
 2 | 
 3 | export const ACTOR_PUPPETEER_CHECKER_NAME = 'lukaskrivka/website-checker-puppeteer';
 4 | 
 5 | export const ACTOR_PLAYWRIGHT_CHECKER_NAME = 'lukaskrivka/website-checker-playwright';
 6 | 
 7 | export const DEFAULT_COSTS = {
 8 |     COMPUTE_UNIT: 0.25,
 9 |     RESIDENTIAL_GB: 12.5,
10 | };
11 | 
12 | export const TABLE_FIELDS_ORDER = [
13 |     'url',
14 |     'checkerType',
15 |     'proxyUsed',
16 |     'totalPages',
17 |     'success',
18 |     'successRate',
19 |     'estimatedCostPerRequest',
20 |     'computeUnitsPerRequest',
21 |     'residentialGBsPerRequest',
22 |     'runUrl',
23 | ];
--------------------------------------------------------------------------------
/starter/src/main.ts:
--------------------------------------------------------------------------------
 1 | import { Actor } from 'apify';
 2 | import { log, RequestList, BasicCrawler } from 'crawlee';
 3 | 
 4 | import { inspect } from 'util';
 5 | import { convertInputToActorConfigs } from './configs.js';
 6 | import { waitForRunToFinishAndPushData, startRun } from './startRunAndPool.js';
 7 | 
 8 | import type { ActorInputData, FrontendActorState, PreparedActorConfig } from './typedefs.js';
 9 | import { TABLE_FIELDS_ORDER } from './constants.js';
10 | 
11 | const env = Actor.getEnv();
12 | 
13 | Actor.main(async () => {
14 |     const input = await Actor.getInput() as ActorInputData;
15 | 
16 |     // Log the input
17 |     log.info('Input provided:');
18 |     log.debug(inspect(input, false, 4));
19 | 
20 |     // TODO: Add utilization of all user memory instead of having to rely
21 |     // on maxConcurrentDomainsChecked
22 |     const { maxConcurrentDomainsChecked, urlsToCheck } = input;
23 | 
24 | 
25 |     const state: FrontendActorState = await Actor.getValue('STATE') ??
{ 26 | runConfigurations: [], 27 | totalUrls: urlsToCheck.length, 28 | checkerFinished: false, 29 | }; 30 | 31 | Actor.on('persistState', async () => { 32 | await Actor.setValue('STATE', state); 33 | }); 34 | 35 | // If we haven't initialized the state yet, do it now 36 | if (state.runConfigurations.length === 0 && !state.checkerFinished) { 37 | state.runConfigurations = convertInputToActorConfigs(input); 38 | } 39 | 40 | // Sort state based on started runs 41 | state.runConfigurations = state.runConfigurations.sort((_, b) => Number(Boolean(b.runId))); 42 | await Actor.setValue('STATE', state); 43 | 44 | log.info(`Preparing to process ${state.totalUrls} URLs...\n`); 45 | 46 | const sources = state.runConfigurations.map((actorInput, index) => ({ 47 | url: 'https://localhost', 48 | uniqueKey: index.toString(), 49 | userData: { actorInput }, 50 | })); 51 | 52 | 53 | 54 | const requestList = await RequestList.open(null, sources); 55 | 56 | const runner = new BasicCrawler({ 57 | maxConcurrency: maxConcurrentDomainsChecked, 58 | requestList, 59 | requestHandler: async ({ request }) => { 60 | const { userData } = request; 61 | const actorInput = (userData.actorInput) as PreparedActorConfig; 62 | 63 | if (actorInput.runId) { 64 | log.info(`Found run ${actorInput.runId} with actor ${actorInput.actorId} for URL "${actorInput.url}" - waiting for it to finish.`); 65 | log.info(`You can monitor the status of the run by going to https://console.apify.com/actors/runs/${actorInput.runId}`); 66 | } else { 67 | const result = await startRun(actorInput); 68 | log.info( 69 | `Starting run for "${actorInput.url}" with actor ${actorInput.actorId} and ${ 70 | actorInput.input.proxyConfiguration.useApifyProxy ? `proxy ${actorInput.proxyUsed ?? 'auto'}` : 'no proxy' 71 | }.`, 72 | ); 73 | log.info(`You can monitor the status of the run by going to https://console.apify.com/actors/runs/${result.id}`); 74 | actorInput.runId = result.id; 75 | } 76 | 77 | // Wait for the run to finish 78 | await waitForRunToFinishAndPushData(actorInput); 79 | }, 80 | requestHandlerTimeoutSecs: 999_999, 81 | }); 82 | 83 | // Run the checker 84 | await runner.run(); 85 | 86 | // Save the state as done, to prevent resurrection doing requests it doesn't have to do 87 | state.runConfigurations = []; 88 | state.checkerFinished = true; 89 | await Actor.setValue('STATE', state); 90 | 91 | log.info(`\nChecking ${state.totalUrls} URLs completed!`); 92 | log.info(`NICER TABLE VIEW:\nhttps://api.apify.com/v2/datasets/${Actor.getEnv().defaultDatasetId}/items?clean=true&format=html` 93 | + `&fields=${TABLE_FIELDS_ORDER.join(',')}`); 94 | }); 95 | -------------------------------------------------------------------------------- /starter/src/startRunAndPool.ts: -------------------------------------------------------------------------------- 1 | import { Actor, ActorRun } from 'apify'; 2 | 3 | import { DEFAULT_COSTS } from './constants.js'; 4 | import type { PreparedActorConfig, ActorCheckSimplifiedOutput, FixedActorRun } from './typedefs.js'; 5 | 6 | export async function startRun(run: PreparedActorConfig) { 7 | const client = Actor.newClient(); 8 | const result = await client.actor(run.actorId).start(run.input, run.params); 9 | 10 | return result; 11 | } 12 | 13 | export async function waitForRunToFinishAndPushData(runConfig: PreparedActorConfig) { 14 | const client = Actor.newClient(); 15 | const run = client.run(runConfig.runId!); 16 | 17 | const finishedRun = await run.waitForFinish() as FixedActorRun; 18 | const { 19 | ACTOR_COMPUTE_UNITS: 
computeUnits,
20 |         PROXY_RESIDENTIAL_TRANSFER_GBYTES: residentialGBs,
21 |     } = finishedRun.usage;
22 | 
23 |     const value = (await run.keyValueStore().getRecord('OUTPUT'))!.value as ActorCheckSimplifiedOutput;
24 | 
25 |     value.computeUnitsUsedForThisCheck = Number(computeUnits.toFixed(4));
26 |     value.pagesPerComputeUnit = Number((value.totalPages / computeUnits).toFixed(2));
27 |     value.computeUnitsPerRequest = Number((computeUnits / value.totalPages).toFixed(6));
28 |     // 8 decimals gives all the precision we need (level of 10 Bytes)
29 |     value.residentialGBs = Number(residentialGBs.toFixed(8));
30 |     value.residentialGBsPerRequest = Number((residentialGBs / value.totalPages).toFixed(8));
31 |     value.proxyUsed = runConfig.proxyUsed;
32 |     value.estimatedCost = Number((computeUnits * DEFAULT_COSTS.COMPUTE_UNIT + residentialGBs * DEFAULT_COSTS.RESIDENTIAL_GB).toFixed(4));
33 |     value.estimatedCostPerRequest = Number((value.estimatedCost / value.totalPages).toFixed(6));
34 | 
35 |     if (runConfig.input['playwright.chrome']) {
36 |         value.playwrightBrowser = 'chrome';
37 |     } else if (runConfig.input['playwright.firefox']) {
38 |         value.playwrightBrowser = 'firefox';
39 |     } else if (runConfig.input['playwright.webkit']) {
40 |         value.playwrightBrowser = 'webkit';
41 |     }
42 | 
43 |     value.successRate = Number(((value.success / value.totalPages) * 100).toFixed(2));
44 |     value.runUrl = `https://console.apify.com/actors/runs/${runConfig.runId}`;
45 | 
46 |     await Actor.pushData(value);
47 | }
48 | 
--------------------------------------------------------------------------------
/starter/src/typedefs.ts:
--------------------------------------------------------------------------------
 1 | import { ActorRun } from 'apify';
 2 | 
 3 | export interface FrontendActorState {
 4 |     totalUrls: number;
 5 |     runConfigurations: PreparedActorConfig[];
 6 |     checkerFinished: boolean;
 7 | }
 8 | 
 9 | export interface PseudoUrlInput {
10 |     purl: string;
11 |     method?: string;
12 |     payload?: string;
13 |     userData?: Record<string, unknown>;
14 |     headers?: Record<string, string>;
15 | }
16 | 
17 | export interface UrlInput {
18 |     url: string;
19 |     method?: string;
20 |     payload?: string;
21 |     userData?: Record<string, unknown>;
22 |     headers?: Record<string, string>;
23 | }
24 | 
25 | export interface ProxyConfiguration {
26 |     useApifyProxy: boolean;
27 |     apifyProxyGroups?: string[];
28 |     apifyProxyCountry?: string;
29 | }
30 | 
31 | export interface ActorInputData {
32 |     // Crawlers to use
33 |     'checkers.cheerio'?: boolean;
34 |     'checkers.puppeteer'?: boolean;
35 |     'checkers.playwright'?: boolean;
36 | 
37 |     // Pass these to crawlers
38 | 
39 |     // save snapshots
40 |     saveSnapshot?: boolean;
41 | 
42 |     // General options
43 |     urlsToCheck: UrlInput[];
44 |     proxyConfiguration: ProxyConfiguration;
45 |     enqueueAllOnDomain?: boolean;
46 |     linkSelector?: string;
47 |     pseudoUrls: PseudoUrlInput[];
48 |     repeatChecksOnProvidedUrls?: number;
49 |     maxNumberOfPagesCheckedPerDomain: number;
50 |     maxConcurrentPagesCheckedPerDomain: number;
51 |     maxConcurrentDomainsChecked: number;
52 |     retireBrowserInstanceAfterRequestCount: number;
53 |     navigationTimeoutSecs: number;
54 | 
55 |     // Pass only to puppeteer
56 |     'puppeteer.headfull'?: boolean;
57 |     'puppeteer.useChrome'?: boolean;
58 |     'puppeteer.waitFor'?: string;
59 |     'puppeteer.memory'?: number;
60 | 
61 |     // Pass only to playwright
62 |     'playwright.chrome'?: boolean;
63 |     'playwright.firefox'?: boolean;
64 |     'playwright.webkit'?: boolean;
65 |     'playwright.headfull'?: boolean;
66 |     'playwright.useChrome'?: boolean;
67 |     'playwright.waitFor'?: string;
68 |     'playwright.memory'?: number;
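     // Note: keys such as 'playwright.waitFor' are literal property names that
     // contain a dot (they mirror the INPUT_SCHEMA.json field names), so they
     // are always read with bracket notation, e.g. input['playwright.waitFor'].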
 69 | }
 70 | 
 71 | export interface PreparedActorConfig {
 72 |     actorId: string;
 73 |     proxyUsed?: string;
 74 |     url: string;
 75 |     input: ActorInputData;
 76 |     params: {
 77 |         memory: number;
 78 |         timeout: number;
 79 |     };
 80 |     // This data is set when the config is run
 81 |     runId?: string;
 82 | }
 83 | 
 84 | export interface CreateActorRunConfig {
 85 |     checkerId: string;
 86 |     input: ActorInputData;
 87 |     urlData: UrlInput;
 88 |     playwrightBrowser?: 'chrome' | 'firefox' | 'webkit';
 89 |     memory?: number;
 90 | }
 91 | 
 92 | export interface UrlCheckResult {
 93 |     url: string;
 94 |     screenshotUrl?: string;
 95 |     htmlUrl?: string;
 96 | }
 97 | 
 98 | export interface ActorCheckDetailedOutput {
 99 |     // Set by waitForRunToFinishAndPushData
100 |     proxyUsed?: string;
101 |     checkerType: 'cheerio' | 'puppeteer' | 'playwright';
102 |     playwrightBrowser?: 'chrome' | 'firefox' | 'webkit';
103 |     computeUnitsUsedForThisCheck?: number;
104 |     // (totalPages.length / computeUnitsUsedForThisCheck) yields the amount of pages checkable per compute unit
105 |     pagesPerComputeUnit: number;
106 |     computeUnitsPerRequest: number;
107 |     residentialGBs: number;
108 |     residentialGBsPerRequest: number;
109 |     estimatedCost: number;
110 |     estimatedCostPerRequest: number;
111 | 
112 |     // URLs
113 |     url: string;
114 |     simplifiedOutput: string;
115 |     detailedOutput: string;
116 |     runUrl: string;
117 | 
118 |     successRate?: number;
119 | 
120 |     // Page data
121 |     totalPages: UrlCheckResult[];
122 |     timedOut: UrlCheckResult[];
123 |     failedToLoadOther: UrlCheckResult[];
124 |     accessDenied: UrlCheckResult[];
125 |     success: UrlCheckResult[];
126 | 
127 |     // Status codes
128 |     statusCodes: Record<number, UrlCheckResult[]>;
129 | 
130 |     // Captcha time
131 |     recaptcha: UrlCheckResult[];
132 |     distilCaptcha: UrlCheckResult[];
133 |     hCaptcha: UrlCheckResult[];
134 | }
135 | 
136 | export type ActorCheckSimplifiedOutput = {
137 |     [K in keyof ActorCheckDetailedOutput]:
138 |         ActorCheckDetailedOutput[K] extends Array<UrlCheckResult>
139 |             ? number
140 |             : ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] }
141 |                 ? Record<number, number>
142 |                 : ActorCheckDetailedOutput[K];
143 | };
144 | 
145 | export interface FixedActorRun extends ActorRun {
146 |     usage: {
147 |         ACTOR_COMPUTE_UNITS: number;
148 |         DATASET_READS: number;
149 |         DATASET_WRITES: number;
150 |         KEY_VALUE_STORE_READS: number;
151 |         KEY_VALUE_STORE_WRITES: number;
152 |         KEY_VALUE_STORE_LISTS: number;
153 |         REQUEST_QUEUE_READS: number;
154 |         REQUEST_QUEUE_WRITES: number;
155 |         DATA_TRANSFER_INTERNAL_GBYTES: number;
156 |         DATA_TRANSFER_EXTERNAL_GBYTES: number;
157 |         PROXY_RESIDENTIAL_TRANSFER_GBYTES: number;
158 |         PROXY_SERPS: number;
159 |     };
160 | }
161 | 
--------------------------------------------------------------------------------
/starter/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "extends": "@apify/tsconfig",
 3 |     "compilerOptions": {
 4 |         "module": "ES2022",
 5 |         "target": "ES2022",
 6 |         "outDir": "dist",
 7 |         "noUnusedLocals": false,
 8 |         "lib": ["DOM"],
 9 |         "skipLibCheck": true
10 |     },
11 |     "include": [
12 |         "./src/**/*"
13 |     ]
14 | }
--------------------------------------------------------------------------------
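As a closing note on the cost fields computed in `starter/src/startRunAndPool.ts` above: they follow directly from `DEFAULT_COSTS`. A worked example with made-up usage figures (not real measurements):

```
import { DEFAULT_COSTS } from './constants.js';

// Illustrative usage figures for one runner run:
const computeUnits = 0.08;     // ACTOR_COMPUTE_UNITS from the finished run
const residentialGBs = 0.002;  // PROXY_RESIDENTIAL_TRANSFER_GBYTES
const totalPages = 40;

const estimatedCost = computeUnits * DEFAULT_COSTS.COMPUTE_UNIT
    + residentialGBs * DEFAULT_COSTS.RESIDENTIAL_GB;        // 0.02 + 0.025 = 0.045 USD
const estimatedCostPerRequest = estimatedCost / totalPages; // 0.001125 USD per checked page
const pagesPerComputeUnit = totalPages / computeUnits;      // 500 pages per compute unit
```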