├── .DS_Store ├── CHANGELOG.md ├── README.md ├── checker-cheerio ├── .actor │ └── actor.json ├── .eslintrc ├── .gitignore ├── Dockerfile ├── INPUT_SCHEMA.json ├── README.md ├── package-lock.json ├── package.json ├── src │ ├── checkers.ts │ ├── handleFailedRequest.ts │ ├── handlePage.ts │ ├── main.ts │ ├── typedefs.ts │ └── utils.ts └── tsconfig.json ├── checker-playwright ├── .eslintrc ├── .gitignore ├── Dockerfile ├── INPUT_SCHEMA.json ├── README.md ├── apify.json ├── package-lock.json ├── package.json ├── src │ ├── checkers.ts │ ├── handleFailedRequest.ts │ ├── handlePage.ts │ ├── main.ts │ ├── typedefs.ts │ └── utils.ts └── tsconfig.json ├── checker-puppeteer ├── .gitignore ├── Dockerfile ├── INPUT_SCHEMA.json ├── README.md ├── apify.json ├── package-lock.json ├── package.json ├── src │ ├── checkers.ts │ ├── handleFailedRequest.ts │ ├── handlePage.ts │ ├── main.ts │ ├── typedefs.ts │ └── utils.ts └── tsconfig.json └── starter ├── .DS_Store ├── .actor └── actor.json ├── .eslintrc ├── .gitignore ├── Dockerfile ├── INPUT_SCHEMA.json ├── README.md ├── package-lock.json ├── package.json ├── src ├── configs.ts ├── constants.ts ├── main.ts ├── startRunAndPool.ts └── typedefs.ts └── tsconfig.json /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify-projects/store-website-checker/4600159968d7289e023c071ad72c22bc5f3e4570/.DS_Store -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 2021-07-29 2 | 3 | *Features* 4 | 5 | - Pushing metadata about each page to dataset 6 | - Added recognition of Amazon's `hCaptcha` 7 | - `success` and `wasSuccess` metrics added to output. Success is measured by the status being less than 400 and no captcha being present 8 | 9 | *Changes* 10 | 11 | - Removed `useGoogleBotHeaders` option (we don't want to impersonate Google anyway) 12 | - Updated `apify` from `0.18.1` to `1.3.1` 13 | - `saveSnapshots` is `true` by default 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Website Checker 2 | 3 | Website Checker is a simple actor that allows you to scan any website for performance and blocking using various scraping methods such as Cheerio, Puppeteer, and Playwright. 4 | 5 | ### Features 6 | 7 | The actor provides these useful features out of the box: 8 | 9 | - Collects response status codes 10 | - Recognizes the most common captchas 11 | - Saves HTML snapshots and screenshots (if Puppeteer or Playwright is chosen) 12 | - Enables choosing between the Cheerio (plain HTTP) and Puppeteer/Playwright (browser) scrapers 13 | - Enables choosing different browsers for Playwright - Chrome, Firefox and Webkit (Safari) 14 | - Enables re-scraping start URLs or enqueueing with a familiar link selector + pseudo URLs system 15 | - Handles different failure states like timeouts and network errors 16 | - Enables basic proxy and browser configuration 17 | 18 | ### How to use 19 | 20 | The most common use-case is to do a quick check on how aggressively the target site is blocking. In that case, just supply a start URL, ideally a category or product page. You can either set `replicateStartUrls` or add enqueueing with `linkSelector` + `pseudoUrls`; both are good options for testing different proxies.
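For example, a quick check of a single category page with link enqueueing could use an input like this (the field names below follow the checker input schemas included in this repository; the master actor's input page linked under Input has the authoritative names, and the URLs here are purely illustrative):

```
{
    "urlsToCheck": [{ "url": "https://www.example.com/some-category" }],
    "proxyConfiguration": { "useApifyProxy": true },
    "linkSelector": "a[href]",
    "pseudoUrls": [{ "purl": "https://www.example.com[.*]/product/[.*]" }],
    "repeatChecksOnProvidedUrls": 2,
    "maxNumberOfPagesCheckedPerDomain": 100
}
```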
21 | 22 | You can pick any combination of run options, and the checker will spawn a runner actor for every combination of scraping tool & proxy, then combine the results into a single output. 23 | 24 | In the end, you will get simple statistics about the blocking rate. It is recommended to check a few screenshots just to make sure the actor correctly recognized the page status. You can get to the detailed output (per URL) via the KV store or the dataset (the KV output is sorted by response status, while the dataset keeps the scraping order). 25 | 26 | #### Multiple URLs and configurations 27 | Website Checker doesn't have any limit on how many websites and configs you can check. For each website, it will run each config. You just need to set a reasonable `maxConcurrentDomainsChecked` so that all parallel runs fit into your total memory (4 GB for Cheerio and 8 GB for Puppeteer/Playwright checks). 28 | 29 | ### Input 30 | 31 | Please follow the [actor's input page](https://apify.com/lukaskrivka/website-checker/input-schema) for a detailed explanation. Most input fields have reasonable defaults. 32 | 33 | ### Example output 34 | 35 | #### Simple output 36 | 37 | ``` 38 | { 39 | "timeouted": 0, 40 | "failedToLoadOther": 9, 41 | "accessDenied": 0, 42 | "recaptcha": 0, 43 | "distilCaptcha": 24, 44 | "hCaptcha": 0, 45 | "statusCodes": { 46 | "200": 3, 47 | "401": 2, 48 | "403": 5, 49 | "405": 24 50 | }, 51 | "success": 3, 52 | "total": 43 53 | } 54 | ``` 55 | 56 | #### Detailed output with URLs, screenshots and HTML links 57 | 58 | 59 | ### Changelog 60 | 61 | Check the history of changes in the [CHANGELOG](https://github.com/metalwarrior665/actor-website-checker/blob/master/CHANGELOG.md) 62 | -------------------------------------------------------------------------------- /checker-cheerio/.actor/actor.json: -------------------------------------------------------------------------------- 1 | { 2 | "actorSpecification": 1, 3 | "name": "checker-cheerio", 4 | "version": "0.0", 5 | "buildTag": "latest" 6 | } 7 | -------------------------------------------------------------------------------- /checker-cheerio/.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "env": { 4 | "browser": true, 5 | "es2020": true, 6 | "node": true 7 | }, 8 | "extends": [ 9 | "@apify/eslint-config-ts" 10 | ], 11 | "parserOptions": { 12 | "project": "./tsconfig.json", 13 | "ecmaVersion": 2020 14 | }, 15 | "ignorePatterns": [ 16 | "node_modules", 17 | "dist", 18 | "**/*.d.ts" 19 | ] 20 | } -------------------------------------------------------------------------------- /checker-cheerio/.gitignore: -------------------------------------------------------------------------------- 1 | # This file tells Git which files shouldn't be added to source control 2 | 3 | .DS_Store 4 | .idea 5 | dist 6 | node_modules 7 | apify_storage 8 | storage 9 | # Added by Apify CLI 10 | .venv 11 | -------------------------------------------------------------------------------- /checker-cheerio/Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://crawlee.dev/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 
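# Note: this is a two-stage build. The builder stage below installs dev dependencies and compiles the TypeScript sources; the final stage then starts from a clean base image and installs production dependencies only, keeping the image small.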
4 | FROM apify/actor-node:16 AS builder 5 | 6 | # Copy just package.json and package-lock.json 7 | # to speed up the build using Docker layer cache. 8 | COPY package*.json ./ 9 | 10 | # Install all dependencies. Don't audit to speed up the installation. 11 | RUN npm install --include=dev --audit=false 12 | 13 | # Next, copy the source files using the user set 14 | # in the base image. 15 | COPY . ./ 16 | 17 | # Install all dependencies and build the project. 18 | # Don't audit to speed up the installation. 19 | RUN npm run build 20 | 21 | # Create final image 22 | FROM apify/actor-node:16 23 | 24 | # Copy only built JS files from builder image 25 | COPY --from=builder /usr/src/app/dist ./dist 26 | 27 | # Copy just package.json and package-lock.json 28 | # to speed up the build using Docker layer cache. 29 | COPY package*.json ./ 30 | 31 | # Install NPM packages, skip optional and development dependencies to 32 | # keep the image small. Avoid logging too much and print the dependency 33 | # tree for debugging 34 | RUN npm --quiet set progress=false \ 35 | && npm install --omit=dev --omit=optional \ 36 | && echo "Installed NPM packages:" \ 37 | && (npm list --omit=dev --all || true) \ 38 | && echo "Node.js version:" \ 39 | && node --version \ 40 | && echo "NPM version:" \ 41 | && npm --version \ 42 | && rm -r ~/.npm 43 | 44 | # Next, copy the remaining files and directories with the source code. 45 | # Since we do this after NPM install, quick build will be really fast 46 | # for most source file changes. 47 | COPY . ./ 48 | 49 | 50 | # Run the image. 51 | CMD npm run start:prod --silent -------------------------------------------------------------------------------- /checker-cheerio/INPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Web Checker", 3 | "description": "The web checker actor loads URLs to check and checks for common captchas, status codes returned from crawling, as well as calculates the price a user may pay. TODO: Needs to be more descriptive!!", 4 | "type": "object", 5 | "schemaVersion": 1, 6 | "properties": { 7 | "urlsToCheck": { 8 | "title": "URLs to check", 9 | "type": "array", 10 | "description": "A static list of URLs to check for captchas. To be able to add new URLs on the fly, enable the Use request queue option.

For details, see Start URLs in README.", 11 | "sectionCaption": "Checker Options", 12 | "sectionDescription": "Options that will be passed to the checkers", 13 | "editor": "requestListSources", 14 | "prefill": [ 15 | { 16 | "url": "https://www.amazon.com/b?ie=UTF8&node=11392907011" 17 | } 18 | ] 19 | }, 20 | "proxyConfiguration": { 21 | "title": "Proxy Configuration", 22 | "type": "object", 23 | "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.

For details, see Proxy configuration in README.", 24 | "default": {}, 25 | "editor": "proxy", 26 | "prefill": { 27 | "useApifyProxy": false 28 | } 29 | }, 30 | "saveSnapshot": { 31 | "title": "Enabled", 32 | "type": "boolean", 33 | "description": "Will save HTML for Cheerio and HTML + screenshot for Puppeteer/Playwright", 34 | "editor": "checkbox", 35 | "groupCaption": "Save Snapshots" 36 | }, 37 | "linkSelector": { 38 | "title": "Link Selector", 39 | "type": "string", 40 | "description": "A CSS selector saying which links on the page (<a> elements with href attribute) shall be followed and added to the request queue. This setting only applies if Use request queue is enabled. To filter the links added to the queue, use the Pseudo-URLs setting.

If Link selector is empty, the page links are ignored.

For details, see Link selector in README.", 41 | "sectionCaption": "Crawler Options", 42 | "sectionDescription": "Specific options that are relevant for crawlers", 43 | "editor": "textfield", 44 | "prefill": "a[href]", 45 | "minLength": 1 46 | }, 47 | "pseudoUrls": { 48 | "title": "Pseudo-URLs", 49 | "type": "array", 50 | "description": "Specifies what kind of URLs found by Link selector should be added to the request queue. A pseudo-URL is a URL with regular expressions enclosed in [] brackets, e.g. http://www.example.com/[.*]. This setting only applies if the Use request queue option is enabled.

If Pseudo-URLs are omitted, the actor enqueues all links matched by the Link selector.

For details, see Pseudo-URLs in README.", 51 | "default": [], 52 | "editor": "pseudoUrls", 53 | "prefill": [ 54 | { 55 | "purl": "https://www.amazon.com[.*]/dp/[.*]" 56 | } 57 | ] 58 | }, 59 | "repeatChecksOnProvidedUrls": { 60 | "title": "Repeat checks on provided URLs", 61 | "type": "integer", 62 | "description": "Will access each URL multiple times. Useful to test the same URL or bypass blocking of the first page.", 63 | "editor": "number" 64 | }, 65 | "maxNumberOfPagesCheckedPerDomain": { 66 | "title": "Max number of pages checked per domain", 67 | "type": "integer", 68 | "description": "The maximum number of pages that the checker will load. The checker will stop when this limit is reached. It's always a good idea to set this limit in order to prevent excess platform usage for misconfigured scrapers. Note that the actual number of pages loaded might be slightly higher than this value.

If set to 0, there is no limit.", 69 | "default": 100, 70 | "editor": "number" 71 | }, 72 | "maxConcurrentPagesCheckedPerDomain": { 73 | "title": "Maximum concurrent pages checked per domain", 74 | "type": "integer", 75 | "description": "Specifies the maximum number of pages that can be processed by the checker in parallel for one domain. The checker automatically increases and decreases concurrency based on available system resources. This option enables you to set an upper limit, for example to reduce the load on a target website.", 76 | "default": 50, 77 | "editor": "number", 78 | "minimum": 1 79 | }, 80 | "maxConcurrentDomainsChecked": { 81 | "title": "Maximum number of concurrent domains checked", 82 | "type": "integer", 83 | "description": "Specifies the maximum number of domains that should be checked at a time. This setting is relevant when passing in more than one URL to check.", 84 | "default": 5, 85 | "editor": "number", 86 | "minimum": 1, 87 | "maximum": 10 88 | }, 89 | "retireBrowserInstanceAfterRequestCount": { 90 | "title": "Retire browser instance after request count", 91 | "type": "integer", 92 | "description": "How often will the browser itself rotate. Pick a higher number for smaller consumption, pick a lower number to rotate (test) more proxies.", 93 | "default": 10, 94 | "editor": "number", 95 | "minimum": 1 96 | } 97 | }, 98 | "required": ["urlsToCheck"] 99 | } 100 | -------------------------------------------------------------------------------- /checker-cheerio/README.md: -------------------------------------------------------------------------------- 1 | # Website Checker Runner with Cheerio 2 | 3 | Checks the provided website using Cheerio. This is a low-level runner; most likely you want to use the high-level master actor: https://apify.com/lukaskrivka/website-checker 4 | -------------------------------------------------------------------------------- /checker-cheerio/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "crawlee-cheerio-typescript", 3 | "version": "0.0.1", 4 | "type": "module", 5 | "description": "This is a boilerplate of an Apify actor.", 6 | "engines": { 7 | "node": ">=16.0.0" 8 | }, 9 | "dependencies": { 10 | "apify": "^3.1.0", 11 | "crawlee": "^3.1", 12 | "cheerio": "^1.0.0-rc.10" 13 | }, 14 | "devDependencies": { 15 | "@apify/eslint-config-ts": "^0.2.3", 16 | "@apify/tsconfig": "^0.1.0", 17 | "@typescript-eslint/eslint-plugin": "^5.32.0", 18 | "@typescript-eslint/parser": "^5.32.0", 19 | "eslint": "^8.20.0", 20 | "ts-node": "^10.9.1", 21 | "typescript": "^4.8" 22 | }, 23 | "scripts": { 24 | "start": "npm run start:dev", 25 | "start:prod": "node dist/main.js", 26 | "start:dev": "ts-node-esm -T src/main.ts", 27 | "build": "tsc", 28 | "lint": "eslint ./src --ext .ts", 29 | "lint:fix": "eslint ./src --ext .ts --fix", 30 | "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" 31 | }, 32 | "author": "It's not you it's me", 33 | "license": "ISC" 34 | } 35 | -------------------------------------------------------------------------------- /checker-cheerio/src/checkers.ts: -------------------------------------------------------------------------------- 1 | import type { CheerioAPI } from 'cheerio'; 2 | 3 | export function distilCaptcha($: CheerioAPI): boolean { 4 | return $('#distilCaptchaForm').length > 0 5 | || $('[action*="distil_r_captcha.html"]').length > 0; 6 | } 7 | 8 | export function recaptcha($: CheerioAPI): boolean { 9 | return $('#recaptcha').length > 0 10 |
|| $('iframe[src*="/recaptcha/"]').length > 0; 11 | } 12 | 13 | export function hCaptcha($: CheerioAPI): boolean { 14 | return $('[action="/errors/validateCaptcha"]').length > 0; 15 | } 16 | 17 | export function accessDenied($: CheerioAPI): boolean { 18 | return $('title').text().includes('Access Denied'); 19 | } 20 | 21 | export function testHtml($: CheerioAPI) { 22 | return { 23 | accessDenied: accessDenied($), 24 | distilCaptcha: distilCaptcha($), 25 | recaptcha: recaptcha($), 26 | hCaptcha: hCaptcha($), 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /checker-cheerio/src/handleFailedRequest.ts: -------------------------------------------------------------------------------- 1 | import { log } from 'crawlee'; 2 | 3 | import type { CheerioCrawlingContext } from 'crawlee'; 4 | 5 | import type { ActorCheckDetailedOutput } from './typedefs.js'; 6 | 7 | export async function handleFailedRequest(state: ActorCheckDetailedOutput, { request }: CheerioCrawlingContext) { 8 | state.totalPages.push({ url: request.url }); 9 | 10 | const [error] = request.errorMessages; 11 | log.warning(`Request failed --- ${request.url}\n${error}`); 12 | 13 | if (error.includes('request timed out')) { 14 | state.timedOut.push({ url: request.url }); 15 | } else { 16 | state.failedToLoadOther.push({ url: request.url }); 17 | } 18 | 19 | // CheerioCrawler obscures status code >=500 to a string message so we have to parse it 20 | const maybeStatusCheerio = error.match(/(\d\d\d) - Internal Server Error/); 21 | if (maybeStatusCheerio) { 22 | const statusCode = Number(maybeStatusCheerio[1]); 23 | state.statusCodes[statusCode] ??= []; 24 | state.statusCodes[statusCode].push({ url: request.url }); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /checker-cheerio/src/handlePage.ts: -------------------------------------------------------------------------------- 1 | import { Actor } from 'apify'; 2 | 3 | import type { RequestQueue } from 'apify'; 4 | import { PseudoUrl, RequestOptions } from 'crawlee'; 5 | 6 | import { testHtml } from './checkers.js'; 7 | 8 | import type { CheerioActorInput, ActorCheckDetailedOutput, CheerioCheckerHandlePageInputs } from './typedefs.js'; 9 | 10 | export async function handlePage( 11 | input: CheerioActorInput, 12 | requestQueue: RequestQueue, 13 | state: ActorCheckDetailedOutput, 14 | { request, $, body, response, crawler, json }: CheerioCheckerHandlePageInputs, 15 | ) { 16 | /** @type {string | undefined} */ 17 | let htmlUrl; 18 | 19 | if (input.saveSnapshot) { 20 | const key = `SNAPSHOT-${Math.random().toString()}`; 21 | if (json) { 22 | await Actor.setValue(key, json); 23 | } else { 24 | await Actor.setValue(`${key}.html`, body, { contentType: 'text/html' }); 25 | } 26 | htmlUrl = `https://api.apify.com/v2/key-value-stores/${Actor.getEnv().defaultKeyValueStoreId}/records/${key}.html?disableRedirect=true`; 27 | } 28 | 29 | state.totalPages.push({ url: request.url, htmlUrl }); 30 | 31 | const { statusCode } = response; 32 | 33 | state.statusCodes[statusCode!] 
??= []; 34 | state.statusCodes[statusCode!].push({ url: request.url, htmlUrl }); 35 | 36 | const captchas: string[] = []; 37 | // We don't have $ for JSON responses, nor can we recognize captchas from them 38 | if ($) { 39 | const testResult = testHtml($); 40 | 41 | for (const testResultEntry of Object.entries(testResult)) { 42 | const wasFound = testResultEntry[1]; 43 | const testCase = testResultEntry[0] as 'accessDenied' | 'distilCaptcha' | 'recaptcha' | 'hCaptcha'; 44 | if (wasFound) { 45 | captchas.push(testCase); 46 | 47 | state[testCase].push({ url: request.url, htmlUrl }); 48 | } 49 | } 50 | } 51 | 52 | const wasSuccess = statusCode! < 400 && captchas.length === 0; 53 | if (wasSuccess) { 54 | state.success.push({ url: request.url, htmlUrl }); 55 | } 56 | 57 | await Actor.pushData({ 58 | url: request.url, 59 | htmlUrl, 60 | statusCode, 61 | captchas, 62 | wasSuccess, 63 | }); 64 | 65 | const pageOrigin = new URL(request.url).origin; 66 | 67 | if (input.linkSelector && !!$) { 68 | const info = await requestQueue.getInfo(); 69 | 70 | const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount; 71 | if (maxUrlsToEnqueue > 0) { 72 | const toEnqueue: RequestOptions[] = []; 73 | $(input.linkSelector).each((_, el) => { 74 | const rawHref = $(el).attr('href'); 75 | if (!rawHref) { 76 | return; 77 | } 78 | const href = new URL(rawHref, pageOrigin).toString(); 79 | for (const pseudoUrlInput of input.pseudoUrls) { 80 | if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) { 81 | const newUrl = new URL(href, request.loadedUrl).toString(); 82 | toEnqueue.push({ 83 | url: newUrl, 84 | headers: pseudoUrlInput.headers, 85 | method: pseudoUrlInput.method as 'GET' | 'POST', 86 | payload: pseudoUrlInput.payload, 87 | userData: pseudoUrlInput.userData, 88 | }); 89 | } 90 | } 91 | }); 92 | console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`); 93 | await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue)); 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /checker-cheerio/src/main.ts: -------------------------------------------------------------------------------- 1 | import { Actor } from 'apify'; 2 | import { log, CheerioCrawler } from 'crawlee'; 3 | import type { RequestOptions } from 'crawlee'; 4 | 5 | import { inspect } from 'util'; 6 | import { handleFailedRequest } from './handleFailedRequest.js'; 7 | import { handlePage } from './handlePage.js'; 8 | import { convertDetailedOutputToSimplified } from './utils.js'; 9 | import type { CheerioActorInput, ActorCheckDetailedOutput } from './typedefs.js'; 10 | 11 | Actor.main(async () => { 12 | const input = await Actor.getInput() as CheerioActorInput; 13 | 14 | // Log the input 15 | 16 | log.info('Input provided:'); 17 | log.debug(inspect(input, false, 4)); 18 | 19 | log.info( 20 | [ 21 | 'Running a Cheerio Checker. 
Cheerio downloads only initial HTML.', 22 | 'If you need to render JavaScript or wait on a page for data to load, enable Puppeteer or Playwright as Checker Type in the Frontend.', 23 | ].join('\n'), 24 | ); 25 | 26 | const { 27 | maxConcurrentPagesCheckedPerDomain, 28 | maxNumberOfPagesCheckedPerDomain, 29 | proxyConfiguration, 30 | urlsToCheck, 31 | repeatChecksOnProvidedUrls, 32 | navigationTimeoutSecs, 33 | } = input; 34 | 35 | const proxy = await Actor.createProxyConfiguration({ 36 | groups: proxyConfiguration.apifyProxyGroups, 37 | countryCode: proxyConfiguration.apifyProxyCountry, 38 | }); 39 | 40 | const requestQueue = await Actor.openRequestQueue(); 41 | 42 | const [urlData] = urlsToCheck; 43 | await requestQueue.addRequest(urlData as RequestOptions); 44 | for (let _ = 0; _ < (repeatChecksOnProvidedUrls ?? 0); _++) { 45 | await requestQueue.addRequest({ 46 | ...urlData as RequestOptions, 47 | uniqueKey: Math.random().toString(), 48 | }); 49 | } 50 | 51 | const env = Actor.getEnv(); 52 | 53 | const state: ActorCheckDetailedOutput = { 54 | url: urlData.url, 55 | checkerType: 'cheerio', 56 | simplifiedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`, 57 | detailedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`, 58 | totalPages: [], 59 | timedOut: [], 60 | failedToLoadOther: [], 61 | accessDenied: [], 62 | success: [], 63 | statusCodes: {}, 64 | recaptcha: [], 65 | distilCaptcha: [], 66 | hCaptcha: [], 67 | }; 68 | 69 | const crawler = new CheerioCrawler({ 70 | maxRequestRetries: 0, 71 | navigationTimeoutSecs, 72 | maxRequestsPerCrawl: maxNumberOfPagesCheckedPerDomain, 73 | maxConcurrency: maxConcurrentPagesCheckedPerDomain, 74 | requestQueue, 75 | requestHandler: (pageInputs) => handlePage(input, requestQueue, state, pageInputs), 76 | failedRequestHandler: (requestInput) => handleFailedRequest(state, requestInput), 77 | proxyConfiguration: proxy, 78 | useSessionPool: false, 79 | additionalMimeTypes: ['application/xml'], 80 | }); 81 | 82 | // TODO: Consider making this an option in the CheerioCrawler instead of needing to override a function 83 | // We don't want the crawler to throw errors on bad statuses 84 | Reflect.set(crawler, '_throwOnBlockedRequest', () => { 85 | // Do nothing 86 | }); 87 | 88 | await crawler.run(); 89 | 90 | await Actor.setValue('OUTPUT', convertDetailedOutputToSimplified(state)); 91 | await Actor.setValue('DETAILED-OUTPUT', state); 92 | log.info('Checker finished.'); 93 | log.info( 94 | `Simplified output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`, 95 | ); 96 | log.info( 97 | `Detailed output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`, 98 | ); 99 | log.info(`Preview dataset: https://api.apify.com/v2/datasets/${env.defaultDatasetId}/items?clean=true&format=html`); 100 | }); 101 | -------------------------------------------------------------------------------- /checker-cheerio/src/typedefs.ts: -------------------------------------------------------------------------------- 1 | import type { CheerioCrawlingContext } from 'crawlee'; 2 | 3 | type KeysNotRequired = 4 | | 'checkers.cheerio' 5 | | 'checkers.puppeteer' 6 | | 'checkers.playwright' 7 | | 'puppeteer.headfull' 8 | | 'puppeteer.useChrome' 9 | | 'puppeteer.waitFor' 10 | | 'playwright.chrome' 11 | | 'playwright.firefox' 12 
| | 'playwright.webkit' 13 | | 'maxConcurrentDomainsChecked'; 14 | 15 | export type CheerioActorInput = Omit<ActorInputData, KeysNotRequired>; 16 | 17 | export type CheerioCheckerHandlePageInputs = CheerioCrawlingContext 18 | 19 | export interface PseudoUrlInputCustom { 20 | purl: string; 21 | method?: string; 22 | payload?: string; 23 | userData?: Record<string, unknown>; 24 | headers?: Record<string, string>; 25 | } 26 | 27 | export interface UrlInput { 28 | url: string; 29 | method?: string; 30 | payload?: string; 31 | userData?: Record<string, unknown>; 32 | headers?: Record<string, string>; 33 | } 34 | 35 | export interface ProxyConfiguration { 36 | useApifyProxy: boolean; 37 | apifyProxyGroups?: string[]; 38 | apifyProxyCountry?: string; 39 | } 40 | 41 | export interface ActorInputData { 42 | // Crawlers to use 43 | 'checkers.cheerio'?: boolean; 44 | 'checkers.puppeteer'?: boolean; 45 | 'checkers.playwright'?: boolean; 46 | 47 | // Pass these to crawlers 48 | 49 | // save snapshots 50 | saveSnapshot?: boolean; 51 | 52 | // General options 53 | urlsToCheck: UrlInput[]; 54 | proxyConfiguration: ProxyConfiguration; 55 | linkSelector?: string; 56 | pseudoUrls: PseudoUrlInputCustom[]; 57 | repeatChecksOnProvidedUrls?: number; 58 | maxNumberOfPagesCheckedPerDomain: number; 59 | maxConcurrentPagesCheckedPerDomain: number; 60 | maxConcurrentDomainsChecked: number; 61 | retireBrowserInstanceAfterRequestCount: number; 62 | navigationTimeoutSecs: number; 63 | 64 | // Pass only to puppeteer 65 | 'puppeteer.headfull'?: boolean; 66 | 'puppeteer.useChrome'?: boolean; 67 | 'puppeteer.waitFor'?: string; 68 | 69 | // Pass only to playwright 70 | 'playwright.chrome'?: boolean; 71 | 'playwright.firefox'?: boolean; 72 | 'playwright.webkit'?: boolean; 73 | 'playwright.headfull'?: boolean; 74 | 'playwright.useChrome'?: boolean; 75 | 'playwright.waitFor'?: string; 76 | } 77 | 78 | export interface PreparedActorConfig { 79 | actorId: string; 80 | proxyUsed?: string; 81 | url: string; 82 | input: ActorInputData; 83 | params: { 84 | memory: number; 85 | timeout: number; 86 | }; 87 | // This data is set when the config is run 88 | runId?: string; 89 | } 90 | 91 | export interface CreateActorRunConfig { 92 | checkerId: string; 93 | input: ActorInputData; 94 | urlData: UrlInput; 95 | playwrightBrowser?: 'chrome' | 'firefox' | 'webkit'; 96 | } 97 | 98 | // --- OUTPUT --- 99 | 100 | export interface ActorCheckDetailedOutput { 101 | // Set by waitForRunToFinishAndPushData 102 | proxyUsed?: string; 103 | checkerType: 'cheerio' | 'puppeteer' | 'playwright'; 104 | playwrightBrowser?: 'chrome' | 'firefox' | 'webkit'; 105 | computeUnitsUsedForThisCheck?: number; 106 | // (totalPages.length / computeUnitsUsedForThisCheck) yields the amount of pages checkable per compute unit 107 | pagesPerComputeUnit?: number; 108 | 109 | // URLs 110 | url: string; 111 | simplifiedOutput: string; 112 | detailedOutput: string; 113 | 114 | // Page data 115 | totalPages: UrlCheckResult[]; 116 | timedOut: UrlCheckResult[]; 117 | failedToLoadOther: UrlCheckResult[]; 118 | accessDenied: UrlCheckResult[]; 119 | success: UrlCheckResult[]; 120 | 121 | // Status codes 122 | statusCodes: Record<number, UrlCheckResult[]>; 123 | 124 | // Captcha time 125 | recaptcha: UrlCheckResult[]; 126 | distilCaptcha: UrlCheckResult[]; 127 | hCaptcha: UrlCheckResult[]; 128 | } 129 | 130 | export interface UrlCheckResult { 131 | url: string; 132 | screenshotUrl?: string; 133 | htmlUrl?: string; 134 | } 135 | 136 | export type ActorCheckSimplifiedOutput = { 137 | [K in keyof ActorCheckDetailedOutput]: 138 | ActorCheckDetailedOutput[K] extends Array<UrlCheckResult> 139 | ?
number 140 | : ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] } 141 | ? Record<number, number> 142 | : ActorCheckDetailedOutput[K]; 143 | }; 144 | -------------------------------------------------------------------------------- /checker-cheerio/src/utils.ts: -------------------------------------------------------------------------------- 1 | import { Dictionary } from 'crawlee'; 2 | import type { ActorCheckDetailedOutput, ActorCheckSimplifiedOutput } from './typedefs.js'; 3 | 4 | export function convertDetailedOutputToSimplified(data: ActorCheckDetailedOutput): ActorCheckSimplifiedOutput { 5 | const obj: Dictionary = {}; 6 | 7 | for (const [key, value] of Object.entries(data)) { 8 | if (Array.isArray(value)) { 9 | obj[key] = value.length; 10 | } else if (typeof value === 'object') { 11 | if (!obj[key]) { 12 | obj[key] = {}; 13 | } 14 | const nestedObject: Dictionary = obj[key]; 15 | 16 | for (const [statusCode, statusValue] of Object.entries(value)) { 17 | nestedObject[statusCode] = (statusValue as any).length; 18 | } 19 | } else { 20 | obj[key] = value; 21 | } 22 | } 23 | 24 | // @ts-expect-error We are merging the objects 25 | return obj; 26 | } 27 | -------------------------------------------------------------------------------- /checker-cheerio/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@apify/tsconfig", 3 | "compilerOptions": { 4 | "module": "ES2022", 5 | "target": "ES2022", 6 | "outDir": "dist", 7 | "noUnusedLocals": false, 8 | "lib": ["DOM"], 9 | "skipLibCheck": true 10 | }, 11 | "include": [ 12 | "./src/**/*" 13 | ] 14 | } -------------------------------------------------------------------------------- /checker-playwright/.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "env": { 4 | "browser": true, 5 | "es2020": true, 6 | "node": true 7 | }, 8 | "extends": [ 9 | "@apify/eslint-config-ts" 10 | ], 11 | "parserOptions": { 12 | "project": "./tsconfig.json", 13 | "ecmaVersion": 2020 14 | }, 15 | "ignorePatterns": [ 16 | "node_modules", 17 | "dist", 18 | "**/*.d.ts" 19 | ] 20 | } -------------------------------------------------------------------------------- /checker-playwright/.gitignore: -------------------------------------------------------------------------------- 1 | # This file tells Git which files shouldn't be added to source control 2 | 3 | .DS_Store 4 | .idea 5 | dist 6 | node_modules 7 | apify_storage 8 | storage -------------------------------------------------------------------------------- /checker-playwright/Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://crawlee.dev/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node-playwright-chrome:16 AS builder 5 | 6 | # Copy just package.json and package-lock.json 7 | # to speed up the build using Docker layer cache. 8 | COPY --chown=myuser package*.json ./ 9 | 10 | # Install all dependencies. Don't audit to speed up the installation. 11 | RUN npm install --include=dev --audit=false 12 | 13 | # Next, copy the source files using the user set 14 | # in the base image. 15 | COPY --chown=myuser . ./ 16 | 17 | # Install all dependencies and build the project. 18 | # Don't audit to speed up the installation. 
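# (The dependencies were already installed in the step above, so this command only compiles the TypeScript sources into dist/.)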
19 | RUN npm run build 20 | 21 | # Create final image 22 | FROM apify/actor-node-playwright-chrome:16 23 | 24 | # Copy only built JS files from builder image 25 | COPY --from=builder --chown=myuser /home/myuser/dist ./dist 26 | 27 | # Copy just package.json and package-lock.json 28 | # to speed up the build using Docker layer cache. 29 | COPY --chown=myuser package*.json ./ 30 | 31 | # Install NPM packages, skip optional and development dependencies to 32 | # keep the image small. Avoid logging too much and print the dependency 33 | # tree for debugging 34 | RUN npm --quiet set progress=false \ 35 | && npm install --omit=dev --omit=optional \ 36 | && echo "Installed NPM packages:" \ 37 | && (npm list --omit=dev --all || true) \ 38 | && echo "Node.js version:" \ 39 | && node --version \ 40 | && echo "NPM version:" \ 41 | && npm --version \ 42 | && rm -r ~/.npm 43 | 44 | # Next, copy the remaining files and directories with the source code. 45 | # Since we do this after NPM install, quick build will be really fast 46 | # for most source file changes. 47 | COPY --chown=myuser . ./ 48 | 49 | 50 | # Run the image. If you know you won't need headful browsers, 51 | # you can remove the XVFB start script for a micro perf gain. 52 | CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent -------------------------------------------------------------------------------- /checker-playwright/INPUT_SCHEMA.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Web Checker", 3 | "description": "The web checker actor loads URLs to check and checks for common captchas, status codes returned from crawling, as well as calculates the price a user may pay. TODO: Needs to be more descriptive!!", 4 | "type": "object", 5 | "schemaVersion": 1, 6 | "properties": { 7 | "urlsToCheck": { 8 | "title": "URLs to check", 9 | "type": "array", 10 | "description": "A static list of URLs to check for captchas. To be able to add new URLs on the fly, enable the Use request queue option.

For details, see Start URLs in README.", 11 | "sectionCaption": "Checker Options", 12 | "sectionDescription": "Options that will be passed to the checkers", 13 | "editor": "requestListSources", 14 | "prefill": [ 15 | { 16 | "url": "https://www.amazon.com/b?ie=UTF8&node=11392907011" 17 | } 18 | ] 19 | }, 20 | "proxyConfiguration": { 21 | "title": "Proxy Configuration", 22 | "type": "object", 23 | "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.

For details, see Proxy configuration in README.", 24 | "default": {}, 25 | "editor": "proxy", 26 | "prefill": { 27 | "useApifyProxy": false 28 | } 29 | }, 30 | "saveSnapshot": { 31 | "title": "Enabled", 32 | "type": "boolean", 33 | "description": "Will save HTML for Cheerio and HTML + screenshot for Puppeteer/Playwright", 34 | "editor": "checkbox", 35 | "groupCaption": "Save Snapshots" 36 | }, 37 | "linkSelector": { 38 | "title": "Link Selector", 39 | "type": "string", 40 | "description": "A CSS selector saying which links on the page (<a> elements with href attribute) shall be followed and added to the request queue. This setting only applies if Use request queue is enabled. To filter the links added to the queue, use the Pseudo-URLs setting.

If Link selector is empty, the page links are ignored.

For details, see Link selector in README.", 41 | "sectionCaption": "Crawler Options", 42 | "sectionDescription": "Specific options that are relevant for crawlers", 43 | "editor": "textfield", 44 | "prefill": "a[href]", 45 | "minLength": 1 46 | }, 47 | "pseudoUrls": { 48 | "title": "Pseudo-URLs", 49 | "type": "array", 50 | "description": "Specifies what kind of URLs found by Link selector should be added to the request queue. A pseudo-URL is a URL with regular expressions enclosed in [] brackets, e.g. http://www.example.com/[.*]. This setting only applies if the Use request queue option is enabled.

If Pseudo-URLs are omitted, the actor enqueues all links matched by the Link selector.

For details, see Pseudo-URLs in README.", 51 | "default": [], 52 | "editor": "pseudoUrls", 53 | "prefill": [ 54 | { 55 | "purl": "https://www.amazon.com[.*]/dp/[.*]" 56 | } 57 | ] 58 | }, 59 | "repeatChecksOnProvidedUrls": { 60 | "title": "Repeat checks on provided URLs", 61 | "type": "integer", 62 | "description": "Will access each URL multiple times. Useful to test the same URL or bypass blocking of the first page.", 63 | "editor": "number" 64 | }, 65 | "maxNumberOfPagesCheckedPerDomain": { 66 | "title": "Max number of pages checked per domain", 67 | "type": "integer", 68 | "description": "The maximum number of pages that the checker will load. The checker will stop when this limit is reached. It's always a good idea to set this limit in order to prevent excess platform usage for misconfigured scrapers. Note that the actual number of pages loaded might be slightly higher than this value.

If set to 0, there is no limit.", 69 | "default": 100, 70 | "editor": "number" 71 | }, 72 | "maxConcurrentPagesCheckedPerDomain": { 73 | "title": "Maximum concurrent pages checked per domain", 74 | "type": "integer", 75 | "description": "Specifies the maximum number of pages that can be processed by the checker in parallel for one domain. The checker automatically increases and decreases concurrency based on available system resources. This option enables you to set an upper limit, for example to reduce the load on a target website.", 76 | "default": 50, 77 | "editor": "number", 78 | "minimum": 1 79 | }, 80 | "maxConcurrentDomainsChecked": { 81 | "title": "Maximum number of concurrent domains checked", 82 | "type": "integer", 83 | "description": "Specifies the maximum number of domains that should be checked at a time. This setting is relevant when passing in more than one URL to check.", 84 | "default": 5, 85 | "editor": "number", 86 | "minimum": 1, 87 | "maximum": 10 88 | }, 89 | "retireBrowserInstanceAfterRequestCount": { 90 | "title": "Retire browser instance after request count", 91 | "type": "integer", 92 | "description": "How often will the browser itself rotate. Pick a higher number for smaller consumption, pick a lower number to rotate (test) more proxies.", 93 | "default": 10, 94 | "editor": "number", 95 | "minimum": 1 96 | }, 97 | "playwright.chrome": { 98 | "title": "Chrome", 99 | "type": "boolean", 100 | "description": "Use Chrome when checking", 101 | "default": true, 102 | "sectionCaption": "Playwright options", 103 | "sectionDescription": "Options passed to playwright when checking", 104 | "editor": "checkbox", 105 | "groupCaption": "Browser type", 106 | "groupDescription": "Which type of browser should the checker use" 107 | }, 108 | "playwright.firefox": { 109 | "title": "Firefox", 110 | "type": "boolean", 111 | "description": "Use Firefox when checking", 112 | "editor": "checkbox" 113 | }, 114 | "playwright.webkit": { 115 | "title": "Safari (Webkit)", 116 | "type": "boolean", 117 | "description": "Use Safari when checking", 118 | "editor": "checkbox" 119 | }, 120 | "playwright.useChrome": { 121 | "title": "Use Chrome instead of Chromium", 122 | "type": "boolean", 123 | "description": "Only works for Playwright type! Be careful that Chrome is not guaranteed to work with Playwright.", 124 | "editor": "checkbox" 125 | }, 126 | "playwright.headfull": { 127 | "title": "Headfull browser (XVFB)", 128 | "type": "boolean", 129 | "description": "If the browser should be headfull or not", 130 | "editor": "checkbox" 131 | }, 132 | "playwright.waitFor": { 133 | "title": "Wait for", 134 | "type": "string", 135 | "description": "Only works for playwright type. Will wait on each page. You can provide number in ms or a selector.", 136 | "editor": "textfield" 137 | } 138 | }, 139 | "required": ["urlsToCheck"] 140 | } 141 | -------------------------------------------------------------------------------- /checker-playwright/README.md: -------------------------------------------------------------------------------- 1 | # Website Checker Runner with Playwright 2 | 3 | Checks the provided website using Playwright. 
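For a quick start, an input like the following is enough (the field names come from this runner's INPUT_SCHEMA.json; the URL is purely illustrative):

```
{
    "urlsToCheck": [{ "url": "https://www.example.com/" }],
    "proxyConfiguration": { "useApifyProxy": true },
    "playwright.chrome": true,
    "saveSnapshot": true
}
```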
This is a low-level runner; most likely you want to use the high-level master actor: https://apify.com/lukaskrivka/website-checker 4 | -------------------------------------------------------------------------------- /checker-playwright/apify.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "website-checker-playwright", 3 | "version": "0.0.0", 4 | "buildTag": "latest", 5 | "env": null, 6 | "template": "basic" 7 | } 8 | -------------------------------------------------------------------------------- /checker-playwright/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "crawlee-puppeteer-typescript", 3 | "version": "0.0.1", 4 | "type": "module", 5 | "description": "This is an example of an Apify actor.", 6 | "engines": { 7 | "node": ">=16.0.0" 8 | }, 9 | "dependencies": { 10 | "apify": "^3.0.0", 11 | "crawlee": "^3.0.0", 12 | "playwright": "*", 13 | "cheerio": "^1.0.0-rc.10" 14 | }, 15 | "devDependencies": { 16 | "@apify/eslint-config-ts": "^0.2.3", 17 | "@apify/tsconfig": "^0.1.0", 18 | "@typescript-eslint/eslint-plugin": "^5.32.0", 19 | "@typescript-eslint/parser": "^5.32.0", 20 | "eslint": "^8.20.0", 21 | "ts-node": "^10.9.1", 22 | "typescript": "4.7.4" 23 | }, 24 | "scripts": { 25 | "start": "npm run start:dev", 26 | "start:prod": "node dist/main.js", 27 | "start:dev": "ts-node-esm -T src/main.ts", 28 | "build": "tsc", 29 | "lint": "eslint ./src --ext .ts", 30 | "lint:fix": "eslint ./src --ext .ts --fix", 31 | "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" 32 | }, 33 | "author": "It's not you it's me", 34 | "license": "ISC" 35 | } 36 | -------------------------------------------------------------------------------- /checker-playwright/src/checkers.ts: -------------------------------------------------------------------------------- 1 | import type { CheerioAPI } from 'cheerio'; 2 | 3 | export function distilCaptcha($: CheerioAPI): boolean { 4 | return $('#distilCaptchaForm').length > 0 5 | || $('[action*="distil_r_captcha.html"]').length > 0; 6 | } 7 | 8 | export function recaptcha($: CheerioAPI): boolean { 9 | return $('#recaptcha').length > 0 10 | || $('iframe[src*="/recaptcha/"]').length > 0; 11 | } 12 | 13 | export function hCaptcha($: CheerioAPI): boolean { 14 | return $('[action="/errors/validateCaptcha"]').length > 0; 15 | } 16 | 17 | export function accessDenied($: CheerioAPI): boolean { 18 | return $('title').text().includes('Access Denied'); 19 | } 20 | 21 | export function testHtml($: CheerioAPI) { 22 | return { 23 | accessDenied: accessDenied($), 24 | distilCaptcha: distilCaptcha($), 25 | recaptcha: recaptcha($), 26 | hCaptcha: hCaptcha($), 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /checker-playwright/src/handleFailedRequest.ts: -------------------------------------------------------------------------------- 1 | import { log } from 'crawlee'; 2 | 3 | import type { PlaywrightCrawlingContext } from 'crawlee'; 4 | 5 | import type { ActorCheckDetailedOutput } from './typedefs.js'; 6 | 7 | export async function handleFailedRequest(state: ActorCheckDetailedOutput, { request }: PlaywrightCrawlingContext) { 8 | state.totalPages.push({ url: request.url }); 9 | 10 | const [error] = request.errorMessages; 11 | log.warning(`Request failed --- ${request.url}\n${error}`); 12 | 13 | if (error.includes('request timed out')) { 14 | state.timedOut.push({ url: request.url }); 15 | } else { 16
| state.failedToLoadOther.push({ url: request.url }); 17 | } 18 | 19 | // CheerioCrawler obscures status code >=500 to a string message so we have to parse it 20 | const maybeStatusCheerio = error.match(/(\d\d\d) - Internal Server Error/); 21 | if (maybeStatusCheerio) { 22 | const statusCode = Number(maybeStatusCheerio[1]); 23 | state.statusCodes[statusCode] ??= []; 24 | state.statusCodes[statusCode].push({ url: request.url }); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /checker-playwright/src/handlePage.ts: -------------------------------------------------------------------------------- 1 | import { Actor } from 'apify'; 2 | import Cheerio from 'cheerio'; 3 | 4 | import { PseudoUrl } from 'crawlee'; 5 | import type { RequestQueue } from 'apify'; 6 | import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee'; 7 | 8 | import { testHtml } from './checkers.js'; 9 | 10 | import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs.js'; 11 | 12 | const env = Actor.getEnv(); 13 | 14 | export async function handlePage( 15 | input: PlaywrightActorInput, 16 | requestQueue: RequestQueue, 17 | state: ActorCheckDetailedOutput, 18 | { request, response, page, crawler }: PlaywrightCrawlingContext, 19 | ): Promise<void> { 20 | let htmlUrl; 21 | let screenshotUrl; 22 | 23 | const waitFor = input['playwright.waitFor']; 24 | 25 | if (waitFor) { 26 | // We wait for a number in ms or a selector 27 | const maybeNumber = Number(waitFor); 28 | if (maybeNumber || maybeNumber === 0) { 29 | await page.waitForTimeout(maybeNumber); 30 | } else { 31 | await page.waitForSelector(waitFor); 32 | } 33 | } 34 | 35 | const html = await page.content(); 36 | 37 | if (input.saveSnapshot) { 38 | const key = `SNAPSHOT-${Math.random().toString()}`; 39 | const screenshot = await page.screenshot({ fullPage: true }); 40 | 41 | // TODO: Create a utils.playwright.saveSnapshot, like we have for puppeteer 42 | await Actor.setValue(`${key}.html`, html, { contentType: 'text/html' }); 43 | await Actor.setValue(`${key}.png`, screenshot, { contentType: 'image/png' }); 44 | 45 | screenshotUrl = `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/${key}.png?disableRedirect=true`; 46 | htmlUrl = `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/${key}.html?disableRedirect=true`; 47 | } 48 | 49 | state.totalPages.push({ url: request.url, htmlUrl, screenshotUrl }); 50 | 51 | const statusCode = response!.status(); 52 | 53 | state.statusCodes[statusCode] ??= []; 54 | state.statusCodes[statusCode].push({ url: request.url, htmlUrl, screenshotUrl }); 55 | 56 | const $ = Cheerio.load(html); 57 | 58 | const captchas: string[] = []; 59 | const testResult = testHtml($); 60 | 61 | for (const testResultEntry of Object.entries(testResult)) { 62 | const wasFound = testResultEntry[1]; 63 | const testCase = testResultEntry[0] as 'accessDenied' | 'distilCaptcha' | 'recaptcha' | 'hCaptcha'; 64 | if (wasFound) { 65 | captchas.push(testCase); 66 | 67 | state[testCase].push({ url: request.url, htmlUrl }); 68 | } 69 | } 70 | 71 | const wasSuccess = statusCode < 400 && captchas.length === 0; 72 | if (wasSuccess) { 73 | state.success.push({ url: request.url, htmlUrl, screenshotUrl }); 74 | } 75 | 76 | await Actor.pushData({ 77 | url: request.url, 78 | htmlUrl, 79 | screenshotUrl, 80 | statusCode, 81 | captchas, 82 | wasSuccess, 83 | }); 84 | 85 | const pageOrigin = new URL(request.url).origin; 86 | 87 | if (input.linkSelector &&
!!$) { 88 | const info = await requestQueue.getInfo(); 89 | 90 | const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount; 91 | if (maxUrlsToEnqueue > 0) { 92 | const toEnqueue: RequestOptions[] = []; 93 | $(input.linkSelector).each((_, el) => { 94 | const rawHref = $(el).attr('href'); 95 | if (!rawHref) { 96 | return; 97 | } 98 | const href = new URL(rawHref, pageOrigin).toString(); 99 | for (const pseudoUrlInput of input.pseudoUrls) { 100 | if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) { 101 | const newUrl = new URL(href, request.loadedUrl).toString(); 102 | toEnqueue.push({ 103 | url: newUrl, 104 | headers: pseudoUrlInput.headers, 105 | method: pseudoUrlInput.method as 'GET' | 'POST', 106 | payload: pseudoUrlInput.payload, 107 | userData: pseudoUrlInput.userData, 108 | }); 109 | } 110 | } 111 | }); 112 | console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`); 113 | await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue)); 114 | } 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /checker-playwright/src/main.ts: -------------------------------------------------------------------------------- 1 | import { Actor } from 'apify'; 2 | import { log, PlaywrightCrawler, RequestOptions } from 'crawlee'; 3 | import { chromium, firefox, webkit } from 'playwright'; 4 | import { inspect } from 'util'; 5 | 6 | import type { ActorCheckDetailedOutput, PlaywrightActorInput } from './typedefs'; 7 | 8 | import { handleFailedRequest } from './handleFailedRequest.js'; 9 | import { handlePage } from './handlePage.js'; 10 | import { convertDetailedOutputToSimplified } from './utils.js'; 11 | 12 | const env = Actor.getEnv(); 13 | 14 | Actor.main(async () => { 15 | const input = await Actor.getInput() as PlaywrightActorInput; 16 | 17 | log.info('Input provided:'); 18 | log.debug(inspect(input, false, 4)); 19 | 20 | log.info('Running a Playwright Checker.'); 21 | 22 | const { 23 | maxConcurrentPagesCheckedPerDomain, 24 | maxNumberOfPagesCheckedPerDomain, 25 | proxyConfiguration, 26 | urlsToCheck, 27 | repeatChecksOnProvidedUrls, 28 | retireBrowserInstanceAfterRequestCount, 29 | 'playwright.useChrome': useChrome, 30 | 'playwright.headfull': headfull, 31 | 'playwright.chrome': playwrightChromeLauncher, 32 | 'playwright.firefox': playwrightFirefoxLauncher, 33 | 'playwright.webkit': playwrightWebkitLauncher, 34 | } = input; 35 | 36 | let launcher; 37 | 38 | if (playwrightChromeLauncher) { 39 | launcher = chromium; 40 | } else if (playwrightFirefoxLauncher) { 41 | launcher = firefox; 42 | } else if (playwrightWebkitLauncher) { 43 | launcher = webkit; 44 | } 45 | 46 | const proxy = await Actor.createProxyConfiguration({ 47 | groups: proxyConfiguration.apifyProxyGroups, 48 | countryCode: proxyConfiguration.apifyProxyCountry, 49 | }); 50 | 51 | const requestQueue = await Actor.openRequestQueue(); 52 | 53 | const [urlData] = urlsToCheck; 54 | await requestQueue.addRequest(urlData as RequestOptions); 55 | for (let _ = 0; _ < (repeatChecksOnProvidedUrls ?? 
0); _++) { 56 | await requestQueue.addRequest({ 57 | ...urlData, 58 | uniqueKey: Math.random().toString(), 59 | } as RequestOptions); 60 | } 61 | 62 | const state: ActorCheckDetailedOutput = { 63 | url: urlData.url, 64 | checkerType: 'playwright', 65 | simplifiedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`, 66 | detailedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`, 67 | totalPages: [], 68 | timedOut: [], 69 | failedToLoadOther: [], 70 | accessDenied: [], 71 | success: [], 72 | statusCodes: {}, 73 | recaptcha: [], 74 | distilCaptcha: [], 75 | hCaptcha: [], 76 | }; 77 | 78 | const crawler = new PlaywrightCrawler({ 79 | maxRequestRetries: 0, 80 | maxRequestsPerCrawl: maxNumberOfPagesCheckedPerDomain, 81 | maxConcurrency: maxConcurrentPagesCheckedPerDomain, 82 | requestQueue, 83 | requestHandler: (pageInputs) => handlePage(input, requestQueue, state, pageInputs), 84 | failedRequestHandler: (requestInput) => handleFailedRequest(state, requestInput), 85 | proxyConfiguration: proxy, 86 | useSessionPool: false, 87 | launchContext: { 88 | useChrome, 89 | launchOptions: { 90 | headless: headfull ? undefined : true, 91 | }, 92 | launcher, 93 | }, 94 | browserPoolOptions: { 95 | retireBrowserAfterPageCount: retireBrowserInstanceAfterRequestCount, 96 | }, 97 | }); 98 | 99 | await crawler.run(); 100 | 101 | await Actor.setValue('OUTPUT', convertDetailedOutputToSimplified(state)); 102 | await Actor.setValue('DETAILED-OUTPUT', state); 103 | log.info('Checker finished.'); 104 | log.info( 105 | `Simplified output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`, 106 | ); 107 | log.info( 108 | `Detailed output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`, 109 | ); 110 | log.info(`Preview dataset: https://api.apify.com/v2/datasets/${env.defaultDatasetId}/items?clean=true&format=html`); 111 | }); 112 | -------------------------------------------------------------------------------- /checker-playwright/src/typedefs.ts: -------------------------------------------------------------------------------- 1 | type KeysNotRequired = 2 | | 'checkers.cheerio' 3 | | 'checkers.puppeteer' 4 | | 'checkers.playwright' 5 | | 'puppeteer.headfull' 6 | | 'puppeteer.useChrome' 7 | | 'puppeteer.waitFor' 8 | | 'maxConcurrentDomainsChecked'; 9 | 10 | export type PlaywrightActorInput = Omit<ActorInputData, KeysNotRequired>; 11 | 12 | export interface PseudoUrlInputCustom { 13 | purl: string; 14 | method?: string; 15 | payload?: string; 16 | userData?: Record<string, unknown>; 17 | headers?: Record<string, string>; 18 | } 19 | 20 | export interface UrlInput { 21 | url: string; 22 | method?: string; 23 | payload?: string; 24 | userData?: Record<string, unknown>; 25 | headers?: Record<string, string>; 26 | } 27 | 28 | export interface ProxyConfiguration { 29 | useApifyProxy: boolean; 30 | apifyProxyGroups?: string[]; 31 | apifyProxyCountry?: string; 32 | } 33 | 34 | export interface ActorInputData { 35 | // Crawlers to use 36 | 'checkers.cheerio'?: boolean; 37 | 'checkers.puppeteer'?: boolean; 38 | 'checkers.playwright'?: boolean; 39 | 40 | // Pass these to crawlers 41 | 42 | // save snapshots 43 | saveSnapshot?: boolean; 44 | 45 | // General options 46 | urlsToCheck: UrlInput[]; 47 | proxyConfiguration: ProxyConfiguration; 48 | linkSelector?: string; 49 | pseudoUrls: PseudoUrlInputCustom[]; 50 | repeatChecksOnProvidedUrls?: number; 51 |
maxNumberOfPagesCheckedPerDomain: number; 52 | maxConcurrentPagesCheckedPerDomain: number; 53 | maxConcurrentDomainsChecked: number; 54 | retireBrowserInstanceAfterRequestCount: number; 55 | 56 | // Pass only to puppeteer 57 | 'puppeteer.headfull'?: boolean; 58 | 'puppeteer.useChrome'?: boolean; 59 | 'puppeteer.waitFor'?: string; 60 | 61 | // Pass only to playwright 62 | 'playwright.chrome'?: boolean; 63 | 'playwright.firefox'?: boolean; 64 | 'playwright.webkit'?: boolean; 65 | 'playwright.headfull'?: boolean; 66 | 'playwright.useChrome'?: boolean; 67 | 'playwright.waitFor'?: string; 68 | } 69 | 70 | export interface PreparedActorConfig { 71 | actorId: string; 72 | proxyUsed?: string; 73 | url: string; 74 | input: ActorInputData; 75 | params: { 76 | memory: number; 77 | timeout: number; 78 | }; 79 | // This data is set when the config is run 80 | runId?: string; 81 | } 82 | 83 | export interface CreateActorRunConfig { 84 | checkerId: string; 85 | input: ActorInputData; 86 | urlData: UrlInput; 87 | playwrightBrowser?: 'chrome' | 'firefox' | 'webkit'; 88 | } 89 | 90 | // --- OUTPUT --- 91 | 92 | export interface ActorCheckDetailedOutput { 93 | // Set by waitForRunToFinishAndPushData 94 | proxyUsed?: string; 95 | checkerType: 'cheerio' | 'puppeteer' | 'playwright'; 96 | playwrightBrowser?: 'chrome' | 'firefox' | 'webkit'; 97 | computeUnitsUsedForThisCheck?: number; 98 | // (totalPages.length / computeUnitsUsedForThisCheck) yields the amount of pages checkable per compute unit 99 | pagesPerComputeUnit?: number; 100 | 101 | // URLs 102 | url: string; 103 | simplifiedOutput: string; 104 | detailedOutput: string; 105 | 106 | // Page data 107 | totalPages: UrlCheckResult[]; 108 | timedOut: UrlCheckResult[]; 109 | failedToLoadOther: UrlCheckResult[]; 110 | accessDenied: UrlCheckResult[]; 111 | success: UrlCheckResult[]; 112 | 113 | // Status codes 114 | statusCodes: Record<number, UrlCheckResult[]>; 115 | 116 | // Captcha time 117 | recaptcha: UrlCheckResult[]; 118 | distilCaptcha: UrlCheckResult[]; 119 | hCaptcha: UrlCheckResult[]; 120 | } 121 | 122 | export interface UrlCheckResult { 123 | url: string; 124 | screenshotUrl?: string; 125 | htmlUrl?: string; 126 | } 127 | 128 | export type ActorCheckSimplifiedOutput = { 129 | [K in keyof ActorCheckDetailedOutput]: 130 | ActorCheckDetailedOutput[K] extends Array<UrlCheckResult> 131 | ? number 132 | : ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] } 133 | ?
Record<number, number> 134 | : ActorCheckDetailedOutput[K]; 135 | }; 136 | -------------------------------------------------------------------------------- /checker-playwright/src/utils.ts: -------------------------------------------------------------------------------- 1 | import { Dictionary } from 'crawlee'; 2 | import type { ActorCheckDetailedOutput, ActorCheckSimplifiedOutput } from './typedefs.js'; 3 | 4 | export function convertDetailedOutputToSimplified(data: ActorCheckDetailedOutput): ActorCheckSimplifiedOutput { 5 | const obj: Dictionary = {}; 6 | 7 | for (const [key, value] of Object.entries(data)) { 8 | if (Array.isArray(value)) { 9 | obj[key] = value.length; 10 | } else if (typeof value === 'object') { 11 | if (!obj[key]) { 12 | obj[key] = {}; 13 | } 14 | const nestedObject: Dictionary = obj[key]; 15 | 16 | for (const [statusCode, statusValue] of Object.entries(value)) { 17 | nestedObject[statusCode] = (statusValue as any).length; 18 | } 19 | } else { 20 | obj[key] = value; 21 | } 22 | } 23 | 24 | // @ts-expect-error We are merging the objects 25 | return obj; 26 | } 27 | -------------------------------------------------------------------------------- /checker-playwright/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@apify/tsconfig", 3 | "compilerOptions": { 4 | "module": "ES2022", 5 | "target": "ES2022", 6 | "outDir": "dist", 7 | "noUnusedLocals": false, 8 | "lib": ["DOM"] 9 | }, 10 | "include": [ 11 | "./src/**/*" 12 | ] 13 | } -------------------------------------------------------------------------------- /checker-puppeteer/.gitignore: -------------------------------------------------------------------------------- 1 | # This file tells Git which files shouldn't be added to source control 2 | 3 | .DS_Store 4 | .idea 5 | dist 6 | node_modules 7 | apify_storage 8 | storage -------------------------------------------------------------------------------- /checker-puppeteer/Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://crawlee.dev/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node-puppeteer-chrome:16 AS builder 5 | 6 | # Copy just package.json and package-lock.json 7 | # to speed up the build using Docker layer cache. 8 | COPY --chown=myuser package*.json ./ 9 | 10 | # Install all dependencies. Don't audit to speed up the installation. 11 | RUN npm install --include=dev --audit=false 12 | 13 | # Next, copy the source files using the user set 14 | # in the base image. 15 | COPY --chown=myuser . ./ 16 | 17 | # Install all dependencies and build the project. 18 | # Don't audit to speed up the installation. 19 | RUN npm run build 20 | 21 | # Create final image 22 | FROM apify/actor-node-puppeteer-chrome:16 23 | 24 | # Copy only built JS files from builder image 25 | COPY --from=builder --chown=myuser /home/myuser/dist ./dist 26 | 27 | # Copy just package.json and package-lock.json 28 | # to speed up the build using Docker layer cache. 29 | COPY --chown=myuser package*.json ./ 30 | 31 | # Install NPM packages, skip optional and development dependencies to 32 | # keep the image small.
33 | # tree for debugging
34 | RUN npm --quiet set progress=false \
35 |     && npm install --omit=dev --omit=optional \
36 |     && echo "Installed NPM packages:" \
37 |     && (npm list --omit=dev --all || true) \
38 |     && echo "Node.js version:" \
39 |     && node --version \
40 |     && echo "NPM version:" \
41 |     && npm --version \
42 |     && rm -r ~/.npm
43 | 
44 | # Next, copy the remaining files and directories with the source code.
45 | # Since we do this after NPM install, quick build will be really fast
46 | # for most source file changes.
47 | COPY --chown=myuser . ./
48 | 
49 | 
50 | # Run the image. If you know you won't need headful browsers,
51 | # you can remove the XVFB start script for a micro perf gain.
52 | CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
--------------------------------------------------------------------------------
/checker-puppeteer/INPUT_SCHEMA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "title": "Web Checker",
 3 |     "description": "The web checker actor loads URLs to check and checks for common captchas and status codes returned from crawling, and calculates the price a user may pay. TODO: Needs to be more descriptive!!",
 4 |     "type": "object",
 5 |     "schemaVersion": 1,
 6 |     "properties": {
 7 |         "urlsToCheck": {
 8 |             "title": "URLs to check",
 9 |             "type": "array",
10 |             "description": "A static list of URLs to check for captchas. To be able to add new URLs on the fly, enable the Use request queue option.

For details, see Start URLs in README.", 11 | "sectionCaption": "Checker Options", 12 | "sectionDescription": "Options that will be passed to the checkers", 13 | "editor": "requestListSources", 14 | "prefill": [ 15 | { 16 | "url": "https://www.amazon.com/b?ie=UTF8&node=11392907011" 17 | } 18 | ] 19 | }, 20 | "proxyConfiguration": { 21 | "title": "Proxy Configuration", 22 | "type": "object", 23 | "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.

For details, see Proxy configuration in README.", 24 | "default": {}, 25 | "editor": "proxy", 26 | "prefill": { 27 | "useApifyProxy": false 28 | } 29 | }, 30 | "saveSnapshot": { 31 | "title": "Enabled", 32 | "type": "boolean", 33 | "description": "Will save HTML for Cheerio and HTML + screenshot for Puppeteer/Playwright", 34 | "editor": "checkbox", 35 | "groupCaption": "Save Snapshots" 36 | }, 37 | "linkSelector": { 38 | "title": "Link Selector", 39 | "type": "string", 40 | "description": "A CSS selector saying which links on the page (<a> elements with href attribute) shall be followed and added to the request queue. This setting only applies if Use request queue is enabled. To filter the links added to the queue, use the Pseudo-URLs setting.

If Link selector is empty, the page links are ignored.

For details, see Link selector in README.", 41 | "sectionCaption": "Crawler Options", 42 | "sectionDescription": "Specific options that are relevant for crawlers", 43 | "editor": "textfield", 44 | "prefill": "a[href]", 45 | "minLength": 1 46 | }, 47 | "pseudoUrls": { 48 | "title": "Pseudo-URLs", 49 | "type": "array", 50 | "description": "Specifies what kind of URLs found by Link selector should be added to the request queue. A pseudo-URL is a URL with regular expressions enclosed in [] brackets, e.g. http://www.example.com/[.*]. This setting only applies if the Use request queue option is enabled.

If Pseudo-URLs are omitted, the actor enqueues all links matched by the Link selector.

For details, see Pseudo-URLs in README.", 51 | "default": [], 52 | "editor": "pseudoUrls", 53 | "prefill": [ 54 | { 55 | "purl": "https://www.amazon.com[.*]/dp/[.*]" 56 | } 57 | ] 58 | }, 59 | "repeatChecksOnProvidedUrls": { 60 | "title": "Repeat checks on provided URLs", 61 | "type": "integer", 62 | "description": "Will access each URL multiple times. Useful to test the same URL or bypass blocking of the first page.", 63 | "editor": "number" 64 | }, 65 | "maxNumberOfPagesCheckedPerDomain": { 66 | "title": "Max number of pages checked per domain", 67 | "type": "integer", 68 | "description": "The maximum number of pages that the checker will load. The checker will stop when this limit is reached. It's always a good idea to set this limit in order to prevent excess platform usage for misconfigured scrapers. Note that the actual number of pages loaded might be slightly higher than this value.

If set to 0, there is no limit.",
 69 |             "default": 100,
 70 |             "editor": "number"
 71 |         },
 72 |         "maxConcurrentPagesCheckedPerDomain": {
 73 |             "title": "Maximum concurrent pages checked per domain",
 74 |             "type": "integer",
 75 |             "description": "Specifies the maximum number of pages that can be processed by the checker in parallel for one domain. The checker automatically increases and decreases concurrency based on available system resources. This option enables you to set an upper limit, for example to reduce the load on a target website.",
 76 |             "default": 50,
 77 |             "editor": "number",
 78 |             "minimum": 1
 79 |         },
 80 |         "maxConcurrentDomainsChecked": {
 81 |             "title": "Maximum number of concurrent domains checked",
 82 |             "type": "integer",
 83 |             "description": "Specifies the maximum number of domains that should be checked at a time. This setting is relevant when passing in more than one URL to check.",
 84 |             "default": 5,
 85 |             "editor": "number",
 86 |             "minimum": 1,
 87 |             "maximum": 10
 88 |         },
 89 |         "retireBrowserInstanceAfterRequestCount": {
 90 |             "title": "Retire browser instance after request count",
 91 |             "type": "integer",
 92 |             "description": "How often the browser itself will rotate. Pick a higher number for lower consumption; pick a lower number to rotate (and therefore test) more proxies.",
 93 |             "default": 10,
 94 |             "editor": "number",
 95 |             "minimum": 1
 96 |         },
 97 |         "puppeteer.headfull": {
 98 |             "title": "Headfull browser (XVFB)",
 99 |             "type": "boolean",
100 |             "description": "Only works for the Puppeteer type!",
101 |             "sectionCaption": "Puppeteer Options",
102 |             "sectionDescription": "Options that are passed in to puppeteer when checking",
103 |             "editor": "checkbox"
104 |         },
105 |         "puppeteer.useChrome": {
106 |             "title": "Use Chrome",
107 |             "type": "boolean",
108 |             "description": "Only works for the Puppeteer type! Be careful that Chrome is not guaranteed to work with Puppeteer.",
109 |             "editor": "checkbox"
110 |         },
111 |         "puppeteer.waitFor": {
112 |             "title": "Wait for",
113 |             "type": "string",
114 |             "description": "Only works for the Puppeteer type. Will wait on each page. You can provide a number in ms or a CSS selector.",
115 |             "editor": "textfield"
116 |         }
117 |     },
118 |     "required": ["urlsToCheck"]
119 | }
120 | 
--------------------------------------------------------------------------------
/checker-puppeteer/README.md:
--------------------------------------------------------------------------------
1 | # Website Checker Runner with Puppeteer
2 | 
3 | Checks the provided website using Puppeteer. This is a low-level runner; most likely you want to use the high-level master actor instead: https://apify.com/lukaskrivka/website-checker
--------------------------------------------------------------------------------
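For orientation, a minimal input for this runner is sketched below as a TypeScript object. The field names follow the INPUT_SCHEMA.json above; the values are simply the schema's prefills and defaults, not recommendations:

```
// Illustrative input for the checker-puppeteer runner (values from the schema prefills).
const input = {
    urlsToCheck: [{ url: 'https://www.amazon.com/b?ie=UTF8&node=11392907011' }],
    proxyConfiguration: { useApifyProxy: false },
    saveSnapshot: true,
    linkSelector: 'a[href]',
    pseudoUrls: [{ purl: 'https://www.amazon.com[.*]/dp/[.*]' }],
    maxNumberOfPagesCheckedPerDomain: 100,
    maxConcurrentPagesCheckedPerDomain: 50,
    retireBrowserInstanceAfterRequestCount: 10,
    'puppeteer.waitFor': '2000',
};
```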
/checker-puppeteer/apify.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "checker-puppeteer",
3 |     "version": "0.0",
4 |     "buildTag": "latest",
5 |     "env": null
6 | }
7 | 
--------------------------------------------------------------------------------
/checker-puppeteer/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "crawlee-puppeteer-typescript",
 3 |     "version": "0.0.1",
 4 |     "type": "module",
 5 |     "description": "This is an example of an Apify actor.",
 6 |     "engines": {
 7 |         "node": ">=16.0.0"
 8 |     },
 9 |     "dependencies": {
10 |         "apify": "^3.0.0",
11 |         "crawlee": "^3.0.0",
12 |         "puppeteer": "*",
13 |         "cheerio": "^1.0.0-rc.10"
14 |     },
15 |     "devDependencies": {
16 |         "@apify/eslint-config-ts": "^0.2.3",
17 |         "@apify/tsconfig": "^0.1.0",
18 |         "@typescript-eslint/eslint-plugin": "^5.32.0",
19 |         "@typescript-eslint/parser": "^5.32.0",
20 |         "eslint": "^8.20.0",
21 |         "ts-node": "^10.9.1",
22 |         "typescript": "4.7.4"
23 |     },
24 |     "scripts": {
25 |         "start": "npm run start:dev",
26 |         "start:prod": "node dist/main.js",
27 |         "start:dev": "ts-node-esm -T src/main.ts",
28 |         "build": "tsc",
29 |         "lint": "eslint ./src --ext .ts",
30 |         "lint:fix": "eslint ./src --ext .ts --fix",
31 |         "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
32 |     },
33 |     "author": "It's not you it's me",
34 |     "license": "ISC"
35 | }
36 | 
--------------------------------------------------------------------------------
/checker-puppeteer/src/checkers.ts:
--------------------------------------------------------------------------------
 1 | import type { CheerioAPI } from 'cheerio';
 2 | 
 3 | export function distilCaptcha($: CheerioAPI): boolean {
 4 |     return $('#distilCaptchaForm').length > 0
 5 |         || $('[action*="distil_r_captcha.html"]').length > 0;
 6 | }
 7 | 
 8 | export function recaptcha($: CheerioAPI): boolean {
 9 |     return $('#recaptcha').length > 0
10 |         || $('iframe[src*="/recaptcha/"]').length > 0;
11 | }
12 | 
13 | export function hCaptcha($: CheerioAPI): boolean {
14 |     return $('[action="/errors/validateCaptcha"]').length > 0;
15 | }
16 | 
17 | export function accessDenied($: CheerioAPI): boolean {
18 |     return $('title').text().includes('Access Denied');
19 | }
20 | 
21 | export function testHtml($: CheerioAPI) {
22 |     return {
23 |         accessDenied: accessDenied($),
24 |         distilCaptcha: distilCaptcha($),
25 |         recaptcha: recaptcha($),
26 |         hCaptcha: hCaptcha($),
27 |     };
28 | }
--------------------------------------------------------------------------------
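A quick sketch of how these detectors behave. The page handler below feeds them page HTML through Cheerio; here the HTML is a contrived one-liner that only trips the `Access Denied` title check:

```
import Cheerio from 'cheerio';
import { testHtml } from './checkers.js';

// A page whose <title> trips the accessDenied detector; no captcha markers present.
const $ = Cheerio.load('<html><head><title>Access Denied</title></head><body></body></html>');

console.log(testHtml($));
// -> { accessDenied: true, distilCaptcha: false, recaptcha: false, hCaptcha: false }
```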
/checker-puppeteer/src/handleFailedRequest.ts:
--------------------------------------------------------------------------------
 1 | import { log } from 'crawlee';
 2 | 
 3 | import type { PuppeteerCrawlingContext } from 'crawlee';
 4 | 
 5 | import { ActorCheckDetailedOutput } from './typedefs.js';
 6 | 
 7 | export async function handleFailedRequest(state: ActorCheckDetailedOutput, { request }: PuppeteerCrawlingContext) {
 8 |     state.totalPages.push({ url: request.url });
 9 | 
10 |     const [error] = request.errorMessages;
11 |     log.warning(`Request failed --- ${request.url}\n${error}`);
12 | 
13 |     if (error.includes('request timed out')) {
14 |         state.timedOut.push({ url: request.url });
15 |     } else {
16 |         state.failedToLoadOther.push({ url: request.url });
17 |     }
18 | 
19 |     // The crawler obscures status codes >= 500 into a string error message (same handling as the Cheerio checker), so we parse the code back out
20 |     const maybeStatusCheerio = error.match(/(\d\d\d) - Internal Server Error/);
21 |     if (maybeStatusCheerio) {
22 |         const statusCode = Number(maybeStatusCheerio[1]);
23 |         state.statusCodes[statusCode] ??= [];
24 |         state.statusCodes[statusCode].push({ url: request.url });
25 |     }
26 | }
27 | 
--------------------------------------------------------------------------------
/checker-puppeteer/src/handlePage.ts:
--------------------------------------------------------------------------------
  1 | import { Actor } from 'apify';
  2 | import Cheerio from 'cheerio';
  3 | import { testHtml } from './checkers.js';
  4 | import { puppeteerUtils, PseudoUrl } from 'crawlee';
  5 | 
  6 | import type { RequestQueue } from 'apify';
  7 | import type { PuppeteerCrawlingContext, RequestOptions } from 'crawlee';
  8 | 
  9 | import type { ActorCheckDetailedOutput, PuppeteerActorInput } from './typedefs.js';
 10 | 
 11 | export async function handlePage(
 12 |     input: PuppeteerActorInput,
 13 |     requestQueue: RequestQueue,
 14 |     state: ActorCheckDetailedOutput,
 15 |     { request, response, page, crawler }: PuppeteerCrawlingContext
 16 | ): Promise<void> {
 17 |     let htmlUrl;
 18 |     let screenshotUrl;
 19 | 
 20 |     const waitFor = input['puppeteer.waitFor'];
 21 | 
 22 |     if (waitFor) {
 23 |         // We wait for a number in ms or a selector
 24 |         const maybeNumber = Number(waitFor);
 25 |         if (maybeNumber || maybeNumber === 0) {
 26 |             await page.waitForTimeout(maybeNumber);
 27 |         } else {
 28 |             await page.waitForSelector(waitFor);
 29 |         }
 30 |     }
 31 | 
 32 |     if (input.saveSnapshot) {
 33 |         const key = `SNAPSHOT-${Math.random().toString()}`;
 34 |         await puppeteerUtils.saveSnapshot(page, { key });
 35 |         screenshotUrl = `https://api.apify.com/v2/key-value-stores/${Actor.getEnv().defaultKeyValueStoreId}/records/${key}.jpg?disableRedirect=true`;
 36 |         htmlUrl = `https://api.apify.com/v2/key-value-stores/${Actor.getEnv().defaultKeyValueStoreId}/records/${key}.html?disableRedirect=true`;
 37 |     }
 38 | 
 39 |     state.totalPages.push({ url: request.url, htmlUrl, screenshotUrl });
 40 | 
 41 |     const statusCode = response!.status();
 42 | 
 43 |     state.statusCodes[statusCode] ??= [];
 44 |     state.statusCodes[statusCode].push({ url: request.url, htmlUrl, screenshotUrl });
 45 | 
 46 |     const html = await page.content();
 47 |     const $ = Cheerio.load(html);
 48 | 
 49 |     const captchas: string[] = [];
 50 |     const testResult = testHtml($);
 51 | 
 52 |     for (const testResultEntry of Object.entries(testResult)) {
 53 |         const wasFound = testResultEntry[1];
 54 |         const testCase = testResultEntry[0] as 'accessDenied' | 'distilCaptcha' | 'recaptcha' | 'hCaptcha';
 55 |         if (wasFound) {
 56 |             captchas.push(testCase);
 57 | 
 58 |             state[testCase].push({ url: request.url, htmlUrl });
 59 |         }
 60 |     }
 61 | 
 62 |     const wasSuccess = statusCode < 400 && captchas.length === 0;
 63 |     if (wasSuccess) {
 64 |         state.success.push({ url: request.url, htmlUrl, screenshotUrl });
 65 |     }
 66 | 
 67 |     await Actor.pushData({
 68 |         url: request.url,
 69 |         htmlUrl,
 70 |         screenshotUrl,
 71 |         statusCode,
 72 |         captchas,
 73 |         wasSuccess,
 74 |     });
 75 | 
 76 |     const pageOrigin = new URL(request.url).origin;
 77 | 
 78 |     if (input.linkSelector && !!$) {
 79 |         const info = await requestQueue.getInfo();
 80 | 
 81 |         const maxUrlsToEnqueue = input.maxNumberOfPagesCheckedPerDomain - info!.totalRequestCount;
 82 |         if (maxUrlsToEnqueue > 0) {
 83 |             const toEnqueue: RequestOptions[] = [];
 84 |             $(input.linkSelector).each((_, el) => {
 85 |                 const rawHref = $(el).attr('href');
 86 |                 if (!rawHref) {
 87 |                     return;
 88 |                 }
 89 |                 const href = new URL(rawHref, pageOrigin).toString();
 90 |                 for (const pseudoUrlInput of input.pseudoUrls) {
 91 |                     if (href && new PseudoUrl(pseudoUrlInput.purl).matches(href)) {
 92 |                         const newUrl = new URL(href, request.loadedUrl).toString();
 93 |                         toEnqueue.push({
 94 |                             url: newUrl,
 95 |                             headers: pseudoUrlInput.headers,
 96 |                             method: pseudoUrlInput.method as 'GET' | 'POST',
 97 |                             payload: pseudoUrlInput.payload,
 98 |                             userData: pseudoUrlInput.userData,
 99 |                         });
100 |                     }
101 |                 }
102 |             });
103 |             console.log(`Found ${toEnqueue.length} links to enqueue on ${request.url}.`);
104 |             await crawler.addRequests(toEnqueue.slice(0, maxUrlsToEnqueue));
105 |         }
106 |     }
107 | }
108 | 
--------------------------------------------------------------------------------
/checker-puppeteer/src/main.ts:
--------------------------------------------------------------------------------
 1 | import { Actor } from 'apify';
 2 | import { log, PuppeteerCrawler, RequestOptions } from 'crawlee';
 3 | 
 4 | import type { ActorCheckDetailedOutput, PuppeteerActorInput } from './typedefs.js';
 5 | 
 6 | import { inspect } from 'util';
 7 | import { handleFailedRequest } from './handleFailedRequest.js';
 8 | import { handlePage } from './handlePage.js';
 9 | import { convertDetailedOutputToSimplified } from './utils.js';
10 | 
11 | Actor.main(async () => {
12 |     const input = await Actor.getInput() as PuppeteerActorInput;
13 | 
14 |     // Log the input
15 |     log.info('Input provided:');
16 |     log.debug(inspect(input, false, 4));
17 | 
18 |     log.info('Running a Puppeteer Checker.');
19 | 
20 |     const env = Actor.getEnv();
21 | 
22 |     const {
23 |         maxConcurrentPagesCheckedPerDomain,
24 |         maxNumberOfPagesCheckedPerDomain,
25 |         proxyConfiguration,
26 |         urlsToCheck,
27 |         repeatChecksOnProvidedUrls,
28 |         retireBrowserInstanceAfterRequestCount,
29 |         'puppeteer.useChrome': useChrome,
30 |         'puppeteer.headfull': headfull,
31 |     } = input;
32 | 
33 |     const proxy = await Actor.createProxyConfiguration({
34 |         groups: proxyConfiguration.apifyProxyGroups,
35 |         countryCode: proxyConfiguration.apifyProxyCountry,
36 |     });
37 | 
38 |     const requestQueue = await Actor.openRequestQueue();
39 | 
40 |     const [urlData] = urlsToCheck;
41 |     await requestQueue.addRequest(urlData as RequestOptions);
42 |     for (let _ = 0; _ < (repeatChecksOnProvidedUrls ?? 0); _++) {
43 |         await requestQueue.addRequest({
44 |             ...urlData,
45 |             uniqueKey: Math.random().toString(),
46 |         } as RequestOptions);
47 |     }
48 | 
49 |     const state: ActorCheckDetailedOutput = {
50 |         url: urlData.url,
51 |         checkerType: 'puppeteer',
52 |         simplifiedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`,
53 |         detailedOutput: `https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`,
54 |         totalPages: [],
55 |         timedOut: [],
56 |         failedToLoadOther: [],
57 |         accessDenied: [],
58 |         success: [],
59 |         statusCodes: {},
60 |         recaptcha: [],
61 |         distilCaptcha: [],
62 |         hCaptcha: [],
63 |     };
64 | 
65 |     const crawler = new PuppeteerCrawler({
66 |         maxRequestRetries: 0,
67 |         maxRequestsPerCrawl: maxNumberOfPagesCheckedPerDomain,
68 |         maxConcurrency: maxConcurrentPagesCheckedPerDomain,
69 |         requestQueue,
70 |         requestHandler: (pageInputs) => handlePage(input, requestQueue, state, pageInputs),
71 |         failedRequestHandler: (requestInput) => handleFailedRequest(state, requestInput),
72 |         proxyConfiguration: proxy,
73 |         useSessionPool: false,
74 |         launchContext: {
75 |             useChrome,
76 |             launchOptions: {
77 |                 headless: headfull ? undefined : true,
78 |             },
79 |         },
80 |         browserPoolOptions: {
81 |             retireBrowserAfterPageCount: retireBrowserInstanceAfterRequestCount,
82 |         },
83 |     });
84 | 
85 |     await crawler.run();
86 | 
87 |     await Actor.setValue('OUTPUT', convertDetailedOutputToSimplified(state));
88 |     await Actor.setValue('DETAILED-OUTPUT', state);
89 | 
90 |     log.info('Checker finished.');
91 |     log.info(
92 |         `Simplified output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/OUTPUT?disableRedirect=true`,
93 |     );
94 |     log.info(
95 |         `Detailed output: https://api.apify.com/v2/key-value-stores/${env.defaultKeyValueStoreId}/records/DETAILED-OUTPUT?disableRedirect=true`,
96 |     );
97 |     log.info(`Preview dataset: https://api.apify.com/v2/datasets/${env.defaultDatasetId}/items?clean=true&format=html`);
98 | });
99 | 
--------------------------------------------------------------------------------
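One non-obvious detail in the loop above: the request queue deduplicates requests by `uniqueKey`, which is derived from the URL by default, so re-adding the same URL would normally be a no-op. The random `uniqueKey` is what makes `repeatChecksOnProvidedUrls` actually revisit an identical URL. A stripped-down sketch:

```
import { Actor } from 'apify';

const requestQueue = await Actor.openRequestQueue();

// Without a uniqueKey the second addRequest would be deduplicated away;
// the random key forces an extra, independent check of the same URL.
await requestQueue.addRequest({ url: 'https://example.com' });
await requestQueue.addRequest({ url: 'https://example.com', uniqueKey: Math.random().toString() });
```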
/checker-puppeteer/src/typedefs.ts:
--------------------------------------------------------------------------------
 1 | import type { CheerioCrawlingContext } from 'crawlee';
 2 | 
 3 | type KeysNotRequired =
 4 |     | 'checkers.cheerio'
 5 |     | 'checkers.puppeteer'
 6 |     | 'checkers.playwright'
 7 |     | 'playwright.chrome'
 8 |     | 'playwright.firefox'
 9 |     | 'playwright.webkit'
10 |     | 'maxConcurrentDomainsChecked';
11 | 
12 | export type PuppeteerActorInput = Omit<ActorInputData, KeysNotRequired>;
13 | 
14 | export type CheerioCheckerHandlePageInputs = CheerioCrawlingContext;
15 | 
16 | export interface PseudoUrlInputCustom {
17 |     purl: string;
18 |     method?: string;
19 |     payload?: string;
20 |     userData?: Record<string, unknown>;
21 |     headers?: Record<string, string>;
22 | }
23 | 
24 | export interface UrlInput {
25 |     url: string;
26 |     method?: string;
27 |     payload?: string;
28 |     userData?: Record<string, unknown>;
29 |     headers?: Record<string, string>;
30 | }
31 | 
32 | export interface ProxyConfiguration {
33 |     useApifyProxy: boolean;
34 |     apifyProxyGroups?: string[];
35 |     apifyProxyCountry?: string;
36 | }
37 | 
38 | export interface ActorInputData {
39 |     // Crawlers to use
40 |     'checkers.cheerio'?: boolean;
41 |     'checkers.puppeteer'?: boolean;
42 |     'checkers.playwright'?: boolean;
43 | 
44 |     // Pass these to crawlers
45 | 
46 |     // save snapshots
47 |     saveSnapshot?: boolean;
48 | 
49 |     // General options
50 |     urlsToCheck: UrlInput[];
51 |     proxyConfiguration: ProxyConfiguration;
 52 |     linkSelector?: string;
 53 |     pseudoUrls: PseudoUrlInputCustom[];
 54 |     repeatChecksOnProvidedUrls?: number;
 55 |     maxNumberOfPagesCheckedPerDomain: number;
 56 |     maxConcurrentPagesCheckedPerDomain: number;
 57 |     maxConcurrentDomainsChecked: number;
 58 |     retireBrowserInstanceAfterRequestCount: number;
 59 | 
 60 |     // Pass only to puppeteer
 61 |     'puppeteer.headfull'?: boolean;
 62 |     'puppeteer.useChrome'?: boolean;
 63 |     'puppeteer.waitFor'?: string;
 64 | 
 65 |     // Pass only to playwright
 66 |     'playwright.chrome'?: boolean;
 67 |     'playwright.firefox'?: boolean;
 68 |     'playwright.webkit'?: boolean;
 69 |     'playwright.headfull'?: boolean;
 70 |     'playwright.useChrome'?: boolean;
 71 |     'playwright.waitFor'?: string;
 72 | }
 73 | 
 74 | export interface PreparedActorConfig {
 75 |     actorId: string;
 76 |     proxyUsed?: string;
 77 |     url: string;
 78 |     input: ActorInputData;
 79 |     params: {
 80 |         memory: number;
 81 |         timeout: number;
 82 |     };
 83 |     // This data is set when the config is run
 84 |     runId?: string;
 85 | }
 86 | 
 87 | export interface CreateActorRunConfig {
 88 |     checkerId: string;
 89 |     input: ActorInputData;
 90 |     urlData: UrlInput;
 91 |     playwrightBrowser?: 'chrome' | 'firefox' | 'webkit';
 92 | }
 93 | 
 94 | // --- OUTPUT ---
 95 | 
 96 | export interface ActorCheckDetailedOutput {
 97 |     // Set by waitForRunToFinishAndPushData
 98 |     proxyUsed?: string;
 99 |     checkerType: 'cheerio' | 'puppeteer' | 'playwright';
100 |     playwrightBrowser?: 'chrome' | 'firefox' | 'webkit';
101 |     computeUnitsUsedForThisCheck?: number;
102 |     // (totalPages.length / computeUnitsUsedForThisCheck) yields the amount of pages checkable per compute unit
103 |     pagesPerComputeUnit?: number;
104 | 
105 |     // URLs
106 |     url: string;
107 |     simplifiedOutput: string;
108 |     detailedOutput: string;
109 | 
110 |     // Page data
111 |     totalPages: UrlCheckResult[];
112 |     timedOut: UrlCheckResult[];
113 |     failedToLoadOther: UrlCheckResult[];
114 |     accessDenied: UrlCheckResult[];
115 |     success: UrlCheckResult[];
116 | 
117 |     // Status codes
118 |     statusCodes: Record<number, UrlCheckResult[]>;
119 | 
120 |     // Captcha time
121 |     recaptcha: UrlCheckResult[];
122 |     distilCaptcha: UrlCheckResult[];
123 |     hCaptcha: UrlCheckResult[];
124 | }
125 | 
126 | export interface UrlCheckResult {
127 |     url: string;
128 |     screenshotUrl?: string;
129 |     htmlUrl?: string;
130 | }
131 | 
132 | export type ActorCheckSimplifiedOutput = {
133 |     [K in keyof ActorCheckDetailedOutput]:
134 |         ActorCheckDetailedOutput[K] extends Array<UrlCheckResult>
135 |             ? number
136 |             : ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] }
137 |                 ? Record<number, number>
138 |                 : ActorCheckDetailedOutput[K];
139 | };
--------------------------------------------------------------------------------
/checker-puppeteer/src/utils.ts:
--------------------------------------------------------------------------------
 1 | import { Dictionary } from 'crawlee';
 2 | import type { ActorCheckDetailedOutput, ActorCheckSimplifiedOutput } from './typedefs.js';
 3 | 
 4 | export function convertDetailedOutputToSimplified(data: ActorCheckDetailedOutput): ActorCheckSimplifiedOutput {
 5 |     const obj: Dictionary = {};
 6 | 
 7 |     for (const [key, value] of Object.entries(data)) {
 8 |         if (Array.isArray(value)) {
 9 |             obj[key] = value.length;
10 |         } else if (typeof value === 'object') {
11 |             if (!obj[key]) {
12 |                 obj[key] = {};
13 |             }
14 |             const nestedObject: Dictionary = obj[key];
15 | 
16 |             for (const [statusCode, statusValue] of Object.entries(value)) {
17 |                 nestedObject[statusCode] = (statusValue as any).length;
18 |             }
19 |         } else {
20 |             obj[key] = value;
21 |         }
22 |     }
23 | 
24 |     // @ts-expect-error We are merging the objects
25 |     return obj;
26 | }
--------------------------------------------------------------------------------
/checker-puppeteer/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "extends": "@apify/tsconfig",
 3 |     "compilerOptions": {
 4 |         "module": "ES2022",
 5 |         "target": "ES2022",
 6 |         "outDir": "dist",
 7 |         "noUnusedLocals": false,
 8 |         "lib": ["DOM"],
 9 |         "skipLibCheck": true
10 |     },
11 |     "include": [
12 |         "./src/**/*"
13 |     ]
14 | }
--------------------------------------------------------------------------------
/starter/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/apify-projects/store-website-checker/4600159968d7289e023c071ad72c22bc5f3e4570/starter/.DS_Store
--------------------------------------------------------------------------------
/starter/.actor/actor.json:
--------------------------------------------------------------------------------
1 | {
2 |     "actorSpecification": 1,
3 |     "name": "starter",
4 |     "version": "0.0",
5 |     "buildTag": "latest"
6 | }
7 | 
--------------------------------------------------------------------------------
/starter/.eslintrc:
--------------------------------------------------------------------------------
 1 | {
 2 |     "root": true,
 3 |     "env": {
 4 |         "browser": true,
 5 |         "es2020": true,
 6 |         "node": true
 7 |     },
 8 |     "extends": [
 9 |         "@apify/eslint-config-ts"
10 |     ],
11 |     "parserOptions": {
12 |         "project": "./tsconfig.json",
13 |         "ecmaVersion": 2020
14 |     },
15 |     "ignorePatterns": [
16 |         "node_modules",
17 |         "dist",
18 |         "**/*.d.ts"
19 |     ]
20 | }
--------------------------------------------------------------------------------
/starter/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | apify_storage
3 | dist
4 | 
5 | storage
6 | # Added by Apify CLI
7 | .venv
--------------------------------------------------------------------------------
/starter/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Specify the base Docker image. You can read more about
 2 | # the available images at https://crawlee.dev/docs/guides/docker-images
 3 | # You can also use any other image from Docker Hub.
 4 | FROM apify/actor-node:16 AS builder
 5 | 
 6 | # Copy just package.json and package-lock.json
 7 | # to speed up the build using Docker layer cache.
 8 | COPY package*.json ./
 9 | 
10 | # Install all dependencies. Don't audit to speed up the installation.
11 | RUN npm install --include=dev --audit=false
12 | 
13 | # Next, copy the source files using the user set
14 | # in the base image.
15 | COPY . ./
16 | 
17 | # Install all dependencies and build the project.
18 | # Don't audit to speed up the installation.
19 | RUN npm run build
20 | 
21 | # Create final image
22 | FROM apify/actor-node:16
23 | 
24 | # Copy only built JS files from builder image
25 | COPY --from=builder /usr/src/app/dist ./dist
26 | 
27 | # Copy just package.json and package-lock.json
28 | # to speed up the build using Docker layer cache.
29 | COPY package*.json ./
30 | 
31 | # Install NPM packages, skip optional and development dependencies to
32 | # keep the image small. Avoid logging too much and print the dependency
33 | # tree for debugging
34 | RUN npm --quiet set progress=false \
35 |     && npm install --omit=dev --omit=optional \
36 |     && echo "Installed NPM packages:" \
37 |     && (npm list --omit=dev --all || true) \
38 |     && echo "Node.js version:" \
39 |     && node --version \
40 |     && echo "NPM version:" \
41 |     && npm --version \
42 |     && rm -r ~/.npm
43 | 
44 | # Next, copy the remaining files and directories with the source code.
45 | # Since we do this after NPM install, quick build will be really fast
46 | # for most source file changes.
47 | COPY . ./
48 | 
49 | 
50 | # Run the image.
51 | CMD npm run start:prod --silent
--------------------------------------------------------------------------------
/starter/INPUT_SCHEMA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "title": "Web Checker",
 3 |     "description": "The web checker actor loads URLs to check and checks for common captchas and status codes returned from crawling, and calculates the price a user may pay.",
 4 |     "type": "object",
 5 |     "schemaVersion": 1,
 6 |     "properties": {
 7 |         "urlsToCheck": {
 8 |             "title": "URLs to check",
 9 |             "type": "array",
10 |             "description": "A static list of URLs to check for captchas. To be able to add new URLs on the fly, enable the Use request queue option.

For details, see Start URLs in README.", 11 | "sectionCaption": "Checker Options", 12 | "sectionDescription": "Options that will be passed to the checkers", 13 | "editor": "requestListSources", 14 | "prefill": [ 15 | { 16 | "url": "https://www.amazon.com/b?ie=UTF8&node=11392907011" 17 | } 18 | ] 19 | }, 20 | "proxyConfiguration": { 21 | "title": "Proxy Configuration", 22 | "type": "object", 23 | "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.

For details, see Proxy configuration in README.", 24 | "default": {}, 25 | "editor": "proxy", 26 | "prefill": { 27 | "useApifyProxy": true, 28 | "apifyProxyGroups": [ 29 | "SHADER", 30 | "BUYPROXIES94952", 31 | "RESIDENTIAL" 32 | ] 33 | } 34 | }, 35 | "checkers.cheerio": { 36 | "title": "Cheerio", 37 | "type": "boolean", 38 | "description": "Crawl with Cheerio", 39 | "default": true, 40 | "editor": "checkbox", 41 | "groupCaption": "Crawlers to use", 42 | "groupDescription": "Select which crawler types should be used for checking these domains" 43 | }, 44 | "checkers.puppeteer": { 45 | "title": "Puppeteer", 46 | "type": "boolean", 47 | "description": "Crawl with Puppeteer", 48 | "default": true, 49 | "editor": "checkbox" 50 | }, 51 | "checkers.playwright": { 52 | "title": "Playwright", 53 | "type": "boolean", 54 | "description": "Crawl with Playwright", 55 | "editor": "checkbox", 56 | "default": true 57 | }, 58 | "saveSnapshot": { 59 | "title": "Enabled", 60 | "type": "boolean", 61 | "description": "Will save HTML for Cheerio and HTML + screenshot for Puppeteer/Playwright", 62 | "editor": "checkbox", 63 | "groupCaption": "Save Snapshots", 64 | "default": true 65 | }, 66 | "enqueueAllOnDomain": { 67 | "title": "Enqueue any URL on domain (no need for link selector or pseudo URLs)", 68 | "type": "boolean", 69 | "description": "Will enqueue any URLs on the domain", 70 | "default": true 71 | }, 72 | "linkSelector": { 73 | "title": "Link Selector", 74 | "type": "string", 75 | "description": "A CSS selector saying which links on the page (<a> elements with href attribute) shall be followed and added to the request queue. This setting only applies if Use request queue is enabled. To filter the links added to the queue, use the Pseudo-URLs setting.

If Link selector is empty, the page links are ignored.

For details, see Link selector in README.", 76 | "sectionCaption": "Crawler Options", 77 | "sectionDescription": "Specific options that are relevant for crawlers", 78 | "editor": "textfield" 79 | }, 80 | "pseudoUrls": { 81 | "title": "Pseudo-URLs", 82 | "type": "array", 83 | "description": "Specifies what kind of URLs found by Link selector should be added to the request queue. A pseudo-URL is a URL with regular expressions enclosed in [] brackets, e.g. http://www.example.com/[.*]. This setting only applies if the Use request queue option is enabled.

If Pseudo-URLs are omitted, the actor enqueues all links matched by the Link selector.

For details, see Pseudo-URLs in README.", 84 | "default": [], 85 | "editor": "pseudoUrls" 86 | }, 87 | "repeatChecksOnProvidedUrls": { 88 | "title": "Repeat checks on provided URLs", 89 | "type": "integer", 90 | "description": "Will access each URL multiple times. Useful to test the same URL or bypass blocking of the first page.", 91 | "editor": "number", 92 | "prefill": 10 93 | }, 94 | "maxNumberOfPagesCheckedPerDomain": { 95 | "title": "Max number of pages checked per domain", 96 | "type": "integer", 97 | "description": "The maximum number of pages that the checker will load. The checker will stop when this limit is reached. It's always a good idea to set this limit in order to prevent excess platform usage for misconfigured scrapers. Note that the actual number of pages loaded might be slightly higher than this value.

If set to 0, there is no limit.",
 98 |             "prefill": 1000,
 99 |             "editor": "number"
100 |         },
101 |         "maxConcurrentPagesCheckedPerDomain": {
102 |             "title": "Maximum concurrent pages checked per domain",
103 |             "type": "integer",
104 |             "description": "Specifies the maximum number of pages that can be processed by the checker in parallel for one domain. The checker automatically increases and decreases concurrency based on available system resources. This option enables you to set an upper limit, for example to reduce the load on a target website.",
105 |             "default": 500,
106 |             "editor": "number",
107 |             "minimum": 1
108 |         },
109 |         "maxConcurrentDomainsChecked": {
110 |             "title": "Maximum number of concurrent domains checked",
111 |             "type": "integer",
112 |             "description": "Specifies the maximum number of domains that should be checked at a time. This setting is relevant when passing in more than one URL to check.",
113 |             "default": 5,
114 |             "editor": "number",
115 |             "minimum": 1,
116 |             "maximum": 10
117 |         },
118 |         "retireBrowserInstanceAfterRequestCount": {
119 |             "title": "Retire browser instance after request count",
120 |             "type": "integer",
121 |             "description": "How often the browser itself will rotate. Pick a higher number for lower consumption; pick a lower number to rotate (and therefore test) more proxies.",
122 |             "default": 10,
123 |             "editor": "number",
124 |             "minimum": 1
125 |         },
126 |         "navigationTimeoutSecs": {
127 |             "title": "Navigation timeout (seconds)",
128 |             "type": "integer",
129 |             "description": "Specifies the maximum time in seconds the request will wait for the page to load. If the page is not loaded within this time, the browser will throw an error and the page will be marked as failed.",
130 |             "default": 60,
131 |             "minimum": 1
132 |         },
133 |         "puppeteer.headfull": {
134 |             "title": "Headfull browser (XVFB)",
135 |             "type": "boolean",
136 |             "description": "Only works for the Puppeteer type!",
137 |             "sectionCaption": "Puppeteer Options",
138 |             "sectionDescription": "Options that are passed in to puppeteer when checking",
139 |             "editor": "checkbox"
140 |         },
141 |         "puppeteer.useChrome": {
142 |             "title": "Use Chrome",
143 |             "type": "boolean",
144 |             "description": "Only works for the Puppeteer type! Be careful that Chrome is not guaranteed to work with Puppeteer.",
145 |             "editor": "checkbox"
146 |         },
147 |         "puppeteer.waitFor": {
148 |             "title": "Wait for",
149 |             "type": "string",
150 |             "description": "Only works for the Puppeteer type. Will wait on each page. You can provide a number in ms or a CSS selector.",
151 |             "editor": "textfield",
152 |             "default": "2000"
153 |         },
154 |         "puppeteer.memory": {
155 |             "title": "Memory",
156 |             "type": "integer",
157 |             "unit": "MB",
158 |             "default": 4096,
159 |             "minimum": 1024,
160 |             "maximum": 32768,
161 |             "description": "Must be a power of 2 between 128 and 32768."
162 |         },
163 |         "playwright.chrome": {
164 |             "title": "Chrome",
165 |             "type": "boolean",
166 |             "description": "Use Chrome when checking",
167 |             "default": false,
168 |             "sectionCaption": "Playwright options",
169 |             "sectionDescription": "Options passed to playwright when checking",
170 |             "editor": "checkbox",
171 |             "groupCaption": "Browser type",
172 |             "groupDescription": "Which type of browser should the checker use"
173 |         },
174 |         "playwright.firefox": {
175 |             "title": "Firefox",
176 |             "type": "boolean",
177 |             "description": "Use Firefox when checking",
178 |             "editor": "checkbox",
179 |             "default": true
180 |         },
181 |         "playwright.webkit": {
182 |             "title": "Safari (Webkit)",
183 |             "type": "boolean",
184 |             "description": "Use Safari when checking",
185 |             "editor": "checkbox"
186 |         },
187 |         "playwright.useChrome": {
188 |             "title": "Use Chrome instead of Chromium",
189 |             "type": "boolean",
190 |             "description": "Only works for the Playwright type! Be careful that Chrome is not guaranteed to work with Playwright.",
191 |             "editor": "checkbox"
192 |         },
193 |         "playwright.headfull": {
194 |             "title": "Headfull browser (XVFB)",
195 |             "type": "boolean",
196 |             "description": "Whether the browser should run headfull (with XVFB) or not",
197 |             "editor": "checkbox"
198 |         },
199 |         "playwright.waitFor": {
200 |             "title": "Wait for",
201 |             "type": "string",
202 |             "description": "Only works for the Playwright type. Will wait on each page. You can provide a number in ms or a CSS selector.",
203 |             "editor": "textfield",
204 |             "default": "2000"
205 |         },
206 |         "playwright.memory": {
207 |             "title": "Memory",
208 |             "type": "integer",
209 |             "unit": "MB",
210 |             "default": 4096,
211 |             "minimum": 1024,
212 |             "maximum": 32768,
213 |             "description": "Must be a power of 2 between 128 and 32768."
214 |         }
215 |     },
216 |     "required": ["urlsToCheck"]
217 | }
218 | 
--------------------------------------------------------------------------------
/starter/README.md:
--------------------------------------------------------------------------------
 1 | ## Website Checker
 2 | 
 3 | Website checker is a simple actor that allows you to scan any website for performance and blocking, using various scraping methods such as Cheerio, Puppeteer and Playwright.
 4 | 
 5 | ### Features
 6 | 
 7 | The actor provides these useful features out of the box:
 8 | 
 9 | - Collects response status codes
10 | - Recognizes the most common captchas
11 | - Saves HTML snapshots and screenshots (if Puppeteer or Playwright is chosen)
12 | - Enables choosing between Cheerio (plain HTTP) and Puppeteer/Playwright (browser) scrapers
13 | - Enables choosing different browsers for Playwright - Chrome, Firefox and Webkit (Safari)
14 | - Enables re-scraping start URLs or enqueueing with a familiar link selector + pseudo URLs system
15 | - Handles different failure states like timeouts and network errors
16 | - Enables basic proxy and browser configuration
17 | 
18 | ### How to use
19 | 
20 | The most common use-case is to do a quick check on how aggressively the target site is blocking. In that case, just supply a start URL, ideally a category or product page. You can either set `repeatChecksOnProvidedUrls` or add enqueueing with `linkSelector` + `pseudoUrls`; both are good options to test different proxies.
21 | 
22 | You can pick any combination of run options, and the checker will spawn a runner actor for every combination of scraping tool & proxies and then combine the results into a single output.
23 | 
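To make the fan-out concrete, here is a small sketch of the resulting run count. It mirrors the logic in `starter/src/configs.ts` further below (one runner run per URL, per enabled checker, and per proxy group, and additionally per browser for Playwright); the numbers are illustrative:

```
// 2 URLs, Cheerio + Puppeteer enabled, 2 Apify proxy groups configured
// (when no groups are set, a single 'auto' group is used instead):
const urls = 2;
const checkers = 2;      // 'checkers.cheerio' + 'checkers.puppeteer'
const proxyGroups = 2;   // e.g. ['RESIDENTIAL', 'SHADER']
console.log(urls * checkers * proxyGroups); // -> 8 runner runs feeding one combined output
```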
24 | In the end, you will get simple statistics about the blocking rate. It is recommended to check a few screenshots just to make sure the actor correctly recognized the page status. You can get to the detailed output (per URL) via the KV store or dataset (the KV output is sorted by response status, while the dataset simply preserves the scraping order).
25 | 
26 | #### Multiple URLs and configurations
27 | Website checker doesn't have any limitation on how many websites and configs you can check. For each website, it will run each config. You just need to set a reasonable `maxConcurrentDomainsChecked` so that all parallel runs fit into your total memory (4 GB for Cheerio and 8 GB for Puppeteer/Playwright checks).
28 | 
29 | ### Input
30 | 
31 | Please follow the [actor's input page](https://apify.com/lukaskrivka/website-checker/input-schema) for a detailed explanation. Most input fields have reasonable defaults.
32 | 
33 | ### Example output
34 | 
35 | #### Simple output
36 | 
37 | ```
38 | {
39 |     "timeouted": 0,
40 |     "failedToLoadOther": 9,
41 |     "accessDenied": 0,
42 |     "recaptcha": 0,
43 |     "distilCaptcha": 24,
44 |     "hCaptcha": 0,
45 |     "statusCodes": {
46 |         "200": 3,
47 |         "401": 2,
48 |         "403": 5,
49 |         "405": 24
50 |     },
51 |     "success": 3,
52 |     "total": 43
53 | }
54 | ```
55 | 
56 | ### Changelog
57 | 
58 | Check the history of changes in the [CHANGELOG](https://github.com/metalwarrior665/actor-website-checker/blob/master/CHANGELOG.md)
59 | 
--------------------------------------------------------------------------------
/starter/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "crawlee-cheerio-typescript",
 3 |     "version": "0.0.1",
 4 |     "type": "module",
 5 |     "description": "This is a boilerplate of an Apify actor.",
 6 |     "engines": {
 7 |         "node": ">=16.0.0"
 8 |     },
 9 |     "dependencies": {
10 |         "apify": "^3.0.0",
11 |         "crawlee": "^3.0.0"
12 |     },
13 |     "devDependencies": {
14 |         "@apify/eslint-config-ts": "^0.2.3",
15 |         "@apify/tsconfig": "^0.1.0",
16 |         "@typescript-eslint/eslint-plugin": "^5.32.0",
17 |         "@typescript-eslint/parser": "^5.32.0",
18 |         "eslint": "^8.20.0",
19 |         "ts-node": "^10.9.1",
20 |         "typescript": "4.7.4"
21 |     },
22 |     "scripts": {
23 |         "start": "npm run start:dev",
24 |         "start:prod": "node dist/main.js",
25 |         "start:dev": "ts-node-esm -T src/main.ts",
26 |         "build": "tsc",
27 |         "lint": "eslint ./src --ext .ts",
28 |         "lint:fix": "eslint ./src --ext .ts --fix",
29 |         "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
30 |     },
31 |     "author": "It's not you it's me",
32 |     "license": "ISC"
33 | }
--------------------------------------------------------------------------------
/starter/src/configs.ts:
--------------------------------------------------------------------------------
 1 | import { ACTOR_CHEERIO_CHECKER_NAME, ACTOR_PLAYWRIGHT_CHECKER_NAME, ACTOR_PUPPETEER_CHECKER_NAME } from './constants.js';
 2 | import type { PreparedActorConfig, ActorInputData, CreateActorRunConfig } from './typedefs.js';
 3 | 
 4 | export function convertInputToActorConfigs(input: ActorInputData): PreparedActorConfig[] {
 5 |     const configs: PreparedActorConfig[] = [];
 6 | 
 7 |     for (const urlData of input.urlsToCheck) {
 8 |         if (input['checkers.cheerio']) {
 9 |             configs.push(...createActorRunConfigForCrawler({ input, urlData, checkerId: ACTOR_CHEERIO_CHECKER_NAME }));
10 |         }
11 |         if (input['checkers.puppeteer']) {
12 |             configs.push(...createActorRunConfigForCrawler({ input, urlData, checkerId: ACTOR_PUPPETEER_CHECKER_NAME, memory: input['puppeteer.memory'] }));
13 |         }
14 |         if (input['checkers.playwright']) {
15 |             // Create a run
config for each playwright browser 16 | if (input['playwright.chrome']) { 17 | configs.push(...createActorRunConfigForCrawler({ 18 | input, 19 | urlData, 20 | checkerId: ACTOR_PLAYWRIGHT_CHECKER_NAME, 21 | playwrightBrowser: 'chrome', 22 | memory: input['playwright.memory'], 23 | })); 24 | } 25 | if (input['playwright.firefox']) { 26 | configs.push(...createActorRunConfigForCrawler({ 27 | input, 28 | urlData, 29 | checkerId: ACTOR_PLAYWRIGHT_CHECKER_NAME, 30 | playwrightBrowser: 'firefox', 31 | memory: input['playwright.memory'], 32 | })); 33 | } 34 | if (input['playwright.webkit']) { 35 | configs.push(...createActorRunConfigForCrawler({ 36 | input, 37 | urlData, 38 | checkerId: ACTOR_PLAYWRIGHT_CHECKER_NAME, 39 | playwrightBrowser: 'webkit', 40 | memory: input['playwright.memory'], 41 | })); 42 | } 43 | } 44 | } 45 | 46 | return configs; 47 | } 48 | 49 | function* createActorRunConfigForCrawler({ input, urlData, checkerId, playwrightBrowser, memory }: CreateActorRunConfig) { 50 | const proxyGroups = input.proxyConfiguration.apifyProxyGroups?.length 51 | ? input.proxyConfiguration.apifyProxyGroups 52 | : ['auto']; 53 | for (const group of proxyGroups) { 54 | const { url } = urlData; 55 | const config: PreparedActorConfig = { 56 | actorId: checkerId, 57 | proxyUsed: group === 'auto' ? undefined : group, 58 | url, 59 | input: { 60 | saveSnapshot: input.saveSnapshot, 61 | urlsToCheck: [urlData], 62 | proxyConfiguration: { 63 | useApifyProxy: input.proxyConfiguration.useApifyProxy, 64 | apifyProxyCountry: input.proxyConfiguration.apifyProxyCountry, 65 | apifyProxyGroups: group === 'auto' ? undefined : [group], 66 | }, 67 | linkSelector: input.enqueueAllOnDomain ? 'a[href]' : input.linkSelector, 68 | pseudoUrls: input.enqueueAllOnDomain 69 | ? [{ purl: `${new URL(url).origin}[.*]` }] 70 | : input.pseudoUrls, 71 | repeatChecksOnProvidedUrls: input.repeatChecksOnProvidedUrls, 72 | maxNumberOfPagesCheckedPerDomain: input.maxNumberOfPagesCheckedPerDomain, 73 | maxConcurrentPagesCheckedPerDomain: input.maxConcurrentPagesCheckedPerDomain, 74 | maxConcurrentDomainsChecked: input.maxConcurrentDomainsChecked, 75 | retireBrowserInstanceAfterRequestCount: input.retireBrowserInstanceAfterRequestCount, 76 | navigationTimeoutSecs: input.navigationTimeoutSecs, 77 | }, 78 | params: { 79 | memory: memory || (checkerId === ACTOR_CHEERIO_CHECKER_NAME ? 
4096 : 8192),
 80 |                 timeout: 24 * 3600,
 81 |             },
 82 |         };
 83 | 
 84 |         if (checkerId === ACTOR_PUPPETEER_CHECKER_NAME) {
 85 |             config.input['puppeteer.headfull'] = input['puppeteer.headfull'];
 86 |             config.input['puppeteer.useChrome'] = input['puppeteer.useChrome'];
 87 |             config.input['puppeteer.waitFor'] = input['puppeteer.waitFor'];
 88 |         } else if (checkerId === ACTOR_PLAYWRIGHT_CHECKER_NAME && playwrightBrowser) {
 89 |             config.input[`playwright.${playwrightBrowser}`] = input[`playwright.${playwrightBrowser}`];
 90 |             config.input['playwright.headfull'] = input[`playwright.headfull`];
 91 |             config.input['playwright.useChrome'] = input['playwright.useChrome'];
 92 |             config.input['playwright.waitFor'] = input['playwright.waitFor'];
 93 |         }
 94 | 
 95 |         yield config;
 96 |     }
 97 | }
 98 | 
--------------------------------------------------------------------------------
/starter/src/constants.ts:
--------------------------------------------------------------------------------
 1 | export const ACTOR_CHEERIO_CHECKER_NAME = 'lukaskrivka/website-checker-cheerio';
 2 | 
 3 | export const ACTOR_PUPPETEER_CHECKER_NAME = 'lukaskrivka/website-checker-puppeteer';
 4 | 
 5 | export const ACTOR_PLAYWRIGHT_CHECKER_NAME = 'lukaskrivka/website-checker-playwright';
 6 | 
 7 | export const DEFAULT_COSTS = {
 8 |     COMPUTE_UNIT: 0.25,
 9 |     RESIDENTIAL_GB: 12.5,
10 | };
11 | 
12 | export const TABLE_FIELDS_ORDER = [
13 |     'url',
14 |     'checkerType',
15 |     'proxyUsed',
16 |     'totalPages',
17 |     'success',
18 |     'successRate',
19 |     'estimatedCostPerRequest',
20 |     'computeUnitsPerRequest',
21 |     'residentialGBsPerRequest',
22 |     'runUrl',
23 | ];
--------------------------------------------------------------------------------
/starter/src/main.ts:
--------------------------------------------------------------------------------
 1 | import { Actor } from 'apify';
 2 | import { log, RequestList, BasicCrawler } from 'crawlee';
 3 | 
 4 | import { inspect } from 'util';
 5 | import { convertInputToActorConfigs } from './configs.js';
 6 | import { waitForRunToFinishAndPushData, startRun } from './startRunAndPool.js';
 7 | 
 8 | import type { ActorInputData, FrontendActorState, PreparedActorConfig } from './typedefs.js';
 9 | import { TABLE_FIELDS_ORDER } from './constants.js';
10 | 
11 | const env = Actor.getEnv();
12 | 
13 | Actor.main(async () => {
14 |     const input = await Actor.getInput() as ActorInputData;
15 | 
16 |     // Log the input
17 |     log.info('Input provided:');
18 |     log.debug(inspect(input, false, 4));
19 | 
20 |     // TODO: Add utilization of all user memory instead of having to rely
21 |     // on maxConcurrentDomainsChecked
22 |     const { maxConcurrentDomainsChecked, urlsToCheck } = input;
23 | 
24 | 
25 |     const state: FrontendActorState = await Actor.getValue('STATE') ??
{ 26 | runConfigurations: [], 27 | totalUrls: urlsToCheck.length, 28 | checkerFinished: false, 29 | }; 30 | 31 | Actor.on('persistState', async () => { 32 | await Actor.setValue('STATE', state); 33 | }); 34 | 35 | // If we haven't initialized the state yet, do it now 36 | if (state.runConfigurations.length === 0 && !state.checkerFinished) { 37 | state.runConfigurations = convertInputToActorConfigs(input); 38 | } 39 | 40 | // Sort state based on started runs 41 | state.runConfigurations = state.runConfigurations.sort((_, b) => Number(Boolean(b.runId))); 42 | await Actor.setValue('STATE', state); 43 | 44 | log.info(`Preparing to process ${state.totalUrls} URLs...\n`); 45 | 46 | const sources = state.runConfigurations.map((actorInput, index) => ({ 47 | url: 'https://localhost', 48 | uniqueKey: index.toString(), 49 | userData: { actorInput }, 50 | })); 51 | 52 | 53 | 54 | const requestList = await RequestList.open(null, sources); 55 | 56 | const runner = new BasicCrawler({ 57 | maxConcurrency: maxConcurrentDomainsChecked, 58 | requestList, 59 | requestHandler: async ({ request }) => { 60 | const { userData } = request; 61 | const actorInput = (userData.actorInput) as PreparedActorConfig; 62 | 63 | if (actorInput.runId) { 64 | log.info(`Found run ${actorInput.runId} with actor ${actorInput.actorId} for URL "${actorInput.url}" - waiting for it to finish.`); 65 | log.info(`You can monitor the status of the run by going to https://console.apify.com/actors/runs/${actorInput.runId}`); 66 | } else { 67 | const result = await startRun(actorInput); 68 | log.info( 69 | `Starting run for "${actorInput.url}" with actor ${actorInput.actorId} and ${ 70 | actorInput.input.proxyConfiguration.useApifyProxy ? `proxy ${actorInput.proxyUsed ?? 'auto'}` : 'no proxy' 71 | }.`, 72 | ); 73 | log.info(`You can monitor the status of the run by going to https://console.apify.com/actors/runs/${result.id}`); 74 | actorInput.runId = result.id; 75 | } 76 | 77 | // Wait for the run to finish 78 | await waitForRunToFinishAndPushData(actorInput); 79 | }, 80 | requestHandlerTimeoutSecs: 999_999, 81 | }); 82 | 83 | // Run the checker 84 | await runner.run(); 85 | 86 | // Save the state as done, to prevent resurrection doing requests it doesn't have to do 87 | state.runConfigurations = []; 88 | state.checkerFinished = true; 89 | await Actor.setValue('STATE', state); 90 | 91 | log.info(`\nChecking ${state.totalUrls} URLs completed!`); 92 | log.info(`NICER TABLE VIEW:\nhttps://api.apify.com/v2/datasets/${Actor.getEnv().defaultDatasetId}/items?clean=true&format=html` 93 | + `&fields=${TABLE_FIELDS_ORDER.join(',')}`); 94 | }); 95 | -------------------------------------------------------------------------------- /starter/src/startRunAndPool.ts: -------------------------------------------------------------------------------- 1 | import { Actor, ActorRun } from 'apify'; 2 | 3 | import { DEFAULT_COSTS } from './constants.js'; 4 | import type { PreparedActorConfig, ActorCheckSimplifiedOutput, FixedActorRun } from './typedefs.js'; 5 | 6 | export async function startRun(run: PreparedActorConfig) { 7 | const client = Actor.newClient(); 8 | const result = await client.actor(run.actorId).start(run.input, run.params); 9 | 10 | return result; 11 | } 12 | 13 | export async function waitForRunToFinishAndPushData(runConfig: PreparedActorConfig) { 14 | const client = Actor.newClient(); 15 | const run = client.run(runConfig.runId!); 16 | 17 | const finishedRun = await run.waitForFinish() as FixedActorRun; 18 | const { 19 | ACTOR_COMPUTE_UNITS: 
computeUnits,
20 |         PROXY_RESIDENTIAL_TRANSFER_GBYTES: residentialGBs,
21 |     } = finishedRun.usage;
22 | 
23 |     const value = (await run.keyValueStore().getRecord('OUTPUT'))!.value as ActorCheckSimplifiedOutput;
24 | 
25 |     value.computeUnitsUsedForThisCheck = Number(computeUnits.toFixed(4));
26 |     value.pagesPerComputeUnit = Number((value.totalPages / computeUnits).toFixed(2));
27 |     value.computeUnitsPerRequest = Number((computeUnits / value.totalPages).toFixed(6));
28 |     // 8 decimals gives all the precision we need (level of 10 Bytes)
29 |     value.residentialGBs = Number(residentialGBs.toFixed(8));
30 |     value.residentialGBsPerRequest = Number((residentialGBs / value.totalPages).toFixed(8));
31 |     value.proxyUsed = runConfig.proxyUsed;
32 |     value.estimatedCost = Number((computeUnits * DEFAULT_COSTS.COMPUTE_UNIT + residentialGBs * DEFAULT_COSTS.RESIDENTIAL_GB).toFixed(4));
33 |     value.estimatedCostPerRequest = Number((value.estimatedCost / value.totalPages).toFixed(6));
34 | 
35 |     if (runConfig.input['playwright.chrome']) {
36 |         value.playwrightBrowser = 'chrome';
37 |     } else if (runConfig.input['playwright.firefox']) {
38 |         value.playwrightBrowser = 'firefox';
39 |     } else if (runConfig.input['playwright.webkit']) {
40 |         value.playwrightBrowser = 'webkit';
41 |     }
42 | 
43 |     value.successRate = Number(((value.success / value.totalPages) * 100).toFixed(2));
44 |     value.runUrl = `https://console.apify.com/actors/runs/${runConfig.runId}`;
45 | 
46 |     await Actor.pushData(value);
47 | }
48 | 
--------------------------------------------------------------------------------
/starter/src/typedefs.ts:
--------------------------------------------------------------------------------
 1 | import { ActorRun } from 'apify';
 2 | 
 3 | export interface FrontendActorState {
 4 |     totalUrls: number;
 5 |     runConfigurations: PreparedActorConfig[];
 6 |     checkerFinished: boolean;
 7 | }
 8 | 
 9 | export interface PseudoUrlInput {
10 |     purl: string;
11 |     method?: string;
12 |     payload?: string;
13 |     userData?: Record<string, unknown>;
14 |     headers?: Record<string, string>;
15 | }
16 | 
17 | export interface UrlInput {
18 |     url: string;
19 |     method?: string;
20 |     payload?: string;
21 |     userData?: Record<string, unknown>;
22 |     headers?: Record<string, string>;
23 | }
24 | 
25 | export interface ProxyConfiguration {
26 |     useApifyProxy: boolean;
27 |     apifyProxyGroups?: string[];
28 |     apifyProxyCountry?: string;
29 | }
30 | 
31 | export interface ActorInputData {
32 |     // Crawlers to use
33 |     'checkers.cheerio'?: boolean;
34 |     'checkers.puppeteer'?: boolean;
35 |     'checkers.playwright'?: boolean;
36 | 
37 |     // Pass these to crawlers
38 | 
39 |     // save snapshots
40 |     saveSnapshot?: boolean;
41 | 
42 |     // General options
43 |     urlsToCheck: UrlInput[];
44 |     proxyConfiguration: ProxyConfiguration;
45 |     enqueueAllOnDomain?: boolean;
46 |     linkSelector?: string;
47 |     pseudoUrls: PseudoUrlInput[];
48 |     repeatChecksOnProvidedUrls?: number;
49 |     maxNumberOfPagesCheckedPerDomain: number;
50 |     maxConcurrentPagesCheckedPerDomain: number;
51 |     maxConcurrentDomainsChecked: number;
52 |     retireBrowserInstanceAfterRequestCount: number;
53 |     navigationTimeoutSecs: number;
54 | 
55 |     // Pass only to puppeteer
56 |     'puppeteer.headfull'?: boolean;
57 |     'puppeteer.useChrome'?: boolean;
58 |     'puppeteer.waitFor'?: string;
59 |     'puppeteer.memory'?: number;
60 | 
61 |     // Pass only to playwright
62 |     'playwright.chrome'?: boolean;
63 |     'playwright.firefox'?: boolean;
64 |     'playwright.webkit'?: boolean;
65 |     'playwright.headfull'?: boolean;
66 |     'playwright.useChrome'?: boolean;
67 |     'playwright.waitFor'?: string;
68 |     'playwright.memory'?: number;
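     // Note: keys such as 'playwright.waitFor' are literal property names that
     // contain a dot (they mirror the INPUT_SCHEMA.json field names), so they
     // are always read with bracket notation, e.g. input['playwright.waitFor'].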
 69 | }
 70 | 
 71 | export interface PreparedActorConfig {
 72 |     actorId: string;
 73 |     proxyUsed?: string;
 74 |     url: string;
 75 |     input: ActorInputData;
 76 |     params: {
 77 |         memory: number;
 78 |         timeout: number;
 79 |     };
 80 |     // This data is set when the config is run
 81 |     runId?: string;
 82 | }
 83 | 
 84 | export interface CreateActorRunConfig {
 85 |     checkerId: string;
 86 |     input: ActorInputData;
 87 |     urlData: UrlInput;
 88 |     playwrightBrowser?: 'chrome' | 'firefox' | 'webkit';
 89 |     memory?: number;
 90 | }
 91 | 
 92 | export interface UrlCheckResult {
 93 |     url: string;
 94 |     screenshotUrl?: string;
 95 |     htmlUrl?: string;
 96 | }
 97 | 
 98 | export interface ActorCheckDetailedOutput {
 99 |     // Set by waitForRunToFinishAndPushData
100 |     proxyUsed?: string;
101 |     checkerType: 'cheerio' | 'puppeteer' | 'playwright';
102 |     playwrightBrowser?: 'chrome' | 'firefox' | 'webkit';
103 |     computeUnitsUsedForThisCheck?: number;
104 |     // (totalPages.length / computeUnitsUsedForThisCheck) yields the amount of pages checkable per compute unit
105 |     pagesPerComputeUnit: number;
106 |     computeUnitsPerRequest: number;
107 |     residentialGBs: number;
108 |     residentialGBsPerRequest: number;
109 |     estimatedCost: number;
110 |     estimatedCostPerRequest: number;
111 | 
112 |     // URLs
113 |     url: string;
114 |     simplifiedOutput: string;
115 |     detailedOutput: string;
116 |     runUrl: string;
117 | 
118 |     successRate?: number;
119 | 
120 |     // Page data
121 |     totalPages: UrlCheckResult[];
122 |     timedOut: UrlCheckResult[];
123 |     failedToLoadOther: UrlCheckResult[];
124 |     accessDenied: UrlCheckResult[];
125 |     success: UrlCheckResult[];
126 | 
127 |     // Status codes
128 |     statusCodes: Record<number, UrlCheckResult[]>;
129 | 
130 |     // Captcha time
131 |     recaptcha: UrlCheckResult[];
132 |     distilCaptcha: UrlCheckResult[];
133 |     hCaptcha: UrlCheckResult[];
134 | }
135 | 
136 | export type ActorCheckSimplifiedOutput = {
137 |     [K in keyof ActorCheckDetailedOutput]:
138 |         ActorCheckDetailedOutput[K] extends Array<UrlCheckResult>
139 |             ? number
140 |             : ActorCheckDetailedOutput[K] extends { [key: number]: UrlCheckResult[] }
141 |                 ? Record<number, number>
142 |                 : ActorCheckDetailedOutput[K];
143 | };
144 | 
145 | export interface FixedActorRun extends ActorRun {
146 |     usage: {
147 |         ACTOR_COMPUTE_UNITS: number;
148 |         DATASET_READS: number;
149 |         DATASET_WRITES: number;
150 |         KEY_VALUE_STORE_READS: number;
151 |         KEY_VALUE_STORE_WRITES: number;
152 |         KEY_VALUE_STORE_LISTS: number;
153 |         REQUEST_QUEUE_READS: number;
154 |         REQUEST_QUEUE_WRITES: number;
155 |         DATA_TRANSFER_INTERNAL_GBYTES: number;
156 |         DATA_TRANSFER_EXTERNAL_GBYTES: number;
157 |         PROXY_RESIDENTIAL_TRANSFER_GBYTES: number;
158 |         PROXY_SERPS: number;
159 |     };
160 | }
161 | 
--------------------------------------------------------------------------------
/starter/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "extends": "@apify/tsconfig",
 3 |     "compilerOptions": {
 4 |         "module": "ES2022",
 5 |         "target": "ES2022",
 6 |         "outDir": "dist",
 7 |         "noUnusedLocals": false,
 8 |         "lib": ["DOM"],
 9 |         "skipLibCheck": true
10 |     },
11 |     "include": [
12 |         "./src/**/*"
13 |     ]
14 | }
--------------------------------------------------------------------------------
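As a closing note on the cost fields computed in `starter/src/startRunAndPool.ts` above: they follow directly from `DEFAULT_COSTS`. A worked example with made-up usage figures (not real measurements):

```
import { DEFAULT_COSTS } from './constants.js';

// Illustrative usage figures for one runner run:
const computeUnits = 0.08;     // ACTOR_COMPUTE_UNITS from the finished run
const residentialGBs = 0.002;  // PROXY_RESIDENTIAL_TRANSFER_GBYTES
const totalPages = 40;

const estimatedCost = computeUnits * DEFAULT_COSTS.COMPUTE_UNIT
    + residentialGBs * DEFAULT_COSTS.RESIDENTIAL_GB;        // 0.02 + 0.025 = 0.045 USD
const estimatedCostPerRequest = estimatedCost / totalPages; // 0.001125 USD per checked page
const pagesPerComputeUnit = totalPages / computeUnits;      // 500 pages per compute unit
```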