├── .actor ├── Dockerfile ├── actor.json └── input_schema.json ├── .dockerignore ├── .editorconfig ├── .eslintrc ├── .gitignore ├── README.md ├── package-lock.json ├── package.json ├── src ├── const.ts ├── crawlers.ts ├── errors.ts ├── extract_rules_utils.ts ├── instructions_utils.ts ├── main.ts ├── params.ts ├── responses.ts ├── router.ts ├── types.ts └── utils.ts └── tsconfig.json /.actor/Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://crawlee.dev/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node-playwright-chrome:20 AS builder 5 | 6 | # Copy just package.json and package-lock.json 7 | # to speed up the build using Docker layer cache. 8 | COPY --chown=myuser package*.json ./ 9 | 10 | # Install all dependencies. Don't audit to speed up the installation. 11 | RUN npm install --include=dev --audit=false 12 | 13 | # Next, copy the source files using the user set 14 | # in the base image. 15 | COPY --chown=myuser . ./ 16 | 17 | # Install all dependencies and build the project. 18 | # Don't audit to speed up the installation. 19 | RUN npm run build 20 | 21 | # Create final image 22 | FROM apify/actor-node-playwright-chrome:20 23 | 24 | # Copy just package.json and package-lock.json 25 | # to speed up the build using Docker layer cache. 26 | COPY --chown=myuser package*.json ./ 27 | 28 | # Install NPM packages, skip optional and development dependencies to 29 | # keep the image small. Avoid logging too much and print the dependency 30 | # tree for debugging 31 | RUN npm --quiet set progress=false \ 32 | && npm install --omit=dev --omit=optional \ 33 | && echo "Installed NPM packages:" \ 34 | && (npm list --omit=dev --all || true) \ 35 | && echo "Node.js version:" \ 36 | && node --version \ 37 | && echo "NPM version:" \ 38 | && npm --version \ 39 | && rm -r ~/.npm 40 | 41 | # Copy built JS files from builder image 42 | COPY --from=builder --chown=myuser /home/myuser/dist ./dist 43 | 44 | # Next, copy the remaining files and directories with the source code. 45 | # Since we do this after NPM install, quick build will be really fast 46 | # for most source file changes. 47 | COPY --chown=myuser . ./ 48 | 49 | 50 | # Run the image. If you know you won't need headful browsers, 51 | # you can remove the XVFB start script for a micro perf gain. 52 | CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent 53 | -------------------------------------------------------------------------------- /.actor/actor.json: -------------------------------------------------------------------------------- 1 | { 2 | "actorSpecification": 1, 3 | "name": "standby-crawler", 4 | "title": "Project Playwright Crawler Typescript", 5 | "description": "Crawlee and Playwright project in typescript.", 6 | "version": "0.0", 7 | "meta": { 8 | "templateId": "ts-crawlee-playwright-chrome" 9 | }, 10 | "input": "./input_schema.json", 11 | "dockerfile": "./Dockerfile" 12 | } 13 | -------------------------------------------------------------------------------- /.actor/input_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "PlaywrightCrawler Template", 3 | "type": "object", 4 | "schemaVersion": 1, 5 | "description": "Super Scraper API currently cannot be run manually via Input. 
Use Standby endpoint with available parameters.", 6 | "properties": { 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # configurations 2 | .idea 3 | 4 | # crawlee and apify storage folders 5 | apify_storage 6 | crawlee_storage 7 | storage 8 | 9 | # installed files 10 | node_modules 11 | 12 | # git folder 13 | .git 14 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | end_of_line = lf 10 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "env": { 4 | "browser": true, 5 | "es2020": true, 6 | "node": true 7 | }, 8 | "rules": { 9 | "no-underscore-dangle": "off" 10 | }, 11 | "extends": [ 12 | "@apify/eslint-config-ts" 13 | ], 14 | "parserOptions": { 15 | "project": "./tsconfig.json", 16 | "ecmaVersion": 2020 17 | }, 18 | "ignorePatterns": [ 19 | "node_modules", 20 | "dist", 21 | "**/*.d.ts" 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # This file tells Git which files shouldn't be added to source control 2 | 3 | .DS_Store 4 | .idea 5 | dist 6 | node_modules 7 | apify_storage 8 | storage 9 | 10 | # Added by Apify CLI 11 | .venv 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SuperScraper API 2 | 3 | SuperScraper API is an Actor that provides a REST API for scraping websites. 4 | Just pass the URL of a web page and get back the fully rendered HTML content. 5 | SuperScraper API is compatible with [ScrapingBee](https://www.scrapingbee.com/), 6 | [ScrapingAnt](https://scrapingant.com/), 7 | and [ScraperAPI](https://scraperapi.com/) interfaces. 8 | 9 | Main features: 10 | - Extract HTML from arbitrary URLs with a headless browser for dynamic content rendering. 11 | - Circumvent blocking using datacenter or residential proxies, as well as browser fingerprinting. 12 | - Seamlessly scale to a large number of web pages as needed. 13 | - Capture screenshots of the web pages. 14 | 15 | Note that SuperScraper API uses the new experimental Actor Standby mode, so it's not started the traditional way from Apify Console. 16 | Instead, it's invoked via the HTTP REST API provided directly by the Actor. See the examples below. 17 | 18 | ## Usage examples 19 | 20 | To run these examples, you need an Apify API token, 21 | which you can find under [Settings > Integrations](https://console.apify.com/account/integrations) in Apify Console. 22 | 23 | You can create an Apify account free of charge. 
24 | 25 | ### Node.js 26 | 27 | ```ts 28 | import axios from 'axios'; 29 | 30 | const resp = await axios.get('https://super-scraper-api.apify.actor/', { 31 | params: { 32 | url: 'https://apify.com/store', 33 | wait_for: '.ActorStoreItem-title', 34 | json_response: true, 35 | screenshot: true, 36 | }, 37 | headers: { 38 | Authorization: 'Bearer ', 39 | }, 40 | }); 41 | 42 | console.log(resp.data); 43 | ``` 44 | 45 | ### curl 46 | 47 | ```shell 48 | curl -X GET \ 49 | 'https://super-scraper-api.apify.actor/?url=https://apify.com/store&wait_for=.ActorStoreItem-title&screenshot=true&json_response=true' \ 50 | --header 'Authorization: Bearer ' 51 | ``` 52 | 53 | ## Authentication 54 | 55 | The best way to authenticate is to pass your Apify API token using the `Authorization` HTTP header. 56 | Alternatively, you can pass the API token via the `token` query parameter to authenticate the requests, which is more convenient for testing in a web browser. 57 | 58 | ### Node.js 59 | 60 | ```ts 61 | const resp = await axios.get('https://super-scraper-api.apify.actor/', { 62 | params: { 63 | url: 'https://apify.com/store', 64 | token: '' 65 | }, 66 | }); 67 | ``` 68 | 69 | 70 | ### curl 71 | 72 | ```shell 73 | curl -X GET 'https://super-scraper-api.apify.actor/?url=https://apify.com/store&wait_for=.ActorStoreItem-title&json_response=true&token=' 74 | ``` 75 | 76 | ## Pricing 77 | 78 | When using SuperScraper API, you're charged based on your actual usage of the Apify platform's computing, storage, and networking resources. 79 | 80 | Cost depends on the target sites, your settings and API parameters, the load of your requests, and random network and target site conditions. 81 | 82 | The best way to see your price is to conduct a real-world test. 83 | 84 | An example cost on a free account (the pricing is cheaper on higher plans) for 30 one-by-one requests plus 50 batched requests test: 85 | 86 | | parameters | cost estimate 87 | | ------------- |-----------------------------------| 88 | | no `render_js` + basic proxy | $1/1000 requests 89 | | no `render_js` + premium (residential) proxy | $2/1000 requests 90 | | `render_js` + basic proxy | $4/1000 requests 91 | | `render_js` + premium (residential) proxy | $5/1000 requests 92 | 93 | ## API parameters 94 | 95 | ### ScrapingBee API parameters 96 | 97 | SuperScraper API supports most of the API parameters of [ScrapingBee](https://www.scrapingbee.com/documentation/): 98 | 99 | | parameter | description | 100 | | -------- |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 101 | | `url` | URL of the webpage to be scraped. **This parameter is required.** | 102 | | `json_response` | Return a verbose JSON response with additional details about the webpage. Can be either `true` or `false`, default is `false`. | 103 | | `extract_rules` | A stringified JSON containing custom rules how to extract data from the webpage. | 104 | | `render_js` | Indicates that the webpage should be scraped using a headless browser, with dynamic content rendered. Can be `true` or `false`, default is `true`. This is equivalent to ScrapingAnt's `browser`. | 105 | | `screenshot` | Get screenshot of the browser's current viewport. If `json_response` is set to `true`, screenshot will be returned in the Base64 encoding. 
Can be `true` or `false`, default is `false`. | 106 | | `screenshot_full_page` | Get screenshot of the full page. If `json_response` is set to `true`, screenshot will be returned in the Base64 encoding. Can be `true` or `false`, default is `false`. | 107 | | `screenshot_selector` | Get screenshot of the element specified by the selector. If `json_response` is set to `true`, screenshot will be returned in Base64. Must be a non-empty string. | 108 | | `js_scenario` | JavaScript instructions that will be executed after loading the webpage. | 109 | | `wait` | Specify a duration that the browser will wait after loading the page, in milliseconds. | 110 | | `wait_for` | Specify a CSS selector of an element for which the browser will wait after loading the page. | 111 | | `wait_browser` | Specify a browser event to wait for. Can be either `load`, `domcontentloaded`, or `networkidle`. | 112 | | `block_resources` | Specify that you want to block images and CSS. Can be `true` or `false`, default is `true`. | 113 | | `window_width` | Specify the width of the browser's viewport, in pixels. | 114 | | `window_height` | Specify the height of the browser's viewport, in pixels. | 115 | | `cookies` | Custom cookies to use to fetch the web pages. This is useful for fetching webpages behind a login. The cookies must be specified in a string format: `cookie_name_1=cookie_value1;cookie_name_2=cookie_value_2`. | 116 | | `own_proxy` | A custom proxy to be used for scraping, in the format `:@:`. | 117 | | `premium_proxy` | Use residential proxies to fetch the web content, in order to reduce the probability of being blocked. Can be either `true` or `false`, default is `false`. | 118 | | `stealth_proxy` | Works the same as `premium_proxy`. | 119 | | `country_code` | Use IP addresses that are geolocated in the specified country by specifying its [2-letter ISO code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements). When using code other than `US`, `premium_proxy` must be set to `true`. This is equivalent to setting ScrapingAnt's `proxy_country`. | 120 | | `custom_google` | Use this option if you want to scrape Google-related websites (such as Google Search or Google Shopping). Can be `true` or `false`, default is `false`. | 121 | | `return_page_source` | Return HTML of the webpage from the response before any dynamic JavaScript rendering. Can be `true` or `false`, default is `false`. | 122 | | `transparent_status_code` | By default, if the target webpage responds with an HTTP status code other than 200-299 or 404, the API will return an HTTP status code 500. Set this parameter to `true` to disable this behavior and return the status code of the actual response. | 123 | | `timeout` | Set the maximum timeout for the response from this Actor, in milliseconds. The default is 140,000 ms. | 124 | | `forward_headers` | If set to `true`, HTTP headers starting with the prefix `Spb-` or `Ant-` will be forwarded to the target webpage alongside headers generated by us (the prefix will be trimmed). | 125 | | `forward_headers_pure` | If set to `true`, only headers starting with the prefix `Spb-` or `Ant-` will be forwarded to the target webpage (the prefix will be trimmed), without any other HTTP headers from our side. | 126 | | `device` | Can be either `desktop` (default) or `mobile`. | 127 | 128 | ScrapingBee's API parameters `block_ads` and `session_id` are currently not supported.
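For instance, here is a minimal sketch in the style of the Node.js example above that combines several of these ScrapingBee-compatible parameters; the target URL, the chosen parameter values, and the `<YOUR_APIFY_API_TOKEN>` placeholder are illustrative only:

```ts
import axios from 'axios';

// Illustrative combination of the parameters documented above.
const resp = await axios.get('https://super-scraper-api.apify.actor/', {
    params: {
        url: 'https://apify.com/store',        // example target URL
        wait_for: '.ActorStoreItem-title',     // wait for this CSS selector before capturing the page
        screenshot_full_page: true,            // capture the full page
        json_response: true,                   // return a verbose JSON response
        premium_proxy: true,                   // residential proxies; required for non-US country codes
        country_code: 'de',                    // geolocate the IP addresses in Germany
    },
    headers: {
        Authorization: 'Bearer <YOUR_APIFY_API_TOKEN>', // placeholder, use your own token
    },
});

console.log(resp.data);
```

Because `json_response` is set to `true`, the screenshot is returned as a Base64 string inside the JSON body rather than as a binary `image/png` response.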
129 | 130 | ### ScrapingAnt API parameters 131 | 132 | SuperScraper API supports most of the API parameters of [ScrapingAnt](https://docs.scrapingant.com/request-response-format#available-parameters): 133 | 134 | | parameter | description | 135 | | -------- |-----------------------------------| 136 | | `url` | URL of the webpage to be scraped. **This parameter is required.** | 137 | | `browser` | Indicates that the webpage should be scraped using a headless browser, with dynamic content rendered. Can be `true` or `false`, default is `true`. This is equivalent to ScrapingBee's `render_js`. | 138 | | `cookies` | Use custom cookies, must be in a string format: `cookie_name_1=cookie_value1;cookie_name_2=cookie_value_2`. | 139 | | `js_snippet` | A Base64-encoded JavaScript code to be executed on the webpage. Will be treated as the [evaluate](#evaluate) instruction. | 140 | | `proxy_type` | Specify the type of proxies, which can be either `datacenter` (default) or `residential`. This is equivalent to setting ScrapingBee's `premium_proxy` or `stealth_proxy` to `true`. | 141 | | `wait_for_selector` | Specify a CSS selector of an element for which the browser will wait after loading the page. This is equivalent to setting ScrapingBee's `wait_for`. | 142 | | `block_resource` | Specify one or more resource types you want to block from being downloaded. The parameter can be repeated in the URL (e.g. `block_resource=image&block_resource=media`). Available options are: `document`, `stylesheet`, `image`, `media`, `font`, `script`, `texttrack`, `xhr`, `fetch`, `eventsource`, `websocket`, `manifest`, `other`. | 143 | | `return_page_source` | Return HTML of the webpage from the response before any dynamic JavaScript rendering. Can be `true` or `false`, default is `false`. | 144 | | `proxy_country` | Use IP addresses that are geolocated in the specified country by specifying its [2-letter ISO code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements). When using code other than `US`, `premium_proxy` must be set to `true`. This is equivalent to setting ScrapingBee's `country_code`. | 145 | 146 | ScrapingAnt's API parameter `x-api-key` is not supported. 147 | 148 | Note that HTTP headers in a request to this Actor beginning with the prefix `Ant-` will be forwarded (without the prefix) to the target webpage alongside headers generated by the Actor. 149 | This behavior can be changed using ScrapingBee's `forward_headers` or `forward_headers_pure` parameters. 150 | 151 | 152 | ### ScraperAPI API parameters 153 | 154 | SuperScraper API supports most of the API parameters of [ScraperAPI](https://docs.scraperapi.com/making-requests/customizing-requests): 155 | 156 | | parameter | description | 157 | | -------- |-----------------------------------| 158 | | `url` | URL of the webpage to be scraped.
**This parameter is required.** | 159 | | `render` | Specify whether the webpage should be scraped using a headless browser. Can be `true` or `false`, default is `true`. (Same as `render_js`.) | 160 | | `wait_for_selector` | Specify a CSS selector of an element for which the browser will wait after loading the page. This is equivalent to setting ScrapingBee's `wait_for`. | 161 | | `premium` | Use residential proxies to fetch the web content, in order to reduce the probability of being blocked. Can be either `true` or `false`, default is `false`. This is equivalent to setting ScrapingBee's `premium_proxy`. | 162 | | `ultra_premium` | Same as `premium`. | 163 | | `country_code` | Use IP addresses that are geolocated in the specified country by specifying its [2-letter ISO code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements). When using code other than `US`, `premium_proxy` must be set to `true`. This is equivalent to setting ScrapingAnt's `proxy_country`. | 164 | | `keep_headers` | If `true`, then all headers sent to this Actor will be forwarded to the target website. The `Authorization` header will be removed. | 165 | | `device_type` | Can be either `desktop` (default) or `mobile`. This is equivalent to setting ScrapingBee's `device`. | 166 | | `binary_target` | Specify whether the target is a file. Can be `true` or `false`, default is `false`. Currently only supported when JS rendering is set to `false` via the `render_js`, `browser`, or `render` parameters. | 167 | 168 | ScraperAPI's API parameters `session_number` and `autoparse` are currently not supported and are ignored. 169 | 170 | 171 | ### Custom extraction rules 172 | 173 | Using ScrapingBee's `extract_rules` parameter, you can specify a set of rules to extract specific data from the target web pages. You can create an extraction rule in one of two ways: with shortened options, or with full options. 174 | 175 | #### Shortened options 176 | 177 | - the value for the given key serves as a `selector` 178 | - using `@`, we can access an attribute of the selected element 179 | 180 | ##### Example: 181 | 182 | ```json 183 | { 184 | "title": "h1", 185 | "link": "a@href" 186 | } 187 | ``` 188 | 189 | #### Full options 190 | 191 | - `selector` is required 192 | - `type` can be either `item` (default) or `list` 193 | - `output` indicates what the result for these element(s) will look like. It can be: 194 | - `text` (default option when `output` is omitted) - text of the element 195 | - `html` - HTML of the element 196 | - attribute name (starts with `@`, for example `@href`) 197 | - object with other extract rules for the given item (key + shortened or full options) 198 | - `table_json` or `table_array` to scrape a table in a JSON or array format 199 | - `clean` - relevant when `output` is `text`; specifies whether the text of the element should be trimmed of whitespace (can be `true` or `false`, default `true`) 200 | 201 | ##### Example: 202 | 203 | ```json 204 | { 205 | "custom key for links": { 206 | "selector": "a", 207 | "type": "list", 208 | "output": { 209 | "linkName": { 210 | "selector": "a", 211 | "clean": false 212 | }, 213 | "href": { 214 | "selector": "a", 215 | "output": "@href" 216 | } 217 | } 218 | 219 | } 220 | } 221 | ``` 222 | 223 | #### Example 224 | 225 | This example extracts all links from [Apify Blog](https://blog.apify.com/) along with their titles.
226 | 227 | ```ts 228 | const extractRules = { 229 | title: 'h1', 230 | allLinks: { 231 | selector: 'a', 232 | type: 'list', 233 | output: { 234 | title: 'a', 235 | link: 'a@href', 236 | }, 237 | }, 238 | }; 239 | 240 | const resp = await axios.get('https://super-scraper-api.apify.actor/', { 241 | params: { 242 | url: 'https://blog.apify.com/', 243 | extract_rules: JSON.stringify(extractRules), 244 | // verbose: true, 245 | }, 246 | headers: { 247 | Authorization: 'Bearer ', 248 | }, 249 | }); 250 | 251 | console.log(resp.data); 252 | ``` 253 | 254 | The results look like this: 255 | 256 | ```json 257 | { 258 | "title": "Apify Blog", 259 | "allLinks": [ 260 | { 261 | "title": "Data for generative AI & LLM", 262 | "link": "https://apify.com/data-for-generative-ai" 263 | }, 264 | { 265 | "title": "Product matching AI", 266 | "link": "https://apify.com/product-matching-ai" 267 | }, 268 | { 269 | "title": "Universal web scrapers", 270 | "link": "https://apify.com/store/scrapers/universal-web-scrapers" 271 | } 272 | ] 273 | } 274 | ``` 275 | 276 | ### Custom JavaScript code 277 | 278 | Use ScrapingBee's `js_scenario` parameter to specify instructions in order to be executed one by one after opening the page. 279 | 280 | Set `json_response` to `true` to get a full report of the executed instructions, the results of `evaluate` instructions will be added to the `evaluate_results` field. 281 | 282 | Example of clicking a button: 283 | 284 | ```ts 285 | const instructions = { 286 | instructions: [ 287 | { click: '#button' }, 288 | ], 289 | }; 290 | 291 | const resp = await axios.get('https://super-scraper-api.apify.actor/', { 292 | params: { 293 | url: 'https://www.example.com', 294 | js_scenario: JSON.stringify(instructions), 295 | }, 296 | headers: { 297 | Authorization: 'Bearer ', 298 | }, 299 | }); 300 | 301 | console.log(resp.data); 302 | ``` 303 | 304 | #### Strict mode 305 | 306 | If one instruction fails, then the subsequent instructions will not be executed. 
To disable this behavior, you can optionally set `strict` to `false` (by default it's `true`): 307 | 308 | ```json 309 | { 310 | "instructions": [ 311 | { "click": "#button1" }, 312 | { "click": "#button2" } 313 | ], 314 | "strict": false 315 | } 316 | ``` 317 | 318 | #### Supported instructions 319 | 320 | ##### `wait` 321 | 322 | - wait for some time specified in ms 323 | - example: `{"wait": 10000}` 324 | 325 | ##### `wait_for` 326 | 327 | - wait for an element specified by the selector 328 | - example `{"wait_for": "#element"}` 329 | 330 | ##### `click` 331 | 332 | - click on an element specified by the selector 333 | - example `{"click": "#button"}` 334 | 335 | ##### `wait_for_and_click` 336 | - combination of previous two 337 | - example `{"wait_for_and_click": "#button"}` 338 | 339 | ##### `scroll_x` and `scroll_y` 340 | 341 | - scroll a specified number of pixels horizontally or vertically 342 | - example `{"scroll_y": 1000}` or `{"scroll_x": 1000}` 343 | 344 | ##### `fill` 345 | 346 | - specify a selector of the input element and the value you want to fill 347 | - example `{"fill": ["input_1", "value_1"]}` 348 | 349 | ##### `evaluate` 350 | 351 | - evaluate custom javascript on the webpage 352 | - text/number/object results will be saved in the `evaluate_results` field 353 | - example `{"evaluate":"document.querySelectorAll('a').length"}` -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "standby-crawler", 3 | "version": "0.0.1", 4 | "type": "module", 5 | "description": "This is an example of an Apify actor.", 6 | "engines": { 7 | "node": ">=18.0.0" 8 | }, 9 | "dependencies": { 10 | "@crawlee/memory-storage": "^3.8.2", 11 | "apify": "^3.1.10", 12 | "cheerio": "^1.0.0-rc.12", 13 | "crawlee": "^3.9.1", 14 | "header-generator": "^2.1.50", 15 | "playwright": "*", 16 | "uuid": "^9.0.1" 17 | }, 18 | "devDependencies": { 19 | "@apify/eslint-config-ts": "^0.3.0", 20 | "@apify/tsconfig": "^0.1.0", 21 | "@types/uuid": "^9.0.8", 22 | "@typescript-eslint/eslint-plugin": "^6.7.2", 23 | "@typescript-eslint/parser": "^6.7.2", 24 | "eslint": "^8.50.0", 25 | "tsx": "^4.6.2", 26 | "typescript": "^5.3.3" 27 | }, 28 | "scripts": { 29 | "start": "npm run start:dev", 30 | "start:prod": "node dist/main.js", 31 | "start:dev": "tsx src/main.ts", 32 | "build": "tsc", 33 | "lint": "eslint ./src --ext .ts", 34 | "lint:fix": "eslint ./src --ext .ts --fix", 35 | "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1", 36 | "postinstall": "npx crawlee install-playwright-browsers" 37 | }, 38 | "author": "It's not you it's me", 39 | "license": "ISC" 40 | } 41 | -------------------------------------------------------------------------------- /src/const.ts: -------------------------------------------------------------------------------- 1 | export enum Label { 2 | BROWSER = 'browser', 3 | HTTP = 'http', 4 | BINARY_TARGET = 'binary-target', 5 | } 6 | 7 | export const VALID_RESOURCES = [ 8 | 'document', 9 | 'stylesheet', 10 | 'image', 11 | 'media', 12 | 'font', 13 | 'script', 14 | 'texttrack', 15 | 'xhr', 16 | 'fetch', 17 | 'eventsource', 18 | 'websocket', 19 | 'manifest', 20 | 'other', 21 | ]; 22 | -------------------------------------------------------------------------------- /src/crawlers.ts: -------------------------------------------------------------------------------- 1 | import { Actor, RequestQueue, log } from 'apify'; 2 | import { 
PlaywrightCrawler } from 'crawlee'; 3 | import type { PlaywrightCrawlingContext, RequestOptions, AutoscaledPoolOptions } from 'crawlee'; 4 | import { MemoryStorage } from '@crawlee/memory-storage'; 5 | import { ServerResponse } from 'http'; 6 | import { TimeMeasure, UserData, VerboseResult, CrawlerOptions } from './types.js'; 7 | import { addResponse, sendErrorResponseById } from './responses.js'; 8 | import { router } from './router.js'; 9 | import { pushLogData } from './utils.js'; 10 | import { Label } from './const.js'; 11 | 12 | const crawlers = new Map(); 13 | 14 | export const DEFAULT_CRAWLER_OPTIONS: CrawlerOptions = { 15 | proxyConfigurationOptions: {}, 16 | }; 17 | 18 | export const createAndStartCrawler = async (crawlerOptions: CrawlerOptions = DEFAULT_CRAWLER_OPTIONS) => { 19 | const client = new MemoryStorage(); 20 | const queue = await RequestQueue.open(undefined, { storageClient: client }); 21 | 22 | const proxyConfig = await Actor.createProxyConfiguration(crawlerOptions.proxyConfigurationOptions); 23 | 24 | const crawler = new PlaywrightCrawler({ 25 | keepAlive: true, 26 | proxyConfiguration: proxyConfig, 27 | maxRequestRetries: 4, 28 | requestQueue: queue, 29 | launchContext: { 30 | browserPerProxy: false, 31 | }, 32 | statisticsOptions: { 33 | persistenceOptions: { 34 | enable: false, 35 | }, 36 | }, 37 | requestHandlerTimeoutSecs: 3600, 38 | sessionPoolOptions: { 39 | persistenceOptions: { 40 | enable: false, 41 | }, 42 | }, 43 | errorHandler: async ({ request }, err) => { 44 | const { requestDetails, timeMeasures, transparentStatusCode } = request.userData as UserData; 45 | timeMeasures.push({ 46 | event: 'error', 47 | time: Date.now(), 48 | }); 49 | 50 | requestDetails.requestErrors.push({ 51 | attempt: request.retryCount + 1, 52 | errorMessage: err.message, 53 | }); 54 | 55 | if (transparentStatusCode) { 56 | request.noRetry = true; 57 | } 58 | }, 59 | failedRequestHandler: async ({ request, response, page }, err) => { 60 | const { 61 | requestDetails, 62 | jsonResponse, 63 | inputtedUrl, 64 | parsedInputtedParams, 65 | timeMeasures, 66 | transparentStatusCode, 67 | nonbrowserRequestStatus, 68 | } = request.userData as UserData; 69 | 70 | requestDetails.requestErrors.push({ 71 | attempt: request.retryCount + 1, 72 | errorMessage: err.message, 73 | }); 74 | 75 | const errorResponse = { 76 | errorMessage: err.message, 77 | }; 78 | 79 | const responseStatusCode = request.skipNavigation ? nonbrowserRequestStatus! 
: (response?.status() || null); 80 | let statusCode = 500; 81 | if (transparentStatusCode && responseStatusCode) { 82 | statusCode = responseStatusCode; 83 | } 84 | if (jsonResponse) { 85 | const verboseResponse: VerboseResult = { 86 | body: errorResponse, 87 | cookies: await page.context().cookies(request.url) || [], 88 | evaluateResults: [], 89 | jsScenarioReport: {}, 90 | headers: requestDetails.responseHeaders || {}, 91 | type: 'json', 92 | iframes: [], 93 | xhr: [], 94 | initialStatusCode: responseStatusCode, 95 | resolvedUrl: '', 96 | screenshot: null, 97 | }; 98 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors }, true); 99 | sendErrorResponseById(request.uniqueKey, JSON.stringify(verboseResponse), statusCode); 100 | } else { 101 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: errorResponse, errors: requestDetails.requestErrors }, true); 102 | sendErrorResponseById(request.uniqueKey, JSON.stringify(errorResponse), statusCode); 103 | } 104 | }, 105 | preNavigationHooks: [ 106 | async ({ request, page, blockRequests }) => { 107 | const { timeMeasures, blockResources, width, height, blockResourceTypes, jsonResponse, requestDetails } = request.userData as UserData; 108 | timeMeasures.push({ 109 | event: 'pre-navigation hook', 110 | time: Date.now(), 111 | }); 112 | 113 | await page.setViewportSize({ width, height }); 114 | 115 | if (request.label === Label.BROWSER && blockResources) { 116 | await blockRequests({ 117 | extraUrlPatterns: ['*.svg'], 118 | }); 119 | } 120 | 121 | if (request.label === Label.BROWSER && blockResourceTypes.length) { 122 | await page.route('**', async (route) => { 123 | if (blockResourceTypes.includes(route.request().resourceType())) { 124 | await route.abort(); 125 | } 126 | }); 127 | } 128 | 129 | if (request.label === Label.BROWSER && jsonResponse) { 130 | page.on('response', async (resp) => { 131 | try { 132 | const req = resp.request(); 133 | if (req.resourceType() !== 'xhr') { 134 | return; 135 | } 136 | 137 | requestDetails.xhr.push({ 138 | url: req.url(), 139 | statusCode: resp.status(), 140 | method: req.method(), 141 | requestHeaders: req.headers(), 142 | headers: resp.headers(), 143 | body: (await resp.body()).toString(), 144 | }); 145 | } catch (e) { 146 | log.warning((e as Error).message); 147 | } 148 | }); 149 | } 150 | }, 151 | ], 152 | requestHandler: router, 153 | }); 154 | 155 | // TODO: This is just for Crawlee perf measurement, remove it once we properly understand the bottlenecks 156 | // @ts-expect-error Overriding internal method 157 | const origRunTaskFunction = crawler.autoscaledPoolOptions.runTaskFunction.bind(crawler); 158 | // @ts-expect-error Overriding internal method 159 | crawler.autoscaledPoolOptions.runTaskFunction = async function () { 160 | // This code runs before we pull request from queue so we have to approximate that by having mutable global 161 | // It will ofc be wrong if someone bombs requests with interval shorter than 1 sec 162 | (global as unknown as { latestRequestTaskTimeMeasure: TimeMeasure }).latestRequestTaskTimeMeasure = { 163 | event: 'crawlee internal run task', 164 | time: Date.now(), 165 | }; 166 | await (origRunTaskFunction as AutoscaledPoolOptions['runTaskFunction'])!(); 167 | }; 168 | 169 | // @ts-expect-error Overriding internal method 170 | const origRunRequestHandler = crawler._runRequestHandler.bind(crawler); 171 | // @ts-expect-error Overriding internal method 172 | 
crawler._runRequestHandler = async function (context: PlaywrightCrawlingContext) { 173 | context.request.userData.timeMeasures.push({ 174 | event: 'crawlee internal request handler', 175 | time: Date.now(), 176 | }); 177 | await origRunRequestHandler(context); 178 | }; 179 | 180 | await crawler.stats.stopCapturing(); 181 | crawler.run().then(() => log.warning(`Crawler ended`, crawlerOptions), () => { }); 182 | crawlers.set(JSON.stringify(crawlerOptions), crawler); 183 | log.info('Crawler ready 🫡', crawlerOptions); 184 | return crawler; 185 | }; 186 | 187 | export const addRequest = async (request: RequestOptions, res: ServerResponse, crawlerOptions: CrawlerOptions) => { 188 | const key = JSON.stringify(crawlerOptions); 189 | const crawler = crawlers.has(key) ? crawlers.get(key)! : await createAndStartCrawler(crawlerOptions); 190 | 191 | addResponse(request.uniqueKey!, res); 192 | 193 | request.userData?.timeMeasures.push({ 194 | event: 'before queue add', 195 | time: Date.now(), 196 | }); 197 | await crawler.requestQueue!.addRequest(request); 198 | }; 199 | -------------------------------------------------------------------------------- /src/errors.ts: -------------------------------------------------------------------------------- 1 | export class UserInputError extends Error { 2 | constructor(message: string) { 3 | super(message); 4 | this.name = 'UserInputError'; 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /src/extract_rules_utils.ts: -------------------------------------------------------------------------------- 1 | import { AnyNode, Cheerio, CheerioAPI, load } from 'cheerio'; 2 | import { ExtractRule, ExtractRules } from './types.js'; 3 | import { UserInputError } from './errors.js'; 4 | 5 | // validation and transformation to full Extract Rules (i.e. 
including all parameters, not the shortened version, for easier scraping process) 6 | 7 | function validateAndTransformFullOptionsRule(key: string, inputtedExtractRule: Record): ExtractRule { 8 | const { selector, output = 'text', type = 'item', clean = true } = inputtedExtractRule; 9 | 10 | if (!selector || typeof selector !== 'string' || !selector.length) { 11 | throw new UserInputError(`Selector must be a non-empty string, rule for key: ${key}`); 12 | } 13 | 14 | if (typeof type !== 'string' || (type !== 'item' && type !== 'list')) { 15 | throw new UserInputError(`Type can be either 'item' or 'list', rule for a key: ${key}`); 16 | } 17 | 18 | if (typeof clean !== 'boolean') { 19 | throw new UserInputError('Clean can be set either to true or false'); 20 | } 21 | 22 | if (typeof output === 'string') { 23 | const availableTypes = ['text', 'html', 'table_json', 'table_array']; 24 | const trimmed = (output as string).trim(); 25 | if (availableTypes.includes(trimmed) || trimmed.startsWith('@')) { 26 | return { 27 | selector, 28 | type, 29 | output: trimmed, 30 | clean, 31 | }; 32 | } 33 | 34 | throw new UserInputError( 35 | `Result in the extract rule for ${key} has invalid value, expected one of ${JSON.stringify(availableTypes)} or an attribute name starting with '@'`, 36 | ); 37 | } 38 | 39 | if (typeof output === 'object') { 40 | const nestedRules = validateAndTransformExtractRules(output as Record); 41 | return { 42 | selector, 43 | type, 44 | output: nestedRules, 45 | clean, 46 | }; 47 | } 48 | 49 | throw new UserInputError(`Output in the extract rule for ${key} in a wrong format, expected object or a string`); 50 | } 51 | 52 | function validateAndTransformShortenedRule(key: string, inputtedRule: string): ExtractRule { 53 | const trimmedRule = inputtedRule.trim(); 54 | 55 | if (trimmedRule.includes('@')) { 56 | const selector = trimmedRule.split('@').shift() as string; 57 | if (!selector.length) { 58 | throw new UserInputError(`Selector cannot be an empty string, rule: ${trimmedRule} for key ${key}`); 59 | } 60 | 61 | const attributeName = trimmedRule.slice(selector.length); 62 | if (!attributeName.length) { 63 | throw new UserInputError(`Attribute name cannot be an empty string, rule: ${trimmedRule} for key ${key}`); 64 | } 65 | 66 | return { 67 | selector, 68 | type: 'item', 69 | output: attributeName, 70 | clean: true, 71 | }; 72 | } 73 | 74 | return { 75 | selector: trimmedRule, 76 | type: 'item', 77 | output: 'text', 78 | clean: true, 79 | }; 80 | } 81 | 82 | export function validateAndTransformExtractRules(inputtedExtractRules: Record): ExtractRules { 83 | const extractRules: ExtractRules = {}; 84 | 85 | for (const entry of Object.entries(inputtedExtractRules)) { 86 | const key = entry[0]; 87 | const keyValue = entry[1]; 88 | if (typeof keyValue === 'object') { 89 | extractRules[key] = validateAndTransformFullOptionsRule(key, keyValue as Record); 90 | } else if (typeof keyValue === 'string') { 91 | extractRules[key] = validateAndTransformShortenedRule(key, keyValue); 92 | } else { 93 | throw new UserInputError(`Extract rule for ${key} in a wrong format, expected object or a string`); 94 | } 95 | } 96 | 97 | return extractRules; 98 | } 99 | 100 | // scraping based on full Extract Rules 101 | 102 | function scrapeTable(item: Cheerio) { 103 | const $ = load(item.html() || ''); 104 | const headings: string[] = []; 105 | item.find('tr').has('th').eq(0).find('th') 106 | .each((_, el) => { 107 | headings.push($(el).text().trim()); 108 | }); 109 | if (!headings.length) { 110 | return []; 
111 | } 112 | 113 | const data: Record[] = []; 114 | item.find('tr').has('td').each((_, el) => { 115 | const rowData: Record = {}; 116 | const tdElements = $(el).find('td'); 117 | for (let i = 0; i < headings.length; i++) { 118 | const val = tdElements.eq(i).text().trim(); 119 | rowData[headings[i]] = val; 120 | } 121 | data.push(rowData); 122 | }); 123 | return data; 124 | } 125 | 126 | function scrapeItems(item: Cheerio, output: string | ExtractRules, clean: boolean) { 127 | if (output === 'text') { 128 | if (clean) { 129 | return item.text().trim() || null; 130 | } 131 | return item.text() || ''; 132 | } 133 | 134 | if (output === 'html') { 135 | // we do this so the HTML od the whole element returns, not just its inner HTML 136 | const $ = load(''); 137 | const newHtmlWithItem = $('body').append(item); 138 | return newHtmlWithItem.html() || ''; 139 | } 140 | 141 | if (output === 'table_json' || output === 'table_array') { 142 | const data = scrapeTable(item); 143 | if (output === 'table_json') { 144 | return data; 145 | } 146 | return data.map((row) => Object.values(row)); 147 | } 148 | 149 | if (typeof output === 'string' && output.startsWith('@')) { 150 | return item.attr(output.slice(1)) || ''; 151 | } 152 | 153 | if (typeof output === 'object') { 154 | /* 155 | This is here to have an option to work with already selected element(s). Scraping bee 156 | does it like this, we could replace it with something like '.' to refer the element itself. 157 | Example why this is needed: 158 | { 159 | allLinks: { 160 | type: 'list', 161 | selector: 'a', <--- selects all 'a' elements 162 | result: { 163 | linkTitle: 'a', <--- refers to each 'a' element that were selected before (in the level above) 164 | link: 'a@href' <--- refers to each 'a' element that were selected before (in the level above) 165 | } 166 | } 167 | } 168 | */ 169 | const $ = load(''); 170 | const newHtmlWithItem = $('body').append(item); 171 | return scrapeExtractRules(newHtmlWithItem, output); 172 | } 173 | throw new UserInputError('Invalid output value'); 174 | } 175 | 176 | function scrapeExtractRules($: Cheerio, extractRules: ExtractRules) { 177 | const scrapedData: Record = {}; 178 | 179 | for (const entries of Object.entries(extractRules)) { 180 | const key = entries[0]; 181 | const rule = entries[1]; 182 | 183 | const { selector, type, output, clean } = rule; 184 | 185 | const itemsFoundBySelector = $.find(selector); 186 | if (type === 'item') { 187 | scrapedData[key] = scrapeItems(itemsFoundBySelector.eq(0), output, clean); 188 | } else { 189 | const resultList: unknown[] = []; 190 | itemsFoundBySelector.each((i) => { 191 | resultList.push(scrapeItems(itemsFoundBySelector.eq(i), output, clean)); 192 | }).get(); 193 | scrapedData[key] = resultList; 194 | } 195 | } 196 | return scrapedData; 197 | } 198 | 199 | export function scrapeBasedOnExtractRules($: CheerioAPI, extractRules: ExtractRules) { 200 | const html = $('html'); 201 | return scrapeExtractRules(html, extractRules); 202 | } 203 | -------------------------------------------------------------------------------- /src/instructions_utils.ts: -------------------------------------------------------------------------------- 1 | import { Page } from 'playwright'; 2 | import { sleep } from 'crawlee'; 3 | import { Action, FullJsScenarioReport, IndividualInstructionReport, Instruction, JsScenario } from './types.js'; 4 | import { UserInputError } from './errors.js'; 5 | 6 | export const parseAndValidateInstructions = (rawInput: string): JsScenario => { 7 | const input = 
JSON.parse(rawInput); 8 | 9 | let strictMode = true; 10 | if (input.strict !== undefined) { 11 | if (typeof input.strict !== 'boolean') { 12 | throw new UserInputError('Parameter strict in js_scenario can be only true or false'); 13 | } 14 | strictMode = input.strict; 15 | } 16 | 17 | if (!input.instructions || !Array.isArray(input.instructions)) { 18 | return { 19 | strict: strictMode, 20 | instructions: [], 21 | }; 22 | } 23 | 24 | const instructions = input.instructions as Record[]; 25 | const parsedInstructions: Instruction[] = []; 26 | for (const instruction of instructions) { 27 | if (typeof instruction !== 'object') { 28 | throw new UserInputError('Instruction must be an object'); 29 | } 30 | if (Object.keys(instruction).length !== 1) { 31 | throw new UserInputError('Instruction must include only one action with params'); 32 | } 33 | const action = Object.keys(instruction)[0]; 34 | const param = instruction[action]; 35 | 36 | const possibleActions = ['wait', 'wait_for', 'click', 'scroll_x', 'scroll_y', 'fill', 'evaluate', 'wait_for_and_click']; // todo 37 | if (typeof action !== 'string' || !possibleActions.includes(action)) { 38 | throw new UserInputError(`Unsupported instruction: ${action}`); 39 | } 40 | 41 | if (typeof param !== 'string' && typeof param !== 'number' && !Array.isArray(param)) { 42 | throw new UserInputError(`Unsupported params: ${action}, can be either number, string, or an array of strings`); 43 | } 44 | 45 | if (action === 'wait_for_and_click') { 46 | parsedInstructions.push({ action: 'wait_for', param }); 47 | parsedInstructions.push({ action: 'click', param }); 48 | continue; 49 | } 50 | 51 | parsedInstructions.push({ action: action as Action, param }); 52 | } 53 | 54 | return { 55 | instructions: parsedInstructions, 56 | strict: strictMode, 57 | }; 58 | }; 59 | 60 | const performInstruction = async (instruction: Instruction, page: Page): Promise<{ success: boolean, errorMessage?: string | undefined; result?: string; }> => { 61 | try { 62 | let result; 63 | switch (instruction.action) { 64 | case 'wait': { 65 | await sleep(instruction.param as number); 66 | break; 67 | } 68 | case 'click': { 69 | await page.click(instruction.param as string, { timeout: 5000 }); 70 | break; 71 | } 72 | case 'wait_for': { 73 | await page.waitForSelector(instruction.param as string); 74 | break; 75 | } 76 | case 'fill': { 77 | const params = instruction.param as string[]; 78 | await page.fill(params[0], params[1]); 79 | break; 80 | } 81 | case 'scroll_x': { 82 | const paramX = instruction.param as number; 83 | await page.mouse.wheel(paramX, 0); 84 | break; 85 | } 86 | case 'scroll_y': { 87 | const paramY = instruction.param as number; 88 | await page.mouse.wheel(0, paramY); 89 | break; 90 | } 91 | case 'wait_browser': { 92 | await page.waitForLoadState(instruction.param as 'load' | 'domcontentloaded' | 'networkidle'); 93 | break; 94 | } 95 | case 'evaluate': { 96 | const evaluateResult = await page.evaluate(instruction.param as string); 97 | if (['boolean', 'number', 'string'].includes(typeof evaluateResult)) { 98 | result = String(evaluateResult); 99 | } else if (typeof evaluateResult === 'object') { 100 | result = JSON.stringify(evaluateResult); 101 | } 102 | break; 103 | } 104 | default: { 105 | return { success: false, errorMessage: 'unknown instruction' }; 106 | } 107 | } 108 | return { success: true, result }; 109 | } catch (e) { 110 | return { success: false, errorMessage: (e as Error).message }; 111 | } 112 | }; 113 | 114 | export const 
performInstructionsAndGenerateReport = async (jsScenario: JsScenario, page: Page): Promise => { 115 | const { strict, instructions } = jsScenario; 116 | 117 | let executed: number = 0; 118 | let success: number = 0; 119 | let failed: number = 0; 120 | const reports: IndividualInstructionReport[] = []; 121 | const evaluateResults: string[] = []; 122 | const start = Date.now(); 123 | 124 | for (const instruction of instructions) { 125 | const instructionStart = Date.now(); 126 | const instructionResult = await performInstruction(instruction, page); 127 | const instructionDuration = (Date.now() - instructionStart) / 1000; 128 | 129 | executed += 1; 130 | if (instructionResult.success) { 131 | success += 1; 132 | if (instruction.action === 'evaluate' && instructionResult.result) { 133 | evaluateResults.push(instructionResult.result); 134 | } 135 | } else { 136 | failed += 1; 137 | } 138 | 139 | reports.push({ 140 | task: instruction.action, 141 | params: instruction.param, 142 | duration: instructionDuration, 143 | success: instructionResult.success, 144 | }); 145 | 146 | if (strict && !instructionResult.success) { 147 | break; 148 | } 149 | } 150 | const totalDuration = (Date.now() - start) / 1000; 151 | return { 152 | jsScenarioReport: { 153 | totalDuration, 154 | taskExecuted: executed, 155 | taskSuccess: success, 156 | taskFailure: failed, 157 | tasks: reports, 158 | }, 159 | evaluateResults, 160 | }; 161 | }; 162 | -------------------------------------------------------------------------------- /src/main.ts: -------------------------------------------------------------------------------- 1 | import { Actor, log } from 'apify'; 2 | import { createServer } from 'http'; 3 | import { CrawlerOptions } from './types.js'; 4 | import { addRequest, createAndStartCrawler, DEFAULT_CRAWLER_OPTIONS } from './crawlers.js'; 5 | import { addTimeoutToAllResponses, sendErrorResponseById } from './responses.js'; 6 | import { ScrapingBee } from './params.js'; 7 | import { createProxyOptions, createRequestForCrawler, parseParameters } from './utils.js'; 8 | import { UserInputError } from './errors.js'; 9 | 10 | await Actor.init(); 11 | 12 | if (Actor.isAtHome() && Actor.getEnv().metaOrigin !== 'STANDBY') { 13 | await Actor.fail('The Actor must start by being called using its Standby endpoint.'); 14 | } 15 | 16 | Actor.on('migrating', () => { 17 | addTimeoutToAllResponses(60); 18 | }); 19 | 20 | const server = createServer(async (req, res) => { 21 | const requestReceivedTime = Date.now(); 22 | if (req.method !== 'HEAD') { 23 | log.info(`Request received: ${req.method} ${req.url}`); 24 | } 25 | try { 26 | const params = parseParameters(req.url!); 27 | const crawlerRequest = createRequestForCrawler(params, req); 28 | crawlerRequest.userData?.timeMeasures.push({ 29 | event: 'request received', 30 | time: requestReceivedTime, 31 | }); 32 | 33 | let timeout = 140000; 34 | if (params[ScrapingBee.timeout]) { 35 | const timeoutNumber = Number.parseInt(params[ScrapingBee.timeout] as string, 10); 36 | if (Number.isNaN(timeoutNumber)) { 37 | throw new UserInputError('Parameter timeout must be a number'); 38 | } 39 | if (timeoutNumber < 1000 || timeoutNumber > 3600000) { 40 | throw new UserInputError('Parameter timeout must be between 1000 and 3600000 ms (1 hour)'); 41 | } 42 | timeout = timeoutNumber; 43 | } 44 | 45 | setTimeout(() => { 46 | const timeoutErrorMessage = { 47 | errorMessage: `Response timed out.`, 48 | }; 49 | sendErrorResponseById(crawlerRequest.uniqueKey!, JSON.stringify(timeoutErrorMessage)); 50 | }, 
timeout); 51 | 52 | const crawlerOptions: CrawlerOptions = { 53 | proxyConfigurationOptions: createProxyOptions(params), 54 | }; 55 | await addRequest(crawlerRequest, res, crawlerOptions); 56 | } catch (e) { 57 | const error = e as Error; 58 | const errorMessage = { 59 | errorMessage: error.message, 60 | }; 61 | const statusCode = error instanceof UserInputError ? 400 : 500; 62 | res.writeHead(statusCode, { 'Content-Type': 'application/json' }); 63 | res.end(JSON.stringify(errorMessage)); 64 | } 65 | }); 66 | 67 | const port = Actor.isAtHome() ? process.env.ACTOR_STANDBY_PORT : 8080; 68 | server.listen(port, async () => { 69 | log.info('SuperScraper is listening for user requests'); 70 | 71 | // Pre-create common crawlers because crawler init can take about 1 sec 72 | await Promise.all([ 73 | createAndStartCrawler(DEFAULT_CRAWLER_OPTIONS), 74 | createAndStartCrawler({ ...DEFAULT_CRAWLER_OPTIONS, proxyConfigurationOptions: { groups: ['RESIDENTIAL'] } }), 75 | ]); 76 | }); 77 | -------------------------------------------------------------------------------- /src/params.ts: -------------------------------------------------------------------------------- 1 | export enum ScrapingBee { 2 | // skipped for now: session_id, block_ads 3 | url = 'url', 4 | extractRules = 'extract_rules', 5 | device = 'device', 6 | jsScenario = 'js_scenario', 7 | renderJs = 'render_js', 8 | wait = 'wait', 9 | waitFor = 'wait_for', 10 | waitBrowser = 'wait_browser', 11 | screenshot = 'screenshot', 12 | screenshotFullPage = 'screenshot_full_page', 13 | screenshotSelector = 'screenshot_selector', 14 | windowWidth = 'window_width', 15 | windowHeight = 'window_height', 16 | returnPageSource = 'return_page_source', 17 | transparentStatusCode = 'transparent_status_code', 18 | forwardHeaders = 'forward_headers', 19 | forwardHeadersPure = 'forward_headers_pure', 20 | cookies = 'cookies', 21 | timeout = 'timeout', 22 | customGoogle = 'custom_google', 23 | ownProxy = 'own_proxy', 24 | premiumProxy = 'premium_proxy', 25 | stealthProxy = 'stealth_proxy', 26 | countryCode = 'country_code', 27 | jsonResponse = 'json_response', 28 | blockResources = 'block_resources' 29 | } 30 | 31 | export enum ScrapingAnt { 32 | // we already have: url, return_page_source, cookies 33 | browser = 'browser', 34 | jsSnippet = 'js_snippet', 35 | proxyType = 'proxy_type', 36 | waitForSelector = 'wait_for_selector', 37 | blockResource = 'block_resource', 38 | proxyCountry = 'proxy_country', 39 | } 40 | 41 | export enum ScraperApi { 42 | // we already have: wait_for_selector, country_code 43 | // skipped for now: session_number, autoparse 44 | render = 'render', 45 | premium = 'premium', 46 | binaryTarget = 'binary_target', 47 | keepHeaders = 'keep_headers', 48 | deviceType = 'device_type', 49 | ultraPremium = 'ultra_premium', 50 | } 51 | 52 | export const EquivalentParameters = { 53 | [ScrapingBee.device]: [ScrapingBee.device, ScraperApi.deviceType], 54 | [ScrapingBee.renderJs]: [ScrapingAnt.browser, ScraperApi.render], 55 | [ScrapingBee.waitFor]: [ScrapingAnt.waitForSelector], 56 | [ScrapingBee.premiumProxy]: [ScrapingBee.stealthProxy, ScraperApi.premium, ScraperApi.ultraPremium], 57 | [ScrapingBee.countryCode]: [ScrapingAnt.proxyCountry], 58 | }; 59 | -------------------------------------------------------------------------------- /src/responses.ts: -------------------------------------------------------------------------------- 1 | import { log } from 'apify'; 2 | import { ServerResponse } from 'http'; 3 | 4 | const responses = new Map(); 5 | 6 | 
export const sendSuccResponseById = (responseId: string, result: unknown, contentType: string) => { 7 | const res = responses.get(responseId); 8 | if (!res) { 9 | log.info(`Response for request ${responseId} not found`); 10 | return; 11 | } 12 | res.writeHead(200, { 'Content-Type': contentType }); 13 | res.end(result); 14 | responses.delete(responseId); 15 | }; 16 | 17 | export const sendErrorResponseById = (responseId: string, result: string, statusCode: number = 500) => { 18 | const res = responses.get(responseId); 19 | if (!res) { 20 | log.info(`Response for request ${responseId} not found`); 21 | return; 22 | } 23 | res.writeHead(statusCode, { 'Content-Type': 'application/json' }); 24 | res.end(result); 25 | responses.delete(responseId); 26 | }; 27 | 28 | export const addResponse = (responseId: string, response: ServerResponse) => { 29 | responses.set(responseId, response); 30 | }; 31 | 32 | export const addTimeoutToAllResponses = (timeoutInSeconds: number = 60) => { 33 | const migrationErrorMessage = { 34 | errorMessage: `Actor had to migrate to another server. Please, retry your request.`, 35 | }; 36 | 37 | const responseKeys = Object.keys(responses); 38 | 39 | for (const key of responseKeys) { 40 | setTimeout(() => { 41 | sendErrorResponseById(key, JSON.stringify(migrationErrorMessage)); 42 | }, timeoutInSeconds * 1000); 43 | } 44 | }; 45 | -------------------------------------------------------------------------------- /src/router.ts: -------------------------------------------------------------------------------- 1 | import { createPlaywrightRouter } from 'crawlee'; 2 | import { CheerioAPI, load } from 'cheerio'; 3 | import { Label } from './const.js'; 4 | import { FullJsScenarioReport, IFrameData, TimeMeasure, UserData, VerboseResult } from './types.js'; 5 | import { performInstructionsAndGenerateReport } from './instructions_utils.js'; 6 | import { sendSuccResponseById } from './responses.js'; 7 | import { scrapeBasedOnExtractRules } from './extract_rules_utils.js'; 8 | import { pushLogData } from './utils.js'; 9 | 10 | export const router = createPlaywrightRouter(); 11 | 12 | router.addHandler(Label.BROWSER, async ({ request, page, response, parseWithCheerio }) => { 13 | const { 14 | requestDetails, 15 | jsonResponse, 16 | extractRules, 17 | screenshotSettings, 18 | inputtedUrl, 19 | parsedInputtedParams, 20 | timeMeasures, 21 | jsScenario, 22 | returnPageSource, 23 | } = request.userData; 24 | 25 | // See comment in crawler.autoscaledPoolOptions.runTaskFunction override 26 | timeMeasures.push((global as unknown as { latestRequestTaskTimeMeasure: TimeMeasure }).latestRequestTaskTimeMeasure); 27 | 28 | const responseId = request.uniqueKey; 29 | 30 | timeMeasures.push({ 31 | event: 'page loaded', 32 | time: Date.now(), 33 | }); 34 | 35 | const jsScenarioReportFull: FullJsScenarioReport = {}; 36 | if (jsScenario.instructions.length) { 37 | const { jsScenarioReport, evaluateResults } = await performInstructionsAndGenerateReport(jsScenario, page); 38 | jsScenarioReportFull.jsScenarioReport = jsScenarioReport; 39 | jsScenarioReportFull.evaluateResults = evaluateResults; 40 | } 41 | 42 | requestDetails.resolvedUrl = response?.url() || ''; 43 | requestDetails.responseHeaders = response?.headers() || {}; 44 | const $ = await parseWithCheerio(); 45 | const statusCode = response?.status() || null; 46 | 47 | const cookies = await page.context().cookies(request.url) || []; 48 | 49 | const iframes: IFrameData[] = []; 50 | if (jsonResponse) { 51 | const frames = page.frames(); 52 | for (const 
frame of frames) { 53 | let frameEl; 54 | try { 55 | frameEl = await frame.frameElement(); 56 | } catch (e) { 57 | continue; 58 | } 59 | 60 | const src = await frameEl.getAttribute('src') || ''; 61 | const content = await frame.content(); 62 | 63 | iframes.push({ 64 | src, 65 | content, 66 | }); 67 | } 68 | } 69 | 70 | let screenshot = null; 71 | if (screenshotSettings.screenshotType !== 'none') { 72 | const { screenshotType, selector } = screenshotSettings; 73 | let screenshotBuffer: Buffer; 74 | if (screenshotType === 'full') { 75 | screenshotBuffer = await page.screenshot({ fullPage: true }); 76 | } else if (screenshotType === 'window') { 77 | screenshotBuffer = await page.screenshot(); 78 | } else { 79 | screenshotBuffer = await page.locator(selector as string).screenshot(); 80 | } 81 | screenshot = screenshotBuffer.toString('base64'); 82 | 83 | if (!jsonResponse) { 84 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: screenshot, errors: requestDetails.requestErrors }); 85 | sendSuccResponseById(responseId, screenshotBuffer, 'image/png'); 86 | return; 87 | } 88 | } 89 | 90 | if (extractRules) { 91 | const resultFromExtractRules = scrapeBasedOnExtractRules($ as CheerioAPI, extractRules); 92 | if (jsonResponse) { 93 | const verboseResponse: VerboseResult = { 94 | body: resultFromExtractRules, 95 | cookies, 96 | evaluateResults: jsScenarioReportFull.evaluateResults || [], 97 | jsScenarioReport: jsScenarioReportFull.jsScenarioReport || {}, 98 | headers: requestDetails.responseHeaders, 99 | type: 'json', 100 | iframes, 101 | xhr: requestDetails.xhr, 102 | initialStatusCode: statusCode, 103 | resolvedUrl: requestDetails.resolvedUrl, 104 | screenshot, 105 | }; 106 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors }); 107 | sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json'); 108 | } else { 109 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: resultFromExtractRules, errors: requestDetails.requestErrors }); 110 | sendSuccResponseById(responseId, JSON.stringify(resultFromExtractRules), 'application/json'); 111 | } 112 | return; 113 | } 114 | 115 | // response.body() contains HTML of the page before js rendering 116 | const htmlResult = returnPageSource 117 | ? 
117 |         ? (await response?.body())?.toString() as string
118 |         : $.html();
119 |
120 |     if (jsonResponse) {
121 |         const verboseResponse: VerboseResult = {
122 |             body: htmlResult,
123 |             cookies,
124 |             evaluateResults: jsScenarioReportFull.evaluateResults || [],
125 |             jsScenarioReport: jsScenarioReportFull.jsScenarioReport || {},
126 |             headers: requestDetails.responseHeaders,
127 |             type: 'html',
128 |             iframes,
129 |             xhr: requestDetails.xhr,
130 |             initialStatusCode: statusCode,
131 |             resolvedUrl: requestDetails.resolvedUrl,
132 |             screenshot,
133 |         };
134 |         await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors });
135 |         sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json');
136 |         return;
137 |     }
138 |     await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: htmlResult, errors: requestDetails.requestErrors });
139 |     sendSuccResponseById(responseId, htmlResult, 'text/html');
140 | });
141 |
142 | router.addHandler(Label.HTTP, async ({ request, sendRequest }) => {
143 |     const {
144 |         requestDetails,
145 |         jsonResponse,
146 |         extractRules,
147 |         inputtedUrl,
148 |         parsedInputtedParams,
149 |         timeMeasures,
150 |     } = request.userData as UserData;
151 |
152 |     // See comment in crawler.autoscaledPoolOptions.runTaskFunction override
153 |     timeMeasures.push((global as unknown as { latestRequestTaskTimeMeasure: TimeMeasure }).latestRequestTaskTimeMeasure);
154 |
155 |     const responseId = request.uniqueKey;
156 |
157 |     const resp = await sendRequest({
158 |         url: request.url,
159 |         throwHttpErrors: false,
160 |         headers: request.headers,
161 |     });
162 |
163 |     timeMeasures.push({
164 |         event: 'page loaded',
165 |         time: Date.now(),
166 |     });
167 |
168 |     const { statusCode } = resp;
169 |     if (resp.statusCode >= 300 && resp.statusCode !== 404) {
170 |         (request.userData as UserData).nonbrowserRequestStatus = resp.statusCode;
171 |         throw new Error(`HTTPError: Response code ${resp.statusCode}`);
172 |     }
173 |
174 |     requestDetails.resolvedUrl = resp.url;
175 |     requestDetails.responseHeaders = resp.headers as Record<string, string | string[]>;
176 |
177 |     if (extractRules) {
178 |         const $ = load(resp.body);
179 |         const resultFromExtractRules = scrapeBasedOnExtractRules($, extractRules);
180 |         if (jsonResponse) {
181 |             const verboseResponse: VerboseResult = {
182 |                 body: resultFromExtractRules,
183 |                 cookies: [],
184 |                 evaluateResults: [],
185 |                 jsScenarioReport: {},
186 |                 headers: requestDetails.responseHeaders,
187 |                 type: 'json',
188 |                 iframes: [],
189 |                 xhr: [],
190 |                 initialStatusCode: statusCode,
191 |                 resolvedUrl: requestDetails.resolvedUrl,
192 |                 screenshot: null,
193 |             };
194 |             await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors });
195 |             sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json');
196 |         } else {
197 |             await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: resultFromExtractRules, errors: requestDetails.requestErrors });
198 |             sendSuccResponseById(responseId, JSON.stringify(resultFromExtractRules), 'application/json');
199 |         }
200 |         return;
201 |     }
202 |
203 |     const htmlResult = resp.body;
204 |     if (jsonResponse) {
205 |         const verboseResponse: VerboseResult = {
206 |             body: htmlResult,
207 |             cookies: [],
208 |             evaluateResults: [],
209 |             jsScenarioReport: {},
210 |             headers: requestDetails.responseHeaders,
211 |             type: 'html',
212 |             iframes: [],
213 |             xhr: [],
214 |             initialStatusCode: statusCode,
215 |             resolvedUrl: requestDetails.resolvedUrl,
216 |             screenshot: null,
217 |         };
218 |         await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors });
219 |         sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json');
220 |         return;
221 |     }
222 |     await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: htmlResult, errors: requestDetails.requestErrors });
223 |     sendSuccResponseById(responseId, htmlResult, 'text/html');
224 | });
225 |
226 | router.addHandler(Label.BINARY_TARGET, async ({ request, sendRequest }) => {
227 |     const {
228 |         requestDetails,
229 |         jsonResponse,
230 |         inputtedUrl,
231 |         parsedInputtedParams,
232 |         timeMeasures,
233 |     } = request.userData as UserData;
234 |
235 |     // See comment in crawler.autoscaledPoolOptions.runTaskFunction override
236 |     timeMeasures.push((global as unknown as { latestRequestTaskTimeMeasure: TimeMeasure }).latestRequestTaskTimeMeasure);
237 |
238 |     const responseId = request.uniqueKey;
239 |
240 |     const resp = await sendRequest({
241 |         url: request.url,
242 |         throwHttpErrors: false,
243 |         headers: request.headers,
244 |     });
245 |
246 |     timeMeasures.push({
247 |         event: 'page loaded',
248 |         time: Date.now(),
249 |     });
250 |
251 |     const { statusCode } = resp;
252 |     if (resp.statusCode >= 300 && resp.statusCode !== 404) {
253 |         (request.userData as UserData).nonbrowserRequestStatus = resp.statusCode;
254 |         throw new Error(`HTTPError: Response code ${resp.statusCode}`);
255 |     }
256 |
257 |     requestDetails.resolvedUrl = resp.url;
258 |     requestDetails.responseHeaders = resp.headers as Record<string, string | string[]>;
259 |     const result = resp.rawBody;
260 |     const contentType = resp.headers['content-type'];
261 |     if (!contentType) {
262 |         throw new Error(`No content-type returned in the response`);
263 |     }
264 |
265 |     if (jsonResponse) {
266 |         const verboseResponse: VerboseResult = {
267 |             body: result.toString(),
268 |             cookies: [],
269 |             evaluateResults: [],
270 |             jsScenarioReport: {},
271 |             headers: requestDetails.responseHeaders,
272 |             type: 'file',
273 |             iframes: [],
274 |             xhr: [],
275 |             initialStatusCode: statusCode,
276 |             resolvedUrl: requestDetails.resolvedUrl,
277 |             screenshot: null,
278 |         };
279 |         await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors });
280 |         sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json');
281 |         return;
282 |     }
283 |
284 |     await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result, errors: requestDetails.requestErrors });
285 |     sendSuccResponseById(responseId, result, contentType);
286 | });
287 |
--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
 1 | import type { ProxyConfigurationOptions } from 'apify';
 2 | import { Cookie } from 'crawlee';
 3 |
 4 | export interface RequestDetails {
 5 |     requestErrors: { attempt: number, errorMessage: string }[],
 6 |     resolvedUrl: string | null,
 7 |     responseHeaders: Record<string, string | string[]> | null,
 8 |     xhr: XHRRequestData[],
 9 | }
10 |
11 | export interface XHRRequestData {
12 |     url: string,
13 |     statusCode: number,
14 |     method: string,
15 |     requestHeaders: Record<string, string>,
16 |     headers: Record<string, string>,
17 |     body: string,
18 | }
19 |
20 | export interface IFrameData {
21 |     src: string,
22 |     content: string,
23 | }
24 |
25 | export interface VerboseResult {
26 |     body: string | Record<string, unknown>,
27 |     cookies: Cookie[],
28 |     evaluateResults: string[],
29 |     jsScenarioReport: JsScenarioReport | Record<string, never>,
30 |     headers: Record<string, string | string[]>,
31 |     type: 'html' | 'json' | 'file',
32 |     screenshot: string | null,
33 |     iframes: IFrameData[],
34 |     xhr: XHRRequestData[],
35 |     initialStatusCode: number | null,
36 |     resolvedUrl: string,
37 |     metadata?: string,
38 | }
39 |
40 | export interface ExtractRule {
41 |     selector: string,
42 |     type: 'list' | 'item',
43 |     output: string | Record<string, ExtractRule>,
44 |     clean: boolean,
45 | }
46 |
47 | export type ExtractRules = Record<string, ExtractRule>;
48 |
49 | export interface TimeMeasure {
50 |     event: 'request received' | 'before queue add' | 'crawlee internal run task' | 'crawlee internal request handler' | 'pre-navigation hook' |
51 |         'page loaded' | 'handler end' | 'error' | 'failed request',
52 |     time: number,
53 | }
54 |
55 | export type Action = 'wait' | 'wait_for' | 'click' | 'scroll_x' | 'scroll_y' | 'fill' | 'wait_browser' | 'evaluate';
56 | type ActionParam = number | string | string[];
57 |
58 | export interface Instruction {
59 |     action: Action,
60 |     param: ActionParam,
61 | }
62 |
63 | export interface JsScenario {
64 |     instructions: Instruction[],
65 |     strict: boolean,
66 | }
67 |
68 | export interface IndividualInstructionReport {
69 |     task: Action,
70 |     params: ActionParam,
71 |     success: boolean,
72 |     duration: number,
73 | }
74 |
75 | export interface JsScenarioReport {
76 |     tasks: IndividualInstructionReport[],
77 |     taskExecuted: number,
78 |     taskSuccess: number,
79 |     taskFailure: number,
80 |     totalDuration: number,
81 | }
82 |
83 | export interface FullJsScenarioReport {
84 |     evaluateResults?: string[],
85 |     jsScenarioReport?: JsScenarioReport,
86 | }
87 |
88 | export interface ScreenshotSettings {
89 |     screenshotType: 'none' | 'window' | 'full' | 'selector',
90 |     selector?: string,
91 | }
92 |
93 | export interface UserData {
94 |     jsonResponse: boolean,
95 |     screenshotSettings: ScreenshotSettings,
96 |     requestDetails: RequestDetails,
97 |     extractRules: ExtractRules | null,
98 |     inputtedUrl: string,
99 |     parsedInputtedParams: Record<string, unknown>,
100 |     timeMeasures: TimeMeasure[],
101 |     jsScenario: JsScenario,
102 |     blockResources: boolean,
103 |     blockResourceTypes: string[],
104 |     height: number,
105 |     width: number,
106 |     returnPageSource: boolean,
107 |     transparentStatusCode: boolean,
108 |     nonbrowserRequestStatus?: number,
109 |     binaryTarget: boolean,
110 | }
111 |
112 | export interface CrawlerOptions {
113 |     proxyConfigurationOptions: ProxyConfigurationOptions;
114 | }
115 |
--------------------------------------------------------------------------------
/src/utils.ts:
--------------------------------------------------------------------------------
 1 | import type { ParsedUrlQuery } from 'querystring';
 2 | import { parse } from 'querystring';
 3 | import type { IncomingMessage } from 'http';
 4 | import { RequestOptions } from 'crawlee';
 5 | import { v4 as uuidv4 } from 'uuid';
 6 | import { HeaderGenerator } from 'header-generator';
 7 | import { Actor, ProxyConfigurationOptions, log } from 'apify';
 8 | import { TimeMeasure, JsScenario, RequestDetails, ScreenshotSettings, UserData } from './types.js';
 9 | import { EquivalentParameters, ScrapingBee, ScraperApi, ScrapingAnt } from './params.js';
10 | import { UserInputError } from './errors.js';
11 | import { validateAndTransformExtractRules } from './extract_rules_utils.js';
12 | import { parseAndValidateInstructions } from './instructions_utils.js';
13 | import { Label, VALID_RESOURCES } from './const.js';
14 |
15 | const transformTimeMeasuresToRelative = (timeMeasures: TimeMeasure[]): TimeMeasure[] => {
16 |     const firstMeasure = timeMeasures[0].time;
17 |     return timeMeasures.map((measure) => {
18 |         return {
19 |             event: measure.event,
20 |             time: measure.time - firstMeasure,
21 |         };
22 |     }).sort((a, b) => a.time - b.time);
23 | };
24 |
25 | export async function pushLogData(timeMeasures: TimeMeasure[], data: Record<string, unknown>, failed = false) {
26 |     timeMeasures.push({
27 |         event: failed ? 'failed request' : 'handler end',
28 |         time: Date.now(),
29 |     });
30 |     const relativeMeasures = transformTimeMeasuresToRelative(timeMeasures);
31 |     log.info(`Response sent (${relativeMeasures.at(-1)?.time} ms) ${data.inputtedUrl}`, { ...relativeMeasures });
32 |     await Actor.pushData({
33 |         ...data,
34 |         measures: relativeMeasures,
35 |     });
36 | }
37 |
38 | const isValidResourceType = (resource: string) => {
39 |     return VALID_RESOURCES.includes(resource);
40 | };
41 |
42 | function mapEquivalentParams(params: ParsedUrlQuery) {
43 |     for (const [ScrapingBeeParam, EquivalentParams] of Object.entries(EquivalentParameters)) {
44 |         if (params[ScrapingBeeParam]) {
45 |             continue;
46 |         }
47 |         for (const eqParam of EquivalentParams) {
48 |             if (params[eqParam]) {
49 |                 params[ScrapingBeeParam] = params[eqParam];
50 |                 continue;
51 |             }
52 |         }
53 |     }
54 |     return params;
55 | }
56 |
57 | export function parseParameters(url: string) {
58 |     const params = parse(url.slice(2));
59 |     return mapEquivalentParams(params);
60 | }
61 |
62 | function generateHeaders(device: 'mobile' | 'desktop') {
63 |     const headerGenerator = new HeaderGenerator({
64 |         devices: [device],
65 |     });
66 |     const generatedHeaders = headerGenerator.getHeaders();
67 |     // remove 'te' header as it is causing page.goto: net::ERR_INVALID_ARGUMENT error
68 |     // eslint-disable-next-line @typescript-eslint/no-unused-vars
69 |     const { te, ...rest } = generatedHeaders;
70 |     return rest;
71 | }
72 |
73 | export function createRequestForCrawler(params: ParsedUrlQuery, req: IncomingMessage): RequestOptions {
74 |     if (!params[ScrapingBee.url] || !params[ScrapingBee.url].length) {
75 |         throw new UserInputError('Parameter url is either missing or empty');
76 |     }
77 |     const urlToScrape = params[ScrapingBee.url] as string;
78 |
79 |     const useExtractRules = !!params[ScrapingBee.extractRules]; // using !! casts non-bool to bool
80 |     let inputtedExtractRules;
81 |     if (useExtractRules) {
82 |         inputtedExtractRules = JSON.parse(params[ScrapingBee.extractRules] as string);
83 |     }
84 |
85 |     let selectedDevice: 'desktop' | 'mobile' = 'desktop';
86 |     if (params[ScrapingBee.device]) {
87 |         const device = params[ScrapingBee.device] as string;
88 |         if (device === 'mobile') {
89 |             selectedDevice = 'mobile';
90 |         }
91 |
92 |         if (device !== 'desktop' && device !== 'mobile') {
93 |             throw new UserInputError('Param device can be either desktop or mobile');
94 |         }
95 |     }
96 |
97 |     const generatedHeaders = generateHeaders(selectedDevice);
98 |
99 |     const doScenario = !!params[ScrapingBee.jsScenario];
100 |     const jsScenario: JsScenario = doScenario
101 |         ? parseAndValidateInstructions(params[ScrapingBee.jsScenario] as string)
102 |         : { instructions: [], strict: false };
103 |
104 |     const renderJs = !(params[ScrapingBee.renderJs] === 'false'
105 |         || params[ScrapingAnt.browser] === 'false'
106 |         || params[ScraperApi.render] === 'false');
107 |
108 |     if (renderJs && params[ScrapingBee.wait]) {
109 |         const parsedWait = Number.parseInt(params[ScrapingBee.wait] as string, 10);
110 |         if (Number.isNaN(parsedWait)) {
111 |             throw new UserInputError('Number value expected for wait parameter');
112 |         } else {
113 |             jsScenario.instructions.unshift({
114 |                 action: 'wait',
115 |                 param: Math.min(parsedWait, 35000),
116 |             });
117 |         }
118 |     }
119 |
120 |     if (renderJs && (params[ScrapingBee.waitFor])) {
121 |         const waitForSelector = params[ScrapingBee.waitFor];
122 |         if (typeof waitForSelector !== 'string' || !waitForSelector.length) {
123 |             throw new UserInputError('Non-empty selector expected for wait_for and wait_for_selector parameters');
124 |         } else {
125 |             jsScenario.instructions.unshift({
126 |                 action: 'wait_for',
127 |                 param: waitForSelector,
128 |             });
129 |         }
130 |     }
131 |
132 |     if (renderJs && params[ScrapingBee.waitBrowser]) {
133 |         const waitForBrowserState = params[ScrapingBee.waitBrowser] as string;
134 |         if (!['load', 'domcontentloaded', 'networkidle'].includes(waitForBrowserState)) {
135 |             throw new UserInputError('Unsupported value for wait_browser parameter');
136 |         } else {
137 |             jsScenario.instructions.unshift({
138 |                 action: 'wait_browser',
139 |                 param: waitForBrowserState,
140 |             });
141 |         }
142 |     }
143 |
144 |     if (renderJs && params[ScrapingAnt.jsSnippet]) {
145 |         const jsSnippetBase64 = params[ScrapingAnt.jsSnippet] as string;
146 |         if (!jsSnippetBase64.length) {
147 |             throw new UserInputError('Parameter js_snippet must be a non empty string');
148 |         }
149 |         const jsSnippet = Buffer.from(jsSnippetBase64, 'base64').toString();
150 |         if (!jsSnippet.length) {
151 |             throw new UserInputError('Decoding of js_snippet was not successful');
152 |         }
153 |         jsScenario.instructions.unshift({
154 |             action: 'evaluate',
155 |             param: jsSnippet,
156 |         });
157 |     }
158 |
159 |     const requestDetails: RequestDetails = {
160 |         requestErrors: [],
161 |         resolvedUrl: null,
162 |         responseHeaders: null,
163 |         xhr: [],
164 |     };
165 |
166 |     const screenshotSettings: ScreenshotSettings = {
167 |         screenshotType: 'none',
168 |     };
169 |     if (params[ScrapingBee.screenshot] === 'true') {
170 |         screenshotSettings.screenshotType = 'window';
171 |     }
172 |     if (params[ScrapingBee.screenshotFullPage] === 'true') {
173 |         screenshotSettings.screenshotType = 'full';
174 |     }
175 |     if (params[ScrapingBee.screenshotSelector]) {
176 |         if (typeof params[ScrapingBee.screenshotSelector] !== 'string') {
177 |             throw new UserInputError('Parameter screenshot_selector must be a string');
178 |         }
179 |         screenshotSettings.screenshotType = 'selector';
180 |         screenshotSettings.selector = params[ScrapingBee.screenshotSelector];
181 |     }
182 |
183 |     let blockResourceTypes: string[] = [];
184 |     if (params[ScrapingAnt.blockResource]) {
185 |         const paramValue = params[ScrapingAnt.blockResource];
186 |         const resources = Array.isArray(paramValue) ? paramValue : [paramValue];
187 |         const resourcesToBlock = new Set<string>();
188 |         for (const resource of resources) {
189 |             if (isValidResourceType(resource)) {
190 |                 resourcesToBlock.add(resource);
191 |             } else {
192 |                 throw new UserInputError(`Unsupported value in block_resource: ${resource}`);
193 |             }
194 |         }
195 |         blockResourceTypes = Array.from(resourcesToBlock.values());
196 |     }
197 |
198 |     let binaryTarget = false;
199 |     if (params[ScraperApi.binaryTarget]) {
200 |         const binaryTargetIsTrue = params[ScraperApi.binaryTarget] === 'true';
201 |         binaryTarget = binaryTargetIsTrue;
202 |     }
203 |
204 |     const finalRequest: RequestOptions = {
205 |         url: urlToScrape,
206 |         uniqueKey: uuidv4(),
207 |         headers: {
208 |             ...generatedHeaders,
209 |         },
210 |         skipNavigation: !renderJs,
211 |         userData: {
212 |             jsonResponse: params[ScrapingBee.jsonResponse] === 'true',
213 |             screenshotSettings,
214 |             requestDetails,
215 |             extractRules: useExtractRules ? validateAndTransformExtractRules(inputtedExtractRules) : null,
216 |             inputtedUrl: req.url as string,
217 |             parsedInputtedParams: params,
218 |             timeMeasures: [],
219 |             jsScenario,
220 |             blockResources: !(params[ScrapingBee.blockResources] === 'false'),
221 |             width: Number.parseInt(params[ScrapingBee.windowWidth] as string, 10) || 1920,
222 |             height: Number.parseInt(params[ScrapingBee.windowHeight] as string, 10) || 1080,
223 |             returnPageSource: params[ScrapingBee.returnPageSource] === 'true',
224 |             transparentStatusCode: params[ScrapingBee.transparentStatusCode] === 'true',
225 |             blockResourceTypes,
226 |             binaryTarget,
227 |         },
228 |     };
229 |
230 |     // headers with ant/spb prefixes
231 |     if (params[ScrapingBee.forwardHeaders] === 'true' || params[ScrapingBee.forwardHeadersPure] === 'true') {
232 |         const reqHeaders = req.headers;
233 |         const headersToForward: Record<string, string> = {};
234 |         for (const headerKey of Object.keys(reqHeaders)) {
235 |             if (headerKey.startsWith('spb-') || headerKey.startsWith('ant-')) {
236 |                 const withoutPrefixKey = headerKey.slice(4);
237 |
238 |                 const skippedHeaders = ['cookie', 'set-cookie', 'host'];
239 |                 if (skippedHeaders.includes(withoutPrefixKey)) {
240 |                     continue;
241 |                 }
242 |
243 |                 // header values other than 'set-cookie' should be string (not string[]), but there's a check just in case
244 |                 const headerValue = reqHeaders[headerKey];
245 |                 if (Array.isArray(headerValue)) {
246 |                     continue;
247 |                 }
248 |                 headersToForward[withoutPrefixKey] = headerValue as string;
249 |             }
250 |         }
251 |
252 |         if (params[ScrapingBee.forwardHeaders] === 'true') {
253 |             const currentHeaders = finalRequest.headers;
254 |             finalRequest.headers = {
255 |                 ...currentHeaders,
256 |                 ...headersToForward,
257 |             };
258 |         } else {
259 |             // forward headers pure
260 |             finalRequest.headers = {
261 |                 ...headersToForward,
262 |             };
263 |         }
264 |     }
265 |
266 |     // all headers
267 |     if (params[ScraperApi.keepHeaders] === 'true') {
268 |         const reqHeaders = req.headers;
269 |         const headersToForward: Record<string, string> = {};
270 |         for (const [key, val] of Object.entries(reqHeaders)) {
271 |             if (Array.isArray(val)) {
272 |                 continue;
273 |             }
274 |             headersToForward[key] = val as string;
275 |         }
276 |         finalRequest.headers = headersToForward;
277 |     }
278 |
279 |     if (params[ScrapingBee.cookies]) {
280 |         finalRequest.headers!.Cookie = params[ScrapingBee.cookies] as string;
281 |     }
282 |
283 |     if (binaryTarget) {
284 |         finalRequest.label = Label.BINARY_TARGET;
285 |         return finalRequest;
286 |     }
287 |
288 |     finalRequest.label = renderJs ? Label.BROWSER : Label.HTTP;
289 |     return finalRequest;
290 | }
291 |
292 | export function createProxyOptions(params: ParsedUrlQuery) {
293 |     const proxyOptions: ProxyConfigurationOptions = {};
294 |
295 |     const proxyType = params[ScrapingAnt.proxyType] as string || 'datacenter';
296 |     if (proxyType !== 'datacenter' && proxyType !== 'residential') {
297 |         throw new UserInputError('Parameter proxy_type can be either residential or datacenter');
298 |     }
299 |
300 |     const useGoogleProxy = params[ScrapingBee.customGoogle] === 'true';
301 |     const url = new URL(params[ScrapingBee.url] as string);
302 |     if (url.host.includes('google') && !useGoogleProxy) {
303 |         throw new UserInputError('Set param custom_google to true to scrape Google urls');
304 |     }
305 |     if (useGoogleProxy) {
306 |         proxyOptions.groups = ['GOOGLE_SERP'];
307 |         return proxyOptions;
308 |     }
309 |
310 |     if (params[ScrapingBee.ownProxy]) {
311 |         proxyOptions.proxyUrls = [params[ScrapingBee.ownProxy] as string];
312 |         return proxyOptions;
313 |     }
314 |
315 |     const usePremium = params[ScrapingBee.premiumProxy] === 'true' || proxyType === 'residential';
316 |     if (usePremium) {
317 |         proxyOptions.groups = ['RESIDENTIAL'];
318 |     }
319 |
320 |     if (params[ScrapingBee.countryCode]) {
321 |         const countryCode = (params[ScrapingBee.countryCode] as string).toUpperCase();
322 |         if (countryCode.length !== 2) {
323 |             throw new UserInputError('Parameter for country code must be a string of length 2');
324 |         }
325 |         if (!usePremium && countryCode !== 'US') {
326 |             throw new UserInputError('Parameter for country code must be used with premium proxies when using non-US country');
327 |         }
328 |         proxyOptions.countryCode = countryCode;
329 |     }
330 |     return proxyOptions;
331 | }
332 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "extends": "@apify/tsconfig",
 3 |     "compilerOptions": {
 4 |         "module": "NodeNext",
 5 |         "moduleResolution": "NodeNext",
 6 |         "target": "ES2022",
 7 |         "outDir": "dist",
 8 |         "noUnusedLocals": false,
 9 |         "skipLibCheck": true,
10 |         "lib": ["DOM"]
11 |     },
12 |     "include": [
13 |         "./src/**/*"
14 |     ]
15 | }
16 |
--------------------------------------------------------------------------------