├── .actor ├── Dockerfile ├── actor.json └── input_schema.json ├── .dockerignore ├── .editorconfig ├── .eslintrc ├── .gitignore ├── README.md ├── package-lock.json ├── package.json ├── src ├── const.ts ├── crawlers.ts ├── errors.ts ├── extract_rules_utils.ts ├── instructions_utils.ts ├── main.ts ├── params.ts ├── responses.ts ├── router.ts ├── types.ts └── utils.ts └── tsconfig.json /.actor/Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://crawlee.dev/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node-playwright-chrome:20 AS builder 5 | 6 | # Copy just package.json and package-lock.json 7 | # to speed up the build using Docker layer cache. 8 | COPY --chown=myuser package*.json ./ 9 | 10 | # Install all dependencies. Don't audit to speed up the installation. 11 | RUN npm install --include=dev --audit=false 12 | 13 | # Next, copy the source files using the user set 14 | # in the base image. 15 | COPY --chown=myuser . ./ 16 | 17 | # Install all dependencies and build the project. 18 | # Don't audit to speed up the installation. 19 | RUN npm run build 20 | 21 | # Create final image 22 | FROM apify/actor-node-playwright-chrome:20 23 | 24 | # Copy just package.json and package-lock.json 25 | # to speed up the build using Docker layer cache. 26 | COPY --chown=myuser package*.json ./ 27 | 28 | # Install NPM packages, skip optional and development dependencies to 29 | # keep the image small. Avoid logging too much and print the dependency 30 | # tree for debugging 31 | RUN npm --quiet set progress=false \ 32 | && npm install --omit=dev --omit=optional \ 33 | && echo "Installed NPM packages:" \ 34 | && (npm list --omit=dev --all || true) \ 35 | && echo "Node.js version:" \ 36 | && node --version \ 37 | && echo "NPM version:" \ 38 | && npm --version \ 39 | && rm -r ~/.npm 40 | 41 | # Copy built JS files from builder image 42 | COPY --from=builder --chown=myuser /home/myuser/dist ./dist 43 | 44 | # Next, copy the remaining files and directories with the source code. 45 | # Since we do this after NPM install, quick build will be really fast 46 | # for most source file changes. 47 | COPY --chown=myuser . ./ 48 | 49 | 50 | # Run the image. If you know you won't need headful browsers, 51 | # you can remove the XVFB start script for a micro perf gain. 52 | CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent 53 | -------------------------------------------------------------------------------- /.actor/actor.json: -------------------------------------------------------------------------------- 1 | { 2 | "actorSpecification": 1, 3 | "name": "standby-crawler", 4 | "title": "Project Playwright Crawler Typescript", 5 | "description": "Crawlee and Playwright project in typescript.", 6 | "version": "0.0", 7 | "meta": { 8 | "templateId": "ts-crawlee-playwright-chrome" 9 | }, 10 | "input": "./input_schema.json", 11 | "dockerfile": "./Dockerfile" 12 | } 13 | -------------------------------------------------------------------------------- /.actor/input_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "PlaywrightCrawler Template", 3 | "type": "object", 4 | "schemaVersion": 1, 5 | "description": "Super Scraper API currently cannot be run manually via Input. 
Use Standby endpoint with available parameters.", 6 | "properties": { 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # configurations 2 | .idea 3 | 4 | # crawlee and apify storage folders 5 | apify_storage 6 | crawlee_storage 7 | storage 8 | 9 | # installed files 10 | node_modules 11 | 12 | # git folder 13 | .git 14 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | end_of_line = lf 10 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "env": { 4 | "browser": true, 5 | "es2020": true, 6 | "node": true 7 | }, 8 | "rules": { 9 | "no-underscore-dangle": "off" 10 | }, 11 | "extends": [ 12 | "@apify/eslint-config-ts" 13 | ], 14 | "parserOptions": { 15 | "project": "./tsconfig.json", 16 | "ecmaVersion": 2020 17 | }, 18 | "ignorePatterns": [ 19 | "node_modules", 20 | "dist", 21 | "**/*.d.ts" 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # This file tells Git which files shouldn't be added to source control 2 | 3 | .DS_Store 4 | .idea 5 | dist 6 | node_modules 7 | apify_storage 8 | storage 9 | 10 | # Added by Apify CLI 11 | .venv 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SuperScraper API 2 | 3 | SuperScraper API is an Actor that provides a REST API for scraping websites. 4 | Just pass the URL of a web page and get back the fully rendered HTML content. 5 | SuperScraper API is compatible with [ScrapingBee](https://www.scrapingbee.com/), 6 | [ScrapingAnt](https://scrapingant.com/), 7 | and [ScraperAPI](https://scraperapi.com/) interfaces. 8 | 9 | Main features: 10 | - Extract HTML from arbitrary URLs with a headless browser for dynamic content rendering. 11 | - Circumvent blocking using datacenter or residential proxies, as well as browser fingerprinting. 12 | - Seamlessly scale to a large number of web pages as needed. 13 | - Capture screenshots of the web pages. 14 | 15 | Note that SuperScraper API uses the new experimental Actor Standby mode, so it's not started the traditional way from Apify Console. 16 | Instead, it's invoked via the HTTP REST API provided directly by the Actor. See the examples below. 17 | 18 | ## Usage examples 19 | 20 | To run these examples, you need an Apify API token, 21 | which you can find under [Settings > Integrations](https://console.apify.com/account/integrations) in Apify Console. 22 | 23 | You can create an Apify account free of charge. 
24 | 25 | ### Node.js 26 | 27 | ```ts 28 | import axios from 'axios'; 29 | 30 | const resp = await axios.get('https://super-scraper-api.apify.actor/', { 31 | params: { 32 | url: 'https://apify.com/store', 33 | wait_for: '.ActorStoreItem-title', 34 | json_response: true, 35 | screenshot: true, 36 | }, 37 | headers: { 38 | Authorization: 'Bearer ', 39 | }, 40 | }); 41 | 42 | console.log(resp.data); 43 | ``` 44 | 45 | ### curl 46 | 47 | ```shell 48 | curl -X GET \ 49 | 'https://super-scraper-api.apify.actor/?url=https://apify.com/store&wait_for=.ActorStoreItem-title&screenshot=true&json_response=true' \ 50 | --header 'Authorization: Bearer ' 51 | ``` 52 | 53 | ## Authentication 54 | 55 | The best way to authenticate is to pass your Apify API token using the `Authorization` HTTP header. 56 | Alternatively, you can pass the API token via the `token` query parameter to authenticate the requests, which is more convenient for testing in a web browser. 57 | 58 | ### Node.js 59 | 60 | ```ts 61 | const resp = await axios.get('https://super-scraper-api.apify.actor/', { 62 | params: { 63 | url: 'https://apify.com/store', 64 | token: '' 65 | }, 66 | }); 67 | ``` 68 | 69 | 70 | ### curl 71 | 72 | ```shell 73 | curl -X GET 'https://super-scraper-api.apify.actor/?url=https://apify.com/store&wait_for=.ActorStoreItem-title&json_response=true&token=' 74 | ``` 75 | 76 | ## Pricing 77 | 78 | When using SuperScraper API, you're charged based on your actual usage of the Apify platform's computing, storage, and networking resources. 79 | 80 | Cost depends on the target sites, your settings and API parameters, the load of your requests, and random network and target site conditions. 81 | 82 | The best way to see your price is to conduct a real-world test. 83 | 84 | An example cost on a free account (the pricing is cheaper on higher plans) for 30 one-by-one requests plus 50 batched requests test: 85 | 86 | | parameters | cost estimate 87 | | ------------- |-----------------------------------| 88 | | no `render_js` + basic proxy | $1/1000 requests 89 | | no `render_js` + premium (residential) proxy | $2/1000 requests 90 | | `render_js` + basic proxy | $4/1000 requests 91 | | `render_js` + premium (residential) proxy | $5/1000 requests 92 | 93 | ## API parameters 94 | 95 | ### ScrapingBee API parameters 96 | 97 | SuperScraper API supports most of the API parameters of [ScrapingBee](https://www.scrapingbee.com/documentation/): 98 | 99 | | parameter | description | 100 | | -------- |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 101 | | `url` | URL of the webpage to be scraped. **This parameter is required.** | 102 | | `json_response` | Return a verbose JSON response with additional details about the webpage. Can be either `true` or `false`, default is `false`. | 103 | | `extract_rules` | A stringified JSON containing custom rules how to extract data from the webpage. | 104 | | `render_js` | Indicates that the webpage should be scraped using a headless browser, with dynamic content rendered. Can be `true` or `false`, default is `true`. This is equivalent to ScrapingAnt's `browser`. | 105 | | `screenshot` | Get screenshot of the browser's current viewport. If `json_response` is set to `true`, screenshot will be returned in the Base64 encoding. 
Can be `true` or `false`, default is `false`. | 106 | | `screenshot_full_page` | Get screenshot of the full page. If `json_response` is set to `true`, screenshot will be returned in the Base64 encoding. Can be `true` or `false`, default is `false`. | 107 | | `screenshot_selector` | Get screenshot of the element specified by the selector. If `json_response` is set to `true`, screenshot will be returned in Base64. Must be a non-empty string. | 108 | | `js_scenario` | JavaScript instructions that will be executed after loading the webpage. | 109 | | `wait` | Specify a duration that the browser will wait after loading the page, in milliseconds. | 110 | | `wait_for` | Specify a CSS selector of an element for which the browser will wait after loading the page. | 111 | | `wait_browser` | Specify a browser event to wait for. Can be either `load`, `domcontentloaded`, or `networkidle`. | 112 | | `block_resources` | Specify that you want to block images and CSS. Can be `true` or `false`, default is `true`. | 113 | | `window_width` | Specify the width of the browser's viewport, in pixels. | 114 | | `window_height` | Specify the height of the browser's viewport, in pixels. | 115 | | `cookies` | Custom cookies to use to fetch the web pages. This is useful for fetching webpages behind a login. The cookies must be specified in a string format: `cookie_name_1=cookie_value1;cookie_name_2=cookie_value_2`. | 116 | | `own_proxy` | A custom proxy to be used for scraping, in the format `:@:`. | 117 | | `premium_proxy` | Use residential proxies to fetch the web content, in order to reduce the probability of being blocked. Can be either `true` or `false`, default is `false`. | 118 | | `stealth_proxy` | Works the same as `premium_proxy`. | 119 | | `country_code` | Use IP addresses that are geolocated in the specified country by specifying its [2-letter ISO code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements). When using code other than `US`, `premium_proxy` must be set to `true`. This is equivalent to setting ScrapingAnt's `proxy_country`. | 120 | | `custom_google` | Use this option if you want to scrape Google-related websites (such as Google Search or Google Shopping). Can be `true` or `false`, default is `false`. | 121 | | `return_page_source` | Return HTML of the webpage from the response before any dynamic JavaScript rendering. Can be `true` or `false`, default is `false`. | 122 | | `transparent_status_code` | By default, if the target webpage responds with an HTTP status code other than 200-299 or 404, the API will return an HTTP status code 500. Set this parameter to `true` to disable this behavior and return the status code of the actual response. | 123 | | `timeout` | Set the maximum timeout for the response from this Actor, in milliseconds. The default is 140,000 ms. | 124 | | `forward_headers` | If set to `true`, HTTP headers starting with the prefix `Spb-` or `Ant-` will be forwarded to the target webpage alongside headers generated by us (the prefix will be trimmed). | 125 | | `forward_headers_pure` | If set to `true`, only headers starting with the prefix `Spb-` or `Ant-` will be forwarded to the target webpage (the prefix will be trimmed), without any other HTTP headers from our side. | 126 | | `device` | Can be either `desktop` (default) or `mobile`. | 127 | 128 | ScrapingBee's API parameters `block_ads` and `session_id` are currently not supported.
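For instance, here is a minimal sketch in the style of the Node.js example above that combines several of these ScrapingBee-compatible parameters; the target URL, the chosen parameter values, and the `<YOUR_APIFY_API_TOKEN>` placeholder are illustrative only:

```ts
import axios from 'axios';

// Illustrative combination of the parameters documented above.
const resp = await axios.get('https://super-scraper-api.apify.actor/', {
    params: {
        url: 'https://apify.com/store',        // example target URL
        wait_for: '.ActorStoreItem-title',     // wait for this CSS selector before capturing the page
        screenshot_full_page: true,            // capture the full page
        json_response: true,                   // return a verbose JSON response
        premium_proxy: true,                   // residential proxies; required for non-US country codes
        country_code: 'de',                    // geolocate the IP addresses in Germany
    },
    headers: {
        Authorization: 'Bearer <YOUR_APIFY_API_TOKEN>', // placeholder, use your own token
    },
});

console.log(resp.data);
```

Because `json_response` is set to `true`, the screenshot is returned as a Base64 string inside the JSON body rather than as a binary `image/png` response.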
129 | 130 | ### ScrapingAnt API parameters 131 | 132 | SuperScraper API supports most of the API parameters of [ScrapingAnt](https://docs.scrapingant.com/request-response-format#available-parameters): 133 | 134 | | parameter | description | 135 | | -------- |-----------------------------------| 136 | | `url` | URL of the webpage to be scraped. **This parameter is required.** | 137 | | `browser` | Indicates that the webpage should be scraped using a headless browser, with dynamic content rendered. Can be `true` or `false`, default is `true`. This is equivalent to ScrapingBee's `render_js`. | 138 | | `cookies` | Use custom cookies, must be in a string format: `cookie_name_1=cookie_value1;cookie_name_2=cookie_value_2`. | 139 | | `js_snippet` | A Base64-encoded JavaScript code to be executed on the webpage. Will be treated as the [evaluate](#evaluate) instruction. | 140 | | `proxy_type` | Specify the type of proxies, which can be either `datacenter` (default) or `residential`. This is equivalent to setting ScrapingBee's `premium_proxy` or `stealth_proxy` to `true`. | 141 | | `wait_for_selector` | Specify a CSS selector of an element for which the browser will wait after loading the page. This is equivalent to setting ScrapingBee's `wait_for`. | 142 | | `block_resource` | Specify one or more resource types you want to block from being downloaded. The parameter can be repeated in the URL (e.g. `block_resource=image&block_resource=media`). Available options are: `document`, `stylesheet`, `image`, `media`, `font`, `script`, `texttrack`, `xhr`, `fetch`, `eventsource`, `websocket`, `manifest`, `other`. | 143 | | `return_page_source` | Return HTML of the webpage from the response before any dynamic JavaScript rendering. Can be `true` or `false`, default is `false`. | 144 | | `proxy_country` | Use IP addresses that are geolocated in the specified country by specifying its [2-letter ISO code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements). When using code other than `US`, `premium_proxy` must be set to `true`. This is equivalent to setting ScrapingBee's `country_code`. | 145 | 146 | ScrapingAnt's API parameter `x-api-key` is not supported. 147 | 148 | Note that HTTP headers in a request to this Actor beginning with the prefix `Ant-` will be forwarded (without the prefix) to the target webpage alongside headers generated by the Actor. 149 | This behavior can be changed using ScrapingBee's `forward_headers` or `forward_headers_pure` parameters. 150 | 151 | 152 | ### ScraperAPI API parameters 153 | 154 | SuperScraper API supports most of the API parameters of [ScraperAPI](https://docs.scraperapi.com/making-requests/customizing-requests): 155 | 156 | | parameter | description | 157 | | -------- |-----------------------------------| 158 | | `url` | URL of the webpage to be scraped.
**This parameter is required.** | 159 | | `render` | Specify whether the webpage should be scraped using a headless browser. Can be `true` or `false`, default is `true`. (Same as `render_js`.) | 160 | | `wait_for_selector` | Specify a CSS selector of an element for which the browser will wait after loading the page. This is equivalent to setting ScrapingBee's `wait_for`. | 161 | | `premium` | Use residential proxies to fetch the web content, in order to reduce the probability of being blocked. Can be either `true` or `false`, default is `false`. This is equivalent to setting ScrapingBee's `premium_proxy`. | 162 | | `ultra_premium` | Same as `premium`. | 163 | | `country_code` | Use IP addresses that are geolocated in the specified country by specifying its [2-letter ISO code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements). When using code other than `US`, `premium_proxy` must be set to `true`. This is equivalent to setting ScrapingAnt's `proxy_country`. | 164 | | `keep_headers` | If `true`, then all headers sent to this Actor will be forwarded to the target website. The `Authorization` header will be removed. | 165 | | `device_type` | Can be either `desktop` (default) or `mobile`. This is equivalent to setting ScrapingBee's `device`. | 166 | | `binary_target` | Specify whether the target is a file. Can be `true` or `false`, default is `false`. Currently only supported when JS rendering is set to `false` via the `render_js`, `browser`, or `render` parameters. | 167 | 168 | ScraperAPI's API parameters `session_number` and `autoparse` are currently not supported and are ignored. 169 | 170 | 171 | ### Custom extraction rules 172 | 173 | Using ScrapingBee's `extract_rules` parameter, you can specify a set of rules to extract specific data from the target web pages. You can create an extraction rule in one of two ways: with shortened options, or with full options. 174 | 175 | #### Shortened options 176 | 177 | - the value for the given key serves as a `selector` 178 | - using `@`, we can access an attribute of the selected element 179 | 180 | ##### Example: 181 | 182 | ```json 183 | { 184 | "title": "h1", 185 | "link": "a@href" 186 | } 187 | ``` 188 | 189 | #### Full options 190 | 191 | - `selector` is required 192 | - `type` can be either `item` (default) or `list` 193 | - `output` indicates what the result for these element(s) will look like. It can be: 194 | - `text` (default option when `output` is omitted) - text of the element 195 | - `html` - HTML of the element 196 | - attribute name (starts with `@`, for example `@href`) 197 | - object with other extract rules for the given item (key + shortened or full options) 198 | - `table_json` or `table_array` to scrape a table in a JSON or array format 199 | - `clean` - relevant when `output` is `text`; specifies whether the text of the element should be trimmed of whitespace (can be `true` or `false`, default `true`) 200 | 201 | ##### Example: 202 | 203 | ```json 204 | { 205 | "custom key for links": { 206 | "selector": "a", 207 | "type": "list", 208 | "output": { 209 | "linkName": { 210 | "selector": "a", 211 | "clean": false 212 | }, 213 | "href": { 214 | "selector": "a", 215 | "output": "@href" 216 | } 217 | } 218 | 219 | } 220 | } 221 | ``` 222 | 223 | #### Example 224 | 225 | This example extracts all links from [Apify Blog](https://blog.apify.com/) along with their titles.
226 | 227 | ```ts 228 | const extractRules = { 229 | title: 'h1', 230 | allLinks: { 231 | selector: 'a', 232 | type: 'list', 233 | output: { 234 | title: 'a', 235 | link: 'a@href', 236 | }, 237 | }, 238 | }; 239 | 240 | const resp = await axios.get('https://super-scraper-api.apify.actor/', { 241 | params: { 242 | url: 'https://blog.apify.com/', 243 | extract_rules: JSON.stringify(extractRules), 244 | // verbose: true, 245 | }, 246 | headers: { 247 | Authorization: 'Bearer ', 248 | }, 249 | }); 250 | 251 | console.log(resp.data); 252 | ``` 253 | 254 | The results look like this: 255 | 256 | ```json 257 | { 258 | "title": "Apify Blog", 259 | "allLinks": [ 260 | { 261 | "title": "Data for generative AI & LLM", 262 | "link": "https://apify.com/data-for-generative-ai" 263 | }, 264 | { 265 | "title": "Product matching AI", 266 | "link": "https://apify.com/product-matching-ai" 267 | }, 268 | { 269 | "title": "Universal web scrapers", 270 | "link": "https://apify.com/store/scrapers/universal-web-scrapers" 271 | } 272 | ] 273 | } 274 | ``` 275 | 276 | ### Custom JavaScript code 277 | 278 | Use ScrapingBee's `js_scenario` parameter to specify instructions in order to be executed one by one after opening the page. 279 | 280 | Set `json_response` to `true` to get a full report of the executed instructions, the results of `evaluate` instructions will be added to the `evaluate_results` field. 281 | 282 | Example of clicking a button: 283 | 284 | ```ts 285 | const instructions = { 286 | instructions: [ 287 | { click: '#button' }, 288 | ], 289 | }; 290 | 291 | const resp = await axios.get('https://super-scraper-api.apify.actor/', { 292 | params: { 293 | url: 'https://www.example.com', 294 | js_scenario: JSON.stringify(instructions), 295 | }, 296 | headers: { 297 | Authorization: 'Bearer ', 298 | }, 299 | }); 300 | 301 | console.log(resp.data); 302 | ``` 303 | 304 | #### Strict mode 305 | 306 | If one instruction fails, then the subsequent instructions will not be executed. 
To disable this behavior, you can optionally set `strict` to `false` (by default it's `true`): 307 | 308 | ```json 309 | { 310 | "instructions": [ 311 | { "click": "#button1" }, 312 | { "click": "#button2" } 313 | ], 314 | "strict": false 315 | } 316 | ``` 317 | 318 | #### Supported instructions 319 | 320 | ##### `wait` 321 | 322 | - wait for some time specified in ms 323 | - example: `{"wait": 10000}` 324 | 325 | ##### `wait_for` 326 | 327 | - wait for an element specified by the selector 328 | - example `{"wait_for": "#element"}` 329 | 330 | ##### `click` 331 | 332 | - click on an element specified by the selector 333 | - example `{"click": "#button"}` 334 | 335 | ##### `wait_for_and_click` 336 | - combination of previous two 337 | - example `{"wait_for_and_click": "#button"}` 338 | 339 | ##### `scroll_x` and `scroll_y` 340 | 341 | - scroll a specified number of pixels horizontally or vertically 342 | - example `{"scroll_y": 1000}` or `{"scroll_x": 1000}` 343 | 344 | ##### `fill` 345 | 346 | - specify a selector of the input element and the value you want to fill 347 | - example `{"fill": ["input_1", "value_1"]}` 348 | 349 | ##### `evaluate` 350 | 351 | - evaluate custom javascript on the webpage 352 | - text/number/object results will be saved in the `evaluate_results` field 353 | - example `{"evaluate":"document.querySelectorAll('a').length"}` -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "standby-crawler", 3 | "version": "0.0.1", 4 | "type": "module", 5 | "description": "This is an example of an Apify actor.", 6 | "engines": { 7 | "node": ">=18.0.0" 8 | }, 9 | "dependencies": { 10 | "@crawlee/memory-storage": "^3.8.2", 11 | "apify": "^3.1.10", 12 | "cheerio": "^1.0.0-rc.12", 13 | "crawlee": "^3.9.1", 14 | "header-generator": "^2.1.50", 15 | "playwright": "*", 16 | "uuid": "^9.0.1" 17 | }, 18 | "devDependencies": { 19 | "@apify/eslint-config-ts": "^0.3.0", 20 | "@apify/tsconfig": "^0.1.0", 21 | "@types/uuid": "^9.0.8", 22 | "@typescript-eslint/eslint-plugin": "^6.7.2", 23 | "@typescript-eslint/parser": "^6.7.2", 24 | "eslint": "^8.50.0", 25 | "tsx": "^4.6.2", 26 | "typescript": "^5.3.3" 27 | }, 28 | "scripts": { 29 | "start": "npm run start:dev", 30 | "start:prod": "node dist/main.js", 31 | "start:dev": "tsx src/main.ts", 32 | "build": "tsc", 33 | "lint": "eslint ./src --ext .ts", 34 | "lint:fix": "eslint ./src --ext .ts --fix", 35 | "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1", 36 | "postinstall": "npx crawlee install-playwright-browsers" 37 | }, 38 | "author": "It's not you it's me", 39 | "license": "ISC" 40 | } 41 | -------------------------------------------------------------------------------- /src/const.ts: -------------------------------------------------------------------------------- 1 | export enum Label { 2 | BROWSER = 'browser', 3 | HTTP = 'http', 4 | BINARY_TARGET = 'binary-target', 5 | } 6 | 7 | export const VALID_RESOURCES = [ 8 | 'document', 9 | 'stylesheet', 10 | 'image', 11 | 'media', 12 | 'font', 13 | 'script', 14 | 'texttrack', 15 | 'xhr', 16 | 'fetch', 17 | 'eventsource', 18 | 'websocket', 19 | 'manifest', 20 | 'other', 21 | ]; 22 | -------------------------------------------------------------------------------- /src/crawlers.ts: -------------------------------------------------------------------------------- 1 | import { Actor, RequestQueue, log } from 'apify'; 2 | import { 
PlaywrightCrawler } from 'crawlee'; 3 | import type { PlaywrightCrawlingContext, RequestOptions, AutoscaledPoolOptions } from 'crawlee'; 4 | import { MemoryStorage } from '@crawlee/memory-storage'; 5 | import { ServerResponse } from 'http'; 6 | import { TimeMeasure, UserData, VerboseResult, CrawlerOptions } from './types.js'; 7 | import { addResponse, sendErrorResponseById } from './responses.js'; 8 | import { router } from './router.js'; 9 | import { pushLogData } from './utils.js'; 10 | import { Label } from './const.js'; 11 | 12 | const crawlers = new Map(); 13 | 14 | export const DEFAULT_CRAWLER_OPTIONS: CrawlerOptions = { 15 | proxyConfigurationOptions: {}, 16 | }; 17 | 18 | export const createAndStartCrawler = async (crawlerOptions: CrawlerOptions = DEFAULT_CRAWLER_OPTIONS) => { 19 | const client = new MemoryStorage(); 20 | const queue = await RequestQueue.open(undefined, { storageClient: client }); 21 | 22 | const proxyConfig = await Actor.createProxyConfiguration(crawlerOptions.proxyConfigurationOptions); 23 | 24 | const crawler = new PlaywrightCrawler({ 25 | keepAlive: true, 26 | proxyConfiguration: proxyConfig, 27 | maxRequestRetries: 4, 28 | requestQueue: queue, 29 | launchContext: { 30 | browserPerProxy: false, 31 | }, 32 | statisticsOptions: { 33 | persistenceOptions: { 34 | enable: false, 35 | }, 36 | }, 37 | requestHandlerTimeoutSecs: 3600, 38 | sessionPoolOptions: { 39 | persistenceOptions: { 40 | enable: false, 41 | }, 42 | }, 43 | errorHandler: async ({ request }, err) => { 44 | const { requestDetails, timeMeasures, transparentStatusCode } = request.userData as UserData; 45 | timeMeasures.push({ 46 | event: 'error', 47 | time: Date.now(), 48 | }); 49 | 50 | requestDetails.requestErrors.push({ 51 | attempt: request.retryCount + 1, 52 | errorMessage: err.message, 53 | }); 54 | 55 | if (transparentStatusCode) { 56 | request.noRetry = true; 57 | } 58 | }, 59 | failedRequestHandler: async ({ request, response, page }, err) => { 60 | const { 61 | requestDetails, 62 | jsonResponse, 63 | inputtedUrl, 64 | parsedInputtedParams, 65 | timeMeasures, 66 | transparentStatusCode, 67 | nonbrowserRequestStatus, 68 | } = request.userData as UserData; 69 | 70 | requestDetails.requestErrors.push({ 71 | attempt: request.retryCount + 1, 72 | errorMessage: err.message, 73 | }); 74 | 75 | const errorResponse = { 76 | errorMessage: err.message, 77 | }; 78 | 79 | const responseStatusCode = request.skipNavigation ? nonbrowserRequestStatus! 
: (response?.status() || null); 80 | let statusCode = 500; 81 | if (transparentStatusCode && responseStatusCode) { 82 | statusCode = responseStatusCode; 83 | } 84 | if (jsonResponse) { 85 | const verboseResponse: VerboseResult = { 86 | body: errorResponse, 87 | cookies: await page.context().cookies(request.url) || [], 88 | evaluateResults: [], 89 | jsScenarioReport: {}, 90 | headers: requestDetails.responseHeaders || {}, 91 | type: 'json', 92 | iframes: [], 93 | xhr: [], 94 | initialStatusCode: responseStatusCode, 95 | resolvedUrl: '', 96 | screenshot: null, 97 | }; 98 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors }, true); 99 | sendErrorResponseById(request.uniqueKey, JSON.stringify(verboseResponse), statusCode); 100 | } else { 101 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: errorResponse, errors: requestDetails.requestErrors }, true); 102 | sendErrorResponseById(request.uniqueKey, JSON.stringify(errorResponse), statusCode); 103 | } 104 | }, 105 | preNavigationHooks: [ 106 | async ({ request, page, blockRequests }) => { 107 | const { timeMeasures, blockResources, width, height, blockResourceTypes, jsonResponse, requestDetails } = request.userData as UserData; 108 | timeMeasures.push({ 109 | event: 'pre-navigation hook', 110 | time: Date.now(), 111 | }); 112 | 113 | await page.setViewportSize({ width, height }); 114 | 115 | if (request.label === Label.BROWSER && blockResources) { 116 | await blockRequests({ 117 | extraUrlPatterns: ['*.svg'], 118 | }); 119 | } 120 | 121 | if (request.label === Label.BROWSER && blockResourceTypes.length) { 122 | await page.route('**', async (route) => { 123 | if (blockResourceTypes.includes(route.request().resourceType())) { 124 | await route.abort(); 125 | } 126 | }); 127 | } 128 | 129 | if (request.label === Label.BROWSER && jsonResponse) { 130 | page.on('response', async (resp) => { 131 | try { 132 | const req = resp.request(); 133 | if (req.resourceType() !== 'xhr') { 134 | return; 135 | } 136 | 137 | requestDetails.xhr.push({ 138 | url: req.url(), 139 | statusCode: resp.status(), 140 | method: req.method(), 141 | requestHeaders: req.headers(), 142 | headers: resp.headers(), 143 | body: (await resp.body()).toString(), 144 | }); 145 | } catch (e) { 146 | log.warning((e as Error).message); 147 | } 148 | }); 149 | } 150 | }, 151 | ], 152 | requestHandler: router, 153 | }); 154 | 155 | // TODO: This is just for Crawlee perf measurement, remove it once we properly understand the bottlenecks 156 | // @ts-expect-error Overriding internal method 157 | const origRunTaskFunction = crawler.autoscaledPoolOptions.runTaskFunction.bind(crawler); 158 | // @ts-expect-error Overriding internal method 159 | crawler.autoscaledPoolOptions.runTaskFunction = async function () { 160 | // This code runs before we pull request from queue so we have to approximate that by having mutable global 161 | // It will ofc be wrong if someone bombs requests with interval shorter than 1 sec 162 | (global as unknown as { latestRequestTaskTimeMeasure: TimeMeasure }).latestRequestTaskTimeMeasure = { 163 | event: 'crawlee internal run task', 164 | time: Date.now(), 165 | }; 166 | await (origRunTaskFunction as AutoscaledPoolOptions['runTaskFunction'])!(); 167 | }; 168 | 169 | // @ts-expect-error Overriding internal method 170 | const origRunRequestHandler = crawler._runRequestHandler.bind(crawler); 171 | // @ts-expect-error Overriding internal method 172 | 
crawler._runRequestHandler = async function (context: PlaywrightCrawlingContext) { 173 | context.request.userData.timeMeasures.push({ 174 | event: 'crawlee internal request handler', 175 | time: Date.now(), 176 | }); 177 | await origRunRequestHandler(context); 178 | }; 179 | 180 | await crawler.stats.stopCapturing(); 181 | crawler.run().then(() => log.warning(`Crawler ended`, crawlerOptions), () => { }); 182 | crawlers.set(JSON.stringify(crawlerOptions), crawler); 183 | log.info('Crawler ready 🫡', crawlerOptions); 184 | return crawler; 185 | }; 186 | 187 | export const addRequest = async (request: RequestOptions, res: ServerResponse, crawlerOptions: CrawlerOptions) => { 188 | const key = JSON.stringify(crawlerOptions); 189 | const crawler = crawlers.has(key) ? crawlers.get(key)! : await createAndStartCrawler(crawlerOptions); 190 | 191 | addResponse(request.uniqueKey!, res); 192 | 193 | request.userData?.timeMeasures.push({ 194 | event: 'before queue add', 195 | time: Date.now(), 196 | }); 197 | await crawler.requestQueue!.addRequest(request); 198 | }; 199 | -------------------------------------------------------------------------------- /src/errors.ts: -------------------------------------------------------------------------------- 1 | export class UserInputError extends Error { 2 | constructor(message: string) { 3 | super(message); 4 | this.name = 'UserInputError'; 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /src/extract_rules_utils.ts: -------------------------------------------------------------------------------- 1 | import { AnyNode, Cheerio, CheerioAPI, load } from 'cheerio'; 2 | import { ExtractRule, ExtractRules } from './types.js'; 3 | import { UserInputError } from './errors.js'; 4 | 5 | // validation and transformation to full Extract Rules (i.e. 
including all parameters, not the shortened version, for easier scraping process) 6 | 7 | function validateAndTransformFullOptionsRule(key: string, inputtedExtractRule: Record): ExtractRule { 8 | const { selector, output = 'text', type = 'item', clean = true } = inputtedExtractRule; 9 | 10 | if (!selector || typeof selector !== 'string' || !selector.length) { 11 | throw new UserInputError(`Selector must be a non-empty string, rule for key: ${key}`); 12 | } 13 | 14 | if (typeof type !== 'string' || (type !== 'item' && type !== 'list')) { 15 | throw new UserInputError(`Type can be either 'item' or 'list', rule for a key: ${key}`); 16 | } 17 | 18 | if (typeof clean !== 'boolean') { 19 | throw new UserInputError('Clean can be set either to true or false'); 20 | } 21 | 22 | if (typeof output === 'string') { 23 | const availableTypes = ['text', 'html', 'table_json', 'table_array']; 24 | const trimmed = (output as string).trim(); 25 | if (availableTypes.includes(trimmed) || trimmed.startsWith('@')) { 26 | return { 27 | selector, 28 | type, 29 | output: trimmed, 30 | clean, 31 | }; 32 | } 33 | 34 | throw new UserInputError( 35 | `Result in the extract rule for ${key} has invalid value, expected one of ${JSON.stringify(availableTypes)} or an attribute name starting with '@'`, 36 | ); 37 | } 38 | 39 | if (typeof output === 'object') { 40 | const nestedRules = validateAndTransformExtractRules(output as Record); 41 | return { 42 | selector, 43 | type, 44 | output: nestedRules, 45 | clean, 46 | }; 47 | } 48 | 49 | throw new UserInputError(`Output in the extract rule for ${key} in a wrong format, expected object or a string`); 50 | } 51 | 52 | function validateAndTransformShortenedRule(key: string, inputtedRule: string): ExtractRule { 53 | const trimmedRule = inputtedRule.trim(); 54 | 55 | if (trimmedRule.includes('@')) { 56 | const selector = trimmedRule.split('@').shift() as string; 57 | if (!selector.length) { 58 | throw new UserInputError(`Selector cannot be an empty string, rule: ${trimmedRule} for key ${key}`); 59 | } 60 | 61 | const attributeName = trimmedRule.slice(selector.length); 62 | if (!attributeName.length) { 63 | throw new UserInputError(`Attribute name cannot be an empty string, rule: ${trimmedRule} for key ${key}`); 64 | } 65 | 66 | return { 67 | selector, 68 | type: 'item', 69 | output: attributeName, 70 | clean: true, 71 | }; 72 | } 73 | 74 | return { 75 | selector: trimmedRule, 76 | type: 'item', 77 | output: 'text', 78 | clean: true, 79 | }; 80 | } 81 | 82 | export function validateAndTransformExtractRules(inputtedExtractRules: Record): ExtractRules { 83 | const extractRules: ExtractRules = {}; 84 | 85 | for (const entry of Object.entries(inputtedExtractRules)) { 86 | const key = entry[0]; 87 | const keyValue = entry[1]; 88 | if (typeof keyValue === 'object') { 89 | extractRules[key] = validateAndTransformFullOptionsRule(key, keyValue as Record); 90 | } else if (typeof keyValue === 'string') { 91 | extractRules[key] = validateAndTransformShortenedRule(key, keyValue); 92 | } else { 93 | throw new UserInputError(`Extract rule for ${key} in a wrong format, expected object or a string`); 94 | } 95 | } 96 | 97 | return extractRules; 98 | } 99 | 100 | // scraping based on full Extract Rules 101 | 102 | function scrapeTable(item: Cheerio) { 103 | const $ = load(item.html() || ''); 104 | const headings: string[] = []; 105 | item.find('tr').has('th').eq(0).find('th') 106 | .each((_, el) => { 107 | headings.push($(el).text().trim()); 108 | }); 109 | if (!headings.length) { 110 | return []; 
111 | } 112 | 113 | const data: Record[] = []; 114 | item.find('tr').has('td').each((_, el) => { 115 | const rowData: Record = {}; 116 | const tdElements = $(el).find('td'); 117 | for (let i = 0; i < headings.length; i++) { 118 | const val = tdElements.eq(i).text().trim(); 119 | rowData[headings[i]] = val; 120 | } 121 | data.push(rowData); 122 | }); 123 | return data; 124 | } 125 | 126 | function scrapeItems(item: Cheerio, output: string | ExtractRules, clean: boolean) { 127 | if (output === 'text') { 128 | if (clean) { 129 | return item.text().trim() || null; 130 | } 131 | return item.text() || ''; 132 | } 133 | 134 | if (output === 'html') { 135 | // we do this so the HTML od the whole element returns, not just its inner HTML 136 | const $ = load(''); 137 | const newHtmlWithItem = $('body').append(item); 138 | return newHtmlWithItem.html() || ''; 139 | } 140 | 141 | if (output === 'table_json' || output === 'table_array') { 142 | const data = scrapeTable(item); 143 | if (output === 'table_json') { 144 | return data; 145 | } 146 | return data.map((row) => Object.values(row)); 147 | } 148 | 149 | if (typeof output === 'string' && output.startsWith('@')) { 150 | return item.attr(output.slice(1)) || ''; 151 | } 152 | 153 | if (typeof output === 'object') { 154 | /* 155 | This is here to have an option to work with already selected element(s). Scraping bee 156 | does it like this, we could replace it with something like '.' to refer the element itself. 157 | Example why this is needed: 158 | { 159 | allLinks: { 160 | type: 'list', 161 | selector: 'a', <--- selects all 'a' elements 162 | result: { 163 | linkTitle: 'a', <--- refers to each 'a' element that were selected before (in the level above) 164 | link: 'a@href' <--- refers to each 'a' element that were selected before (in the level above) 165 | } 166 | } 167 | } 168 | */ 169 | const $ = load(''); 170 | const newHtmlWithItem = $('body').append(item); 171 | return scrapeExtractRules(newHtmlWithItem, output); 172 | } 173 | throw new UserInputError('Invalid output value'); 174 | } 175 | 176 | function scrapeExtractRules($: Cheerio, extractRules: ExtractRules) { 177 | const scrapedData: Record = {}; 178 | 179 | for (const entries of Object.entries(extractRules)) { 180 | const key = entries[0]; 181 | const rule = entries[1]; 182 | 183 | const { selector, type, output, clean } = rule; 184 | 185 | const itemsFoundBySelector = $.find(selector); 186 | if (type === 'item') { 187 | scrapedData[key] = scrapeItems(itemsFoundBySelector.eq(0), output, clean); 188 | } else { 189 | const resultList: unknown[] = []; 190 | itemsFoundBySelector.each((i) => { 191 | resultList.push(scrapeItems(itemsFoundBySelector.eq(i), output, clean)); 192 | }).get(); 193 | scrapedData[key] = resultList; 194 | } 195 | } 196 | return scrapedData; 197 | } 198 | 199 | export function scrapeBasedOnExtractRules($: CheerioAPI, extractRules: ExtractRules) { 200 | const html = $('html'); 201 | return scrapeExtractRules(html, extractRules); 202 | } 203 | -------------------------------------------------------------------------------- /src/instructions_utils.ts: -------------------------------------------------------------------------------- 1 | import { Page } from 'playwright'; 2 | import { sleep } from 'crawlee'; 3 | import { Action, FullJsScenarioReport, IndividualInstructionReport, Instruction, JsScenario } from './types.js'; 4 | import { UserInputError } from './errors.js'; 5 | 6 | export const parseAndValidateInstructions = (rawInput: string): JsScenario => { 7 | const input = 
JSON.parse(rawInput); 8 | 9 | let strictMode = true; 10 | if (input.strict !== undefined) { 11 | if (typeof input.strict !== 'boolean') { 12 | throw new UserInputError('Parameter strict in js_scenario can be only true or false'); 13 | } 14 | strictMode = input.strict; 15 | } 16 | 17 | if (!input.instructions || !Array.isArray(input.instructions)) { 18 | return { 19 | strict: strictMode, 20 | instructions: [], 21 | }; 22 | } 23 | 24 | const instructions = input.instructions as Record[]; 25 | const parsedInstructions: Instruction[] = []; 26 | for (const instruction of instructions) { 27 | if (typeof instruction !== 'object') { 28 | throw new UserInputError('Instruction must be an object'); 29 | } 30 | if (Object.keys(instruction).length !== 1) { 31 | throw new UserInputError('Instruction must include only one action with params'); 32 | } 33 | const action = Object.keys(instruction)[0]; 34 | const param = instruction[action]; 35 | 36 | const possibleActions = ['wait', 'wait_for', 'click', 'scroll_x', 'scroll_y', 'fill', 'evaluate', 'wait_for_and_click']; // todo 37 | if (typeof action !== 'string' || !possibleActions.includes(action)) { 38 | throw new UserInputError(`Unsupported instruction: ${action}`); 39 | } 40 | 41 | if (typeof param !== 'string' && typeof param !== 'number' && !Array.isArray(param)) { 42 | throw new UserInputError(`Unsupported params: ${action}, can be either number, string, or an array of strings`); 43 | } 44 | 45 | if (action === 'wait_for_and_click') { 46 | parsedInstructions.push({ action: 'wait_for', param }); 47 | parsedInstructions.push({ action: 'click', param }); 48 | continue; 49 | } 50 | 51 | parsedInstructions.push({ action: action as Action, param }); 52 | } 53 | 54 | return { 55 | instructions: parsedInstructions, 56 | strict: strictMode, 57 | }; 58 | }; 59 | 60 | const performInstruction = async (instruction: Instruction, page: Page): Promise<{ success: boolean, errorMessage?: string | undefined; result?: string; }> => { 61 | try { 62 | let result; 63 | switch (instruction.action) { 64 | case 'wait': { 65 | await sleep(instruction.param as number); 66 | break; 67 | } 68 | case 'click': { 69 | await page.click(instruction.param as string, { timeout: 5000 }); 70 | break; 71 | } 72 | case 'wait_for': { 73 | await page.waitForSelector(instruction.param as string); 74 | break; 75 | } 76 | case 'fill': { 77 | const params = instruction.param as string[]; 78 | await page.fill(params[0], params[1]); 79 | break; 80 | } 81 | case 'scroll_x': { 82 | const paramX = instruction.param as number; 83 | await page.mouse.wheel(paramX, 0); 84 | break; 85 | } 86 | case 'scroll_y': { 87 | const paramY = instruction.param as number; 88 | await page.mouse.wheel(0, paramY); 89 | break; 90 | } 91 | case 'wait_browser': { 92 | await page.waitForLoadState(instruction.param as 'load' | 'domcontentloaded' | 'networkidle'); 93 | break; 94 | } 95 | case 'evaluate': { 96 | const evaluateResult = await page.evaluate(instruction.param as string); 97 | if (['boolean', 'number', 'string'].includes(typeof evaluateResult)) { 98 | result = String(evaluateResult); 99 | } else if (typeof evaluateResult === 'object') { 100 | result = JSON.stringify(evaluateResult); 101 | } 102 | break; 103 | } 104 | default: { 105 | return { success: false, errorMessage: 'unknown instruction' }; 106 | } 107 | } 108 | return { success: true, result }; 109 | } catch (e) { 110 | return { success: false, errorMessage: (e as Error).message }; 111 | } 112 | }; 113 | 114 | export const 
performInstructionsAndGenerateReport = async (jsScenario: JsScenario, page: Page): Promise => { 115 | const { strict, instructions } = jsScenario; 116 | 117 | let executed: number = 0; 118 | let success: number = 0; 119 | let failed: number = 0; 120 | const reports: IndividualInstructionReport[] = []; 121 | const evaluateResults: string[] = []; 122 | const start = Date.now(); 123 | 124 | for (const instruction of instructions) { 125 | const instructionStart = Date.now(); 126 | const instructionResult = await performInstruction(instruction, page); 127 | const instructionDuration = (Date.now() - instructionStart) / 1000; 128 | 129 | executed += 1; 130 | if (instructionResult.success) { 131 | success += 1; 132 | if (instruction.action === 'evaluate' && instructionResult.result) { 133 | evaluateResults.push(instructionResult.result); 134 | } 135 | } else { 136 | failed += 1; 137 | } 138 | 139 | reports.push({ 140 | task: instruction.action, 141 | params: instruction.param, 142 | duration: instructionDuration, 143 | success: instructionResult.success, 144 | }); 145 | 146 | if (strict && !instructionResult.success) { 147 | break; 148 | } 149 | } 150 | const totalDuration = (Date.now() - start) / 1000; 151 | return { 152 | jsScenarioReport: { 153 | totalDuration, 154 | taskExecuted: executed, 155 | taskSuccess: success, 156 | taskFailure: failed, 157 | tasks: reports, 158 | }, 159 | evaluateResults, 160 | }; 161 | }; 162 | -------------------------------------------------------------------------------- /src/main.ts: -------------------------------------------------------------------------------- 1 | import { Actor, log } from 'apify'; 2 | import { createServer } from 'http'; 3 | import { CrawlerOptions } from './types.js'; 4 | import { addRequest, createAndStartCrawler, DEFAULT_CRAWLER_OPTIONS } from './crawlers.js'; 5 | import { addTimeoutToAllResponses, sendErrorResponseById } from './responses.js'; 6 | import { ScrapingBee } from './params.js'; 7 | import { createProxyOptions, createRequestForCrawler, parseParameters } from './utils.js'; 8 | import { UserInputError } from './errors.js'; 9 | 10 | await Actor.init(); 11 | 12 | if (Actor.isAtHome() && Actor.getEnv().metaOrigin !== 'STANDBY') { 13 | await Actor.fail('The Actor must start by being called using its Standby endpoint.'); 14 | } 15 | 16 | Actor.on('migrating', () => { 17 | addTimeoutToAllResponses(60); 18 | }); 19 | 20 | const server = createServer(async (req, res) => { 21 | const requestReceivedTime = Date.now(); 22 | if (req.method !== 'HEAD') { 23 | log.info(`Request received: ${req.method} ${req.url}`); 24 | } 25 | try { 26 | const params = parseParameters(req.url!); 27 | const crawlerRequest = createRequestForCrawler(params, req); 28 | crawlerRequest.userData?.timeMeasures.push({ 29 | event: 'request received', 30 | time: requestReceivedTime, 31 | }); 32 | 33 | let timeout = 140000; 34 | if (params[ScrapingBee.timeout]) { 35 | const timeoutNumber = Number.parseInt(params[ScrapingBee.timeout] as string, 10); 36 | if (Number.isNaN(timeoutNumber)) { 37 | throw new UserInputError('Parameter timeout must be a number'); 38 | } 39 | if (timeoutNumber < 1000 || timeoutNumber > 3600000) { 40 | throw new UserInputError('Parameter timeout must be between 1000 and 3600000 ms (1 hour)'); 41 | } 42 | timeout = timeoutNumber; 43 | } 44 | 45 | setTimeout(() => { 46 | const timeoutErrorMessage = { 47 | errorMessage: `Response timed out.`, 48 | }; 49 | sendErrorResponseById(crawlerRequest.uniqueKey!, JSON.stringify(timeoutErrorMessage)); 50 | }, 
timeout); 51 | 52 | const crawlerOptions: CrawlerOptions = { 53 | proxyConfigurationOptions: createProxyOptions(params), 54 | }; 55 | await addRequest(crawlerRequest, res, crawlerOptions); 56 | } catch (e) { 57 | const error = e as Error; 58 | const errorMessage = { 59 | errorMessage: error.message, 60 | }; 61 | const statusCode = error instanceof UserInputError ? 400 : 500; 62 | res.writeHead(statusCode, { 'Content-Type': 'application/json' }); 63 | res.end(JSON.stringify(errorMessage)); 64 | } 65 | }); 66 | 67 | const port = Actor.isAtHome() ? process.env.ACTOR_STANDBY_PORT : 8080; 68 | server.listen(port, async () => { 69 | log.info('SuperScraper is listening for user requests'); 70 | 71 | // Pre-create common crawlers because crawler init can take about 1 sec 72 | await Promise.all([ 73 | createAndStartCrawler(DEFAULT_CRAWLER_OPTIONS), 74 | createAndStartCrawler({ ...DEFAULT_CRAWLER_OPTIONS, proxyConfigurationOptions: { groups: ['RESIDENTIAL'] } }), 75 | ]); 76 | }); 77 | -------------------------------------------------------------------------------- /src/params.ts: -------------------------------------------------------------------------------- 1 | export enum ScrapingBee { 2 | // skipped for now: session_id, block_ads 3 | url = 'url', 4 | extractRules = 'extract_rules', 5 | device = 'device', 6 | jsScenario = 'js_scenario', 7 | renderJs = 'render_js', 8 | wait = 'wait', 9 | waitFor = 'wait_for', 10 | waitBrowser = 'wait_browser', 11 | screenshot = 'screenshot', 12 | screenshotFullPage = 'screenshot_full_page', 13 | screenshotSelector = 'screenshot_selector', 14 | windowWidth = 'window_width', 15 | windowHeight = 'window_height', 16 | returnPageSource = 'return_page_source', 17 | transparentStatusCode = 'transparent_status_code', 18 | forwardHeaders = 'forward_headers', 19 | forwardHeadersPure = 'forward_headers_pure', 20 | cookies = 'cookies', 21 | timeout = 'timeout', 22 | customGoogle = 'custom_google', 23 | ownProxy = 'own_proxy', 24 | premiumProxy = 'premium_proxy', 25 | stealthProxy = 'stealth_proxy', 26 | countryCode = 'country_code', 27 | jsonResponse = 'json_response', 28 | blockResources = 'block_resources' 29 | } 30 | 31 | export enum ScrapingAnt { 32 | // we already have: url, return_page_source, cookies 33 | browser = 'browser', 34 | jsSnippet = 'js_snippet', 35 | proxyType = 'proxy_type', 36 | waitForSelector = 'wait_for_selector', 37 | blockResource = 'block_resource', 38 | proxyCountry = 'proxy_country', 39 | } 40 | 41 | export enum ScraperApi { 42 | // we already have: wait_for_selector, country_code 43 | // skipped for now: session_number, autoparse 44 | render = 'render', 45 | premium = 'premium', 46 | binaryTarget = 'binary_target', 47 | keepHeaders = 'keep_headers', 48 | deviceType = 'device_type', 49 | ultraPremium = 'ultra_premium', 50 | } 51 | 52 | export const EquivalentParameters = { 53 | [ScrapingBee.device]: [ScrapingBee.device, ScraperApi.deviceType], 54 | [ScrapingBee.renderJs]: [ScrapingAnt.browser, ScraperApi.render], 55 | [ScrapingBee.waitFor]: [ScrapingAnt.waitForSelector], 56 | [ScrapingBee.premiumProxy]: [ScrapingBee.stealthProxy, ScraperApi.premium, ScraperApi.ultraPremium], 57 | [ScrapingBee.countryCode]: [ScrapingAnt.proxyCountry], 58 | }; 59 | -------------------------------------------------------------------------------- /src/responses.ts: -------------------------------------------------------------------------------- 1 | import { log } from 'apify'; 2 | import { ServerResponse } from 'http'; 3 | 4 | const responses = new Map(); 5 | 6 | 
export const sendSuccResponseById = (responseId: string, result: unknown, contentType: string) => { 7 | const res = responses.get(responseId); 8 | if (!res) { 9 | log.info(`Response for request ${responseId} not found`); 10 | return; 11 | } 12 | res.writeHead(200, { 'Content-Type': contentType }); 13 | res.end(result); 14 | responses.delete(responseId); 15 | }; 16 | 17 | export const sendErrorResponseById = (responseId: string, result: string, statusCode: number = 500) => { 18 | const res = responses.get(responseId); 19 | if (!res) { 20 | log.info(`Response for request ${responseId} not found`); 21 | return; 22 | } 23 | res.writeHead(statusCode, { 'Content-Type': 'application/json' }); 24 | res.end(result); 25 | responses.delete(responseId); 26 | }; 27 | 28 | export const addResponse = (responseId: string, response: ServerResponse) => { 29 | responses.set(responseId, response); 30 | }; 31 | 32 | export const addTimeoutToAllResponses = (timeoutInSeconds: number = 60) => { 33 | const migrationErrorMessage = { 34 | errorMessage: `Actor had to migrate to another server. Please, retry your request.`, 35 | }; 36 | 37 | const responseKeys = Object.keys(responses); 38 | 39 | for (const key of responseKeys) { 40 | setTimeout(() => { 41 | sendErrorResponseById(key, JSON.stringify(migrationErrorMessage)); 42 | }, timeoutInSeconds * 1000); 43 | } 44 | }; 45 | -------------------------------------------------------------------------------- /src/router.ts: -------------------------------------------------------------------------------- 1 | import { createPlaywrightRouter } from 'crawlee'; 2 | import { CheerioAPI, load } from 'cheerio'; 3 | import { Label } from './const.js'; 4 | import { FullJsScenarioReport, IFrameData, TimeMeasure, UserData, VerboseResult } from './types.js'; 5 | import { performInstructionsAndGenerateReport } from './instructions_utils.js'; 6 | import { sendSuccResponseById } from './responses.js'; 7 | import { scrapeBasedOnExtractRules } from './extract_rules_utils.js'; 8 | import { pushLogData } from './utils.js'; 9 | 10 | export const router = createPlaywrightRouter(); 11 | 12 | router.addHandler(Label.BROWSER, async ({ request, page, response, parseWithCheerio }) => { 13 | const { 14 | requestDetails, 15 | jsonResponse, 16 | extractRules, 17 | screenshotSettings, 18 | inputtedUrl, 19 | parsedInputtedParams, 20 | timeMeasures, 21 | jsScenario, 22 | returnPageSource, 23 | } = request.userData; 24 | 25 | // See comment in crawler.autoscaledPoolOptions.runTaskFunction override 26 | timeMeasures.push((global as unknown as { latestRequestTaskTimeMeasure: TimeMeasure }).latestRequestTaskTimeMeasure); 27 | 28 | const responseId = request.uniqueKey; 29 | 30 | timeMeasures.push({ 31 | event: 'page loaded', 32 | time: Date.now(), 33 | }); 34 | 35 | const jsScenarioReportFull: FullJsScenarioReport = {}; 36 | if (jsScenario.instructions.length) { 37 | const { jsScenarioReport, evaluateResults } = await performInstructionsAndGenerateReport(jsScenario, page); 38 | jsScenarioReportFull.jsScenarioReport = jsScenarioReport; 39 | jsScenarioReportFull.evaluateResults = evaluateResults; 40 | } 41 | 42 | requestDetails.resolvedUrl = response?.url() || ''; 43 | requestDetails.responseHeaders = response?.headers() || {}; 44 | const $ = await parseWithCheerio(); 45 | const statusCode = response?.status() || null; 46 | 47 | const cookies = await page.context().cookies(request.url) || []; 48 | 49 | const iframes: IFrameData[] = []; 50 | if (jsonResponse) { 51 | const frames = page.frames(); 52 | for (const 
frame of frames) { 53 | let frameEl; 54 | try { 55 | frameEl = await frame.frameElement(); 56 | } catch (e) { 57 | continue; 58 | } 59 | 60 | const src = await frameEl.getAttribute('src') || ''; 61 | const content = await frame.content(); 62 | 63 | iframes.push({ 64 | src, 65 | content, 66 | }); 67 | } 68 | } 69 | 70 | let screenshot = null; 71 | if (screenshotSettings.screenshotType !== 'none') { 72 | const { screenshotType, selector } = screenshotSettings; 73 | let screenshotBuffer: Buffer; 74 | if (screenshotType === 'full') { 75 | screenshotBuffer = await page.screenshot({ fullPage: true }); 76 | } else if (screenshotType === 'window') { 77 | screenshotBuffer = await page.screenshot(); 78 | } else { 79 | screenshotBuffer = await page.locator(selector as string).screenshot(); 80 | } 81 | screenshot = screenshotBuffer.toString('base64'); 82 | 83 | if (!jsonResponse) { 84 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: screenshot, errors: requestDetails.requestErrors }); 85 | sendSuccResponseById(responseId, screenshotBuffer, 'image/png'); 86 | return; 87 | } 88 | } 89 | 90 | if (extractRules) { 91 | const resultFromExtractRules = scrapeBasedOnExtractRules($ as CheerioAPI, extractRules); 92 | if (jsonResponse) { 93 | const verboseResponse: VerboseResult = { 94 | body: resultFromExtractRules, 95 | cookies, 96 | evaluateResults: jsScenarioReportFull.evaluateResults || [], 97 | jsScenarioReport: jsScenarioReportFull.jsScenarioReport || {}, 98 | headers: requestDetails.responseHeaders, 99 | type: 'json', 100 | iframes, 101 | xhr: requestDetails.xhr, 102 | initialStatusCode: statusCode, 103 | resolvedUrl: requestDetails.resolvedUrl, 104 | screenshot, 105 | }; 106 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors }); 107 | sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json'); 108 | } else { 109 | await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: resultFromExtractRules, errors: requestDetails.requestErrors }); 110 | sendSuccResponseById(responseId, JSON.stringify(resultFromExtractRules), 'application/json'); 111 | } 112 | return; 113 | } 114 | 115 | // response.body() contains HTML of the page before js rendering 116 | const htmlResult = returnPageSource 117 | ? 
117 |         ? (await response?.body())?.toString() as string
118 |         : $.html();
119 |
120 |     if (jsonResponse) {
121 |         const verboseResponse: VerboseResult = {
122 |             body: htmlResult,
123 |             cookies,
124 |             evaluateResults: jsScenarioReportFull.evaluateResults || [],
125 |             jsScenarioReport: jsScenarioReportFull.jsScenarioReport || {},
126 |             headers: requestDetails.responseHeaders,
127 |             type: 'html',
128 |             iframes,
129 |             xhr: requestDetails.xhr,
130 |             initialStatusCode: statusCode,
131 |             resolvedUrl: requestDetails.resolvedUrl,
132 |             screenshot,
133 |         };
134 |         await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors });
135 |         sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json');
136 |         return;
137 |     }
138 |     await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: htmlResult, errors: requestDetails.requestErrors });
139 |     sendSuccResponseById(responseId, htmlResult, 'text/html');
140 | });
141 |
142 | router.addHandler(Label.HTTP, async ({ request, sendRequest }) => {
143 |     const {
144 |         requestDetails,
145 |         jsonResponse,
146 |         extractRules,
147 |         inputtedUrl,
148 |         parsedInputtedParams,
149 |         timeMeasures,
150 |     } = request.userData as UserData;
151 |
152 |     // See comment in crawler.autoscaledPoolOptions.runTaskFunction override
153 |     timeMeasures.push((global as unknown as { latestRequestTaskTimeMeasure: TimeMeasure }).latestRequestTaskTimeMeasure);
154 |
155 |     const responseId = request.uniqueKey;
156 |
157 |     const resp = await sendRequest({
158 |         url: request.url,
159 |         throwHttpErrors: false,
160 |         headers: request.headers,
161 |     });
162 |
163 |     timeMeasures.push({
164 |         event: 'page loaded',
165 |         time: Date.now(),
166 |     });
167 |
168 |     const { statusCode } = resp;
169 |     if (resp.statusCode >= 300 && resp.statusCode !== 404) {
170 |         (request.userData as UserData).nonbrowserRequestStatus = resp.statusCode;
171 |         throw new Error(`HTTPError: Response code ${resp.statusCode}`);
172 |     }
173 |
174 |     requestDetails.resolvedUrl = resp.url;
175 |     requestDetails.responseHeaders = resp.headers as Record<string, string | string[]>;
176 |
177 |     if (extractRules) {
178 |         const $ = load(resp.body);
179 |         const resultFromExtractRules = scrapeBasedOnExtractRules($, extractRules);
180 |         if (jsonResponse) {
181 |             const verboseResponse: VerboseResult = {
182 |                 body: resultFromExtractRules,
183 |                 cookies: [],
184 |                 evaluateResults: [],
185 |                 jsScenarioReport: {},
186 |                 headers: requestDetails.responseHeaders,
187 |                 type: 'json',
188 |                 iframes: [],
189 |                 xhr: [],
190 |                 initialStatusCode: statusCode,
191 |                 resolvedUrl: requestDetails.resolvedUrl,
192 |                 screenshot: null,
193 |             };
194 |             await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors });
195 |             sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json');
196 |         } else {
197 |             await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: resultFromExtractRules, errors: requestDetails.requestErrors });
198 |             sendSuccResponseById(responseId, JSON.stringify(resultFromExtractRules), 'application/json');
199 |         }
200 |         return;
201 |     }
202 |
203 |     const htmlResult = resp.body;
204 |     if (jsonResponse) {
205 |         const verboseResponse: VerboseResult = {
206 |             body: htmlResult,
207 |             cookies: [],
208 |             evaluateResults: [],
209 |             jsScenarioReport: {},
210 |             headers: requestDetails.responseHeaders,
211 |             type: 'html',
212 |             iframes: [],
213 |             xhr: [],
214 |             initialStatusCode: statusCode,
215 |             resolvedUrl: requestDetails.resolvedUrl,
216 |             screenshot: null,
217 |         };
218 |         await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors });
219 |         sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json');
220 |         return;
221 |     }
222 |     await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: htmlResult, errors: requestDetails.requestErrors });
223 |     sendSuccResponseById(responseId, htmlResult, 'text/html');
224 | });
225 |
226 | router.addHandler(Label.BINARY_TARGET, async ({ request, sendRequest }) => {
227 |     const {
228 |         requestDetails,
229 |         jsonResponse,
230 |         inputtedUrl,
231 |         parsedInputtedParams,
232 |         timeMeasures,
233 |     } = request.userData as UserData;
234 |
235 |     // See comment in crawler.autoscaledPoolOptions.runTaskFunction override
236 |     timeMeasures.push((global as unknown as { latestRequestTaskTimeMeasure: TimeMeasure }).latestRequestTaskTimeMeasure);
237 |
238 |     const responseId = request.uniqueKey;
239 |
240 |     const resp = await sendRequest({
241 |         url: request.url,
242 |         throwHttpErrors: false,
243 |         headers: request.headers,
244 |     });
245 |
246 |     timeMeasures.push({
247 |         event: 'page loaded',
248 |         time: Date.now(),
249 |     });
250 |
251 |     const { statusCode } = resp;
252 |     if (resp.statusCode >= 300 && resp.statusCode !== 404) {
253 |         (request.userData as UserData).nonbrowserRequestStatus = resp.statusCode;
254 |         throw new Error(`HTTPError: Response code ${resp.statusCode}`);
255 |     }
256 |
257 |     requestDetails.resolvedUrl = resp.url;
258 |     requestDetails.responseHeaders = resp.headers as Record<string, string | string[]>;
259 |     const result = resp.rawBody;
260 |     const contentType = resp.headers['content-type'];
261 |     if (!contentType) {
262 |         throw new Error(`No content-type returned in the response`);
263 |     }
264 |
265 |     if (jsonResponse) {
266 |         const verboseResponse: VerboseResult = {
267 |             body: result.toString(),
268 |             cookies: [],
269 |             evaluateResults: [],
270 |             jsScenarioReport: {},
271 |             headers: requestDetails.responseHeaders,
272 |             type: 'file',
273 |             iframes: [],
274 |             xhr: [],
275 |             initialStatusCode: statusCode,
276 |             resolvedUrl: requestDetails.resolvedUrl,
277 |             screenshot: null,
278 |         };
279 |         await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result: verboseResponse, errors: requestDetails.requestErrors });
280 |         sendSuccResponseById(responseId, JSON.stringify(verboseResponse), 'application/json');
281 |         return;
282 |     }
283 |
284 |     await pushLogData(timeMeasures, { inputtedUrl, parsedInputtedParams, result, errors: requestDetails.requestErrors });
285 |     sendSuccResponseById(responseId, result, contentType);
286 | });
287 |
--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
 1 | import type { ProxyConfigurationOptions } from 'apify';
 2 | import { Cookie } from 'crawlee';
 3 |
 4 | export interface RequestDetails {
 5 |     requestErrors: { attempt: number, errorMessage: string }[],
 6 |     resolvedUrl: string | null,
 7 |     responseHeaders: Record<string, string | string[]> | null,
 8 |     xhr: XHRRequestData[],
 9 | }
10 |
11 | export interface XHRRequestData {
12 |     url: string,
13 |     statusCode: number,
14 |     method: string,
15 |     requestHeaders: Record<string, string>,
16 |     headers: Record<string, string>,
17 |     body: string,
18 | }
19 |
20 | export interface IFrameData {
21 |     src: string,
22 |     content: string,
23 | }
24 |
25 | export interface VerboseResult {
26 |     body: string | Record<string, unknown>,
27 |     cookies: Cookie[],
28 |     evaluateResults: string[],
29 |     jsScenarioReport: JsScenarioReport | Record<string, never>,
30 |     headers: Record<string, string | string[]>,
31 |     type: 'html' | 'json' | 'file',
32 |     screenshot: string | null,
33 |     iframes: IFrameData[],
34 |     xhr: XHRRequestData[],
35 |     initialStatusCode: number | null,
36 |     resolvedUrl: string,
37 |     metadata?: string,
38 | }
39 |
40 | export interface ExtractRule {
41 |     selector: string,
42 |     type: 'list' | 'item',
43 |     output: string | Record<string, ExtractRule>,
44 |     clean: boolean,
45 | }
46 |
47 | export type ExtractRules = Record<string, ExtractRule>;
48 |
49 | export interface TimeMeasure {
50 |     event: 'request received' | 'before queue add' | 'crawlee internal run task' | 'crawlee internal request handler' | 'pre-navigation hook' |
51 |         'page loaded' | 'handler end' | 'error' | 'failed request',
52 |     time: number,
53 | }
54 |
55 | export type Action = 'wait' | 'wait_for' | 'click' | 'scroll_x' | 'scroll_y' | 'fill' | 'wait_browser' | 'evaluate';
56 | type ActionParam = number | string | string[];
57 |
58 | export interface Instruction {
59 |     action: Action,
60 |     param: ActionParam,
61 | }
62 |
63 | export interface JsScenario {
64 |     instructions: Instruction[],
65 |     strict: boolean,
66 | }
67 |
68 | export interface IndividualInstructionReport {
69 |     task: Action,
70 |     params: ActionParam,
71 |     success: boolean,
72 |     duration: number,
73 | }
74 |
75 | export interface JsScenarioReport {
76 |     tasks: IndividualInstructionReport[],
77 |     taskExecuted: number,
78 |     taskSuccess: number,
79 |     taskFailure: number,
80 |     totalDuration: number,
81 | }
82 |
83 | export interface FullJsScenarioReport {
84 |     evaluateResults?: string[],
85 |     jsScenarioReport?: JsScenarioReport,
86 | }
87 |
88 | export interface ScreenshotSettings {
89 |     screenshotType: 'none' | 'window' | 'full' | 'selector',
90 |     selector?: string,
91 | }
92 |
93 | export interface UserData {
94 |     jsonResponse: boolean,
95 |     screenshotSettings: ScreenshotSettings,
96 |     requestDetails: RequestDetails,
97 |     extractRules: ExtractRules | null,
98 |     inputtedUrl: string,
99 |     parsedInputtedParams: Record<string, unknown>,
100 |     timeMeasures: TimeMeasure[],
101 |     jsScenario: JsScenario,
102 |     blockResources: boolean,
103 |     blockResourceTypes: string[],
104 |     height: number,
105 |     width: number,
106 |     returnPageSource: boolean,
107 |     transparentStatusCode: boolean,
108 |     nonbrowserRequestStatus?: number,
109 |     binaryTarget: boolean,
110 | }
111 |
112 | export interface CrawlerOptions {
113 |     proxyConfigurationOptions: ProxyConfigurationOptions;
114 | }
115 |
--------------------------------------------------------------------------------
/src/utils.ts:
--------------------------------------------------------------------------------
 1 | import type { ParsedUrlQuery } from 'querystring';
 2 | import { parse } from 'querystring';
 3 | import type { IncomingMessage } from 'http';
 4 | import { RequestOptions } from 'crawlee';
 5 | import { v4 as uuidv4 } from 'uuid';
 6 | import { HeaderGenerator } from 'header-generator';
 7 | import { Actor, ProxyConfigurationOptions, log } from 'apify';
 8 | import { TimeMeasure, JsScenario, RequestDetails, ScreenshotSettings, UserData } from './types.js';
 9 | import { EquivalentParameters, ScrapingBee, ScraperApi, ScrapingAnt } from './params.js';
10 | import { UserInputError } from './errors.js';
11 | import { validateAndTransformExtractRules } from './extract_rules_utils.js';
12 | import { parseAndValidateInstructions } from './instructions_utils.js';
13 | import { Label, VALID_RESOURCES } from './const.js';
14 |
15 | const transformTimeMeasuresToRelative = (timeMeasures: TimeMeasure[]): TimeMeasure[] => {
16 |     const firstMeasure = timeMeasures[0].time;
17 |     return timeMeasures.map((measure) => {
18 |         return {
19 |             event: measure.event,
20 |             time: measure.time - firstMeasure,
21 |         };
22 |     }).sort((a, b) => a.time - b.time);
23 | };
24 |
25 | export async function pushLogData(timeMeasures: TimeMeasure[], data: Record<string, unknown>, failed = false) {
26 |     timeMeasures.push({
27 |         event: failed ? 'failed request' : 'handler end',
28 |         time: Date.now(),
29 |     });
30 |     const relativeMeasures = transformTimeMeasuresToRelative(timeMeasures);
31 |     log.info(`Response sent (${relativeMeasures.at(-1)?.time} ms) ${data.inputtedUrl}`, { ...relativeMeasures });
32 |     await Actor.pushData({
33 |         ...data,
34 |         measures: relativeMeasures,
35 |     });
36 | }
37 |
38 | const isValidResourceType = (resource: string) => {
39 |     return VALID_RESOURCES.includes(resource);
40 | };
41 |
42 | function mapEquivalentParams(params: ParsedUrlQuery) {
43 |     for (const [ScrapingBeeParam, EquivalentParams] of Object.entries(EquivalentParameters)) {
44 |         if (params[ScrapingBeeParam]) {
45 |             continue;
46 |         }
47 |         for (const eqParam of EquivalentParams) {
48 |             if (params[eqParam]) {
49 |                 params[ScrapingBeeParam] = params[eqParam];
50 |                 continue;
51 |             }
52 |         }
53 |     }
54 |     return params;
55 | }
56 |
57 | export function parseParameters(url: string) {
58 |     const params = parse(url.slice(2));
59 |     return mapEquivalentParams(params);
60 | }
61 |
62 | function generateHeaders(device: 'mobile' | 'desktop') {
63 |     const headerGenerator = new HeaderGenerator({
64 |         devices: [device],
65 |     });
66 |     const generatedHeaders = headerGenerator.getHeaders();
67 |     // remove 'te' header as it is causing page.goto: net::ERR_INVALID_ARGUMENT error
68 |     // eslint-disable-next-line @typescript-eslint/no-unused-vars
69 |     const { te, ...rest } = generatedHeaders;
70 |     return rest;
71 | }
72 |
73 | export function createRequestForCrawler(params: ParsedUrlQuery, req: IncomingMessage): RequestOptions {
74 |     if (!params[ScrapingBee.url] || !params[ScrapingBee.url].length) {
75 |         throw new UserInputError('Parameter url is either missing or empty');
76 |     }
77 |     const urlToScrape = params[ScrapingBee.url] as string;
78 |
79 |     const useExtractRules = !!params[ScrapingBee.extractRules]; // using !! casts non-bool to bool
80 |     let inputtedExtractRules;
81 |     if (useExtractRules) {
82 |         inputtedExtractRules = JSON.parse(params[ScrapingBee.extractRules] as string);
83 |     }
84 |
85 |     let selectedDevice: 'desktop' | 'mobile' = 'desktop';
86 |     if (params[ScrapingBee.device]) {
87 |         const device = params[ScrapingBee.device] as string;
88 |         if (device === 'mobile') {
89 |             selectedDevice = 'mobile';
90 |         }
91 |
92 |         if (device !== 'desktop' && device !== 'mobile') {
93 |             throw new UserInputError('Param device can be either desktop or mobile');
94 |         }
95 |     }
96 |
97 |     const generatedHeaders = generateHeaders(selectedDevice);
98 |
99 |     const doScenario = !!params[ScrapingBee.jsScenario];
100 |     const jsScenario: JsScenario = doScenario
101 |         ? parseAndValidateInstructions(params[ScrapingBee.jsScenario] as string)
102 |         : { instructions: [], strict: false };
103 |
104 |     const renderJs = !(params[ScrapingBee.renderJs] === 'false'
105 |         || params[ScrapingAnt.browser] === 'false'
106 |         || params[ScraperApi.render] === 'false');
107 |
108 |     if (renderJs && params[ScrapingBee.wait]) {
109 |         const parsedWait = Number.parseInt(params[ScrapingBee.wait] as string, 10);
110 |         if (Number.isNaN(parsedWait)) {
111 |             throw new UserInputError('Number value expected for wait parameter');
112 |         } else {
113 |             jsScenario.instructions.unshift({
114 |                 action: 'wait',
115 |                 param: Math.min(parsedWait, 35000),
116 |             });
117 |         }
118 |     }
119 |
120 |     if (renderJs && (params[ScrapingBee.waitFor])) {
121 |         const waitForSelector = params[ScrapingBee.waitFor];
122 |         if (typeof waitForSelector !== 'string' || !waitForSelector.length) {
123 |             throw new UserInputError('Non-empty selector expected for wait_for and wait_for_selector parameters');
124 |         } else {
125 |             jsScenario.instructions.unshift({
126 |                 action: 'wait_for',
127 |                 param: waitForSelector,
128 |             });
129 |         }
130 |     }
131 |
132 |     if (renderJs && params[ScrapingBee.waitBrowser]) {
133 |         const waitForBrowserState = params[ScrapingBee.waitBrowser] as string;
134 |         if (!['load', 'domcontentloaded', 'networkidle'].includes(waitForBrowserState)) {
135 |             throw new UserInputError('Unsupported value for wait_browser parameter');
136 |         } else {
137 |             jsScenario.instructions.unshift({
138 |                 action: 'wait_browser',
139 |                 param: waitForBrowserState,
140 |             });
141 |         }
142 |     }
143 |
144 |     if (renderJs && params[ScrapingAnt.jsSnippet]) {
145 |         const jsSnippetBase64 = params[ScrapingAnt.jsSnippet] as string;
146 |         if (!jsSnippetBase64.length) {
147 |             throw new UserInputError('Parameter js_snippet must be a non empty string');
148 |         }
149 |         const jsSnippet = Buffer.from(jsSnippetBase64, 'base64').toString();
150 |         if (!jsSnippet.length) {
151 |             throw new UserInputError('Decoding of js_snippet was not successful');
152 |         }
153 |         jsScenario.instructions.unshift({
154 |             action: 'evaluate',
155 |             param: jsSnippet,
156 |         });
157 |     }
158 |
159 |     const requestDetails: RequestDetails = {
160 |         requestErrors: [],
161 |         resolvedUrl: null,
162 |         responseHeaders: null,
163 |         xhr: [],
164 |     };
165 |
166 |     const screenshotSettings: ScreenshotSettings = {
167 |         screenshotType: 'none',
168 |     };
169 |     if (params[ScrapingBee.screenshot] === 'true') {
170 |         screenshotSettings.screenshotType = 'window';
171 |     }
172 |     if (params[ScrapingBee.screenshotFullPage] === 'true') {
173 |         screenshotSettings.screenshotType = 'full';
174 |     }
175 |     if (params[ScrapingBee.screenshotSelector]) {
176 |         if (typeof params[ScrapingBee.screenshotSelector] !== 'string') {
177 |             throw new UserInputError('Parameter screenshot_selector must be a string');
178 |         }
179 |         screenshotSettings.screenshotType = 'selector';
180 |         screenshotSettings.selector = params[ScrapingBee.screenshotSelector];
181 |     }
182 |
183 |     let blockResourceTypes: string[] = [];
184 |     if (params[ScrapingAnt.blockResource]) {
185 |         const paramValue = params[ScrapingAnt.blockResource];
186 |         const resources = Array.isArray(paramValue) ? paramValue : [paramValue];
187 |         const resourcesToBlock = new Set<string>();
188 |         for (const resource of resources) {
189 |             if (isValidResourceType(resource)) {
190 |                 resourcesToBlock.add(resource);
191 |             } else {
192 |                 throw new UserInputError(`Unsupported value in block_resource: ${resource}`);
193 |             }
194 |         }
195 |         blockResourceTypes = Array.from(resourcesToBlock.values());
196 |     }
197 |
198 |     let binaryTarget = false;
199 |     if (params[ScraperApi.binaryTarget]) {
200 |         const binaryTargetIsTrue = params[ScraperApi.binaryTarget] === 'true';
201 |         binaryTarget = binaryTargetIsTrue;
202 |     }
203 |
204 |     const finalRequest: RequestOptions = {
205 |         url: urlToScrape,
206 |         uniqueKey: uuidv4(),
207 |         headers: {
208 |             ...generatedHeaders,
209 |         },
210 |         skipNavigation: !renderJs,
211 |         userData: {
212 |             jsonResponse: params[ScrapingBee.jsonResponse] === 'true',
213 |             screenshotSettings,
214 |             requestDetails,
215 |             extractRules: useExtractRules ? validateAndTransformExtractRules(inputtedExtractRules) : null,
216 |             inputtedUrl: req.url as string,
217 |             parsedInputtedParams: params,
218 |             timeMeasures: [],
219 |             jsScenario,
220 |             blockResources: !(params[ScrapingBee.blockResources] === 'false'),
221 |             width: Number.parseInt(params[ScrapingBee.windowWidth] as string, 10) || 1920,
222 |             height: Number.parseInt(params[ScrapingBee.windowHeight] as string, 10) || 1080,
223 |             returnPageSource: params[ScrapingBee.returnPageSource] === 'true',
224 |             transparentStatusCode: params[ScrapingBee.transparentStatusCode] === 'true',
225 |             blockResourceTypes,
226 |             binaryTarget,
227 |         },
228 |     };
229 |
230 |     // headers with ant/spb prefixes
231 |     if (params[ScrapingBee.forwardHeaders] === 'true' || params[ScrapingBee.forwardHeadersPure] === 'true') {
232 |         const reqHeaders = req.headers;
233 |         const headersToForward: Record<string, string> = {};
234 |         for (const headerKey of Object.keys(reqHeaders)) {
235 |             if (headerKey.startsWith('spb-') || headerKey.startsWith('ant-')) {
236 |                 const withoutPrefixKey = headerKey.slice(4);
237 |
238 |                 const skippedHeaders = ['cookie', 'set-cookie', 'host'];
239 |                 if (skippedHeaders.includes(withoutPrefixKey)) {
240 |                     continue;
241 |                 }
242 |
243 |                 // header values other than 'set-cookie' should be string (not string[]), but there's a check just in case
244 |                 const headerValue = reqHeaders[headerKey];
245 |                 if (Array.isArray(headerValue)) {
246 |                     continue;
247 |                 }
248 |                 headersToForward[withoutPrefixKey] = headerValue as string;
249 |             }
250 |         }
251 |
252 |         if (params[ScrapingBee.forwardHeaders] === 'true') {
253 |             const currentHeaders = finalRequest.headers;
254 |             finalRequest.headers = {
255 |                 ...currentHeaders,
256 |                 ...headersToForward,
257 |             };
258 |         } else {
259 |             // forward headers pure
260 |             finalRequest.headers = {
261 |                 ...headersToForward,
262 |             };
263 |         }
264 |     }
265 |
266 |     // all headers
267 |     if (params[ScraperApi.keepHeaders] === 'true') {
268 |         const reqHeaders = req.headers;
269 |         const headersToForward: Record<string, string> = {};
270 |         for (const [key, val] of Object.entries(reqHeaders)) {
271 |             if (Array.isArray(val)) {
272 |                 continue;
273 |             }
274 |             headersToForward[key] = val as string;
275 |         }
276 |         finalRequest.headers = headersToForward;
277 |     }
278 |
279 |     if (params[ScrapingBee.cookies]) {
280 |         finalRequest.headers!.Cookie = params[ScrapingBee.cookies] as string;
281 |     }
282 |
283 |     if (binaryTarget) {
284 |         finalRequest.label = Label.BINARY_TARGET;
285 |         return finalRequest;
286 |     }
287 |
288 |     finalRequest.label = renderJs ? Label.BROWSER : Label.HTTP;
289 |     return finalRequest;
290 | }
291 |
292 | export function createProxyOptions(params: ParsedUrlQuery) {
293 |     const proxyOptions: ProxyConfigurationOptions = {};
294 |
295 |     const proxyType = params[ScrapingAnt.proxyType] as string || 'datacenter';
296 |     if (proxyType !== 'datacenter' && proxyType !== 'residential') {
297 |         throw new UserInputError('Parameter proxy_type can be either residential or datacenter');
298 |     }
299 |
300 |     const useGoogleProxy = params[ScrapingBee.customGoogle] === 'true';
301 |     const url = new URL(params[ScrapingBee.url] as string);
302 |     if (url.host.includes('google') && !useGoogleProxy) {
303 |         throw new UserInputError('Set param custom_google to true to scrape Google urls');
304 |     }
305 |     if (useGoogleProxy) {
306 |         proxyOptions.groups = ['GOOGLE_SERP'];
307 |         return proxyOptions;
308 |     }
309 |
310 |     if (params[ScrapingBee.ownProxy]) {
311 |         proxyOptions.proxyUrls = [params[ScrapingBee.ownProxy] as string];
312 |         return proxyOptions;
313 |     }
314 |
315 |     const usePremium = params[ScrapingBee.premiumProxy] === 'true' || proxyType === 'residential';
316 |     if (usePremium) {
317 |         proxyOptions.groups = ['RESIDENTIAL'];
318 |     }
319 |
320 |     if (params[ScrapingBee.countryCode]) {
321 |         const countryCode = (params[ScrapingBee.countryCode] as string).toUpperCase();
322 |         if (countryCode.length !== 2) {
323 |             throw new UserInputError('Parameter for country code must be a string of length 2');
324 |         }
325 |         if (!usePremium && countryCode !== 'US') {
326 |             throw new UserInputError('Parameter for country code must be used with premium proxies when using non-US country');
327 |         }
328 |         proxyOptions.countryCode = countryCode;
329 |     }
330 |     return proxyOptions;
331 | }
332 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "extends": "@apify/tsconfig",
 3 |     "compilerOptions": {
 4 |         "module": "NodeNext",
 5 |         "moduleResolution": "NodeNext",
 6 |         "target": "ES2022",
 7 |         "outDir": "dist",
 8 |         "noUnusedLocals": false,
 9 |         "skipLibCheck": true,
10 |         "lib": ["DOM"]
11 |     },
12 |     "include": [
13 |         "./src/**/*"
14 |     ]
15 | }
16 |
--------------------------------------------------------------------------------