├── .circleci
│   └── config.yml
├── .gitignore
├── .npmignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── crawler_primary.png
├── eslint.config.js
├── package.json
├── pnpm-lock.yaml
├── src
│   ├── crawler.ts
│   ├── index.ts
│   ├── lib
│   │   ├── index.ts
│   │   ├── multiPriorityQueue.ts
│   │   ├── queue.ts
│   │   └── utils.ts
│   ├── logger.ts
│   ├── options.ts
│   ├── rateLimiter
│   │   ├── cluster.ts
│   │   ├── index.ts
│   │   └── rateLimiter.ts
│   └── types
│       ├── crawler.ts
│       └── index.ts
├── test
│   ├── binaryDataStream.test.js
│   ├── cacheOptions.js
│   ├── callback.js
│   ├── cookieJar.js
│   ├── direct.js
│   ├── encoding.js
│   ├── errorHandling.js
│   ├── examples.js
│   ├── http2ErrorHanding.js
│   ├── http2Response.js
│   ├── lib
│   │   ├── avaTestCb.js
│   │   └── iso8859.html
│   ├── limiter.js
│   ├── preRequest.js
│   ├── priority.js
│   ├── rateLimit.js
│   ├── requests.js
│   ├── urlOptions.js
│   └── userAgent.js
└── tsconfig.json
/.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Use the latest 2.1 version of CircleCI pipeline process engine. 2 | # See: https://circleci.com/docs/2.0/configuration-reference 3 | version: 2.1 4 | 5 | jobs: 6 | # Below is the definition of your job to build and test your app; you can rename and customize it as you want. 7 | build-and-test: 8 | # These next lines define a Docker executor: https://circleci.com/docs/2.0/executor-types/ 9 | # You can specify an image from Dockerhub or use one of our Convenience Images from CircleCI's Developer Hub. 10 | # A list of available CircleCI Docker Convenience Images is available here: https://circleci.com/developer/images/image/cimg/node 11 | docker: 12 | - image: cimg/node:18.20.3 13 | # Then run your tests! 14 | # CircleCI will report the results back to your VCS provider. 15 | steps: 16 | # Checkout the code as the first step. 17 | - checkout 18 | - run: | 19 | npm install --prefix=$HOME/.local -g pnpm 20 | pnpm install 21 | pnpm eslint 22 | pnpm build 23 | pnpm test 24 | workflows: 25 | # Below is the definition of your workflow. 26 | # Inside the workflow, you provide the jobs you want to run, e.g. this workflow runs the build-and-test job above. 27 | # CircleCI will run this workflow on every commit. 28 | # For more details on extending your workflow, see the configuration docs: https://circleci.com/docs/2.0/configuration-reference/#workflows 29 | sample: 30 | jobs: 31 | - build-and-test 32 | # For running simple node tests, you could optionally use the node/test job from the orb to replicate and replace the job above in fewer lines. 33 | # - node/test 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .idea 3 | coverage 4 | 5 | dist 6 | node_modules 7 | archive 8 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .circleci 2 | .vscode 3 | .idea 4 | .gitignore 5 | .npmignore 6 | coverage 7 | 8 | node_modules 9 | src 10 | test 11 | CHANGELOG.md 12 | pnpm-lock.yaml 13 | tsconfig.json 14 | eslint.config.js 15 | crawler_primary.png 16 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | node-crawler ChangeLog 2 | ------------------------- 3 | 2.0.3 4 | - Add support for legacy code written using crawler v1.
5 | 6 | 2.0.2 7 | - Add **Global Only Options : `silence`** to mute all crawler messages, including warnings and errors. 8 | - Change default error message when request failed in **Crawler.add** 9 | - Update dependencies 10 | 11 | 2.0.1 12 | - [#474](https://github.com/bda-research/node-crawler/issues/474) Solve the 'userAgents' is not working 13 | - Add some unit tests 14 | - Migration from Mocha to AVA 15 | - Use c8 to add code coverage report 16 | - Update dependencies 17 | - Others 18 | 19 | 2.0.0 20 | - Crawler V2 has been released, which is a complete rewrite of the original crawler. 21 | - see [README.md](https://github.com/bda-research/node-crawler/blob/master/README.md) for details. 22 | 23 | 1.5.0 24 | - fix bug of `:path` and `:authority` header in http/2 (@mike442144) 25 | - add `ciphers` to both http/1.1 and http/2 (@mike442144) 26 | 27 | 1.4.0 28 | - [#437](https://github.com/bda-research/node-crawler/pull/437) Update README.md (@kxxxo) 29 | - [#420](https://github.com/bda-research/node-crawler/pull/420) Fast stack buffer sync spider crawl example (@j-mendez) 30 | - [#395](https://github.com/bda-research/node-crawler/pull/395) Fixed typo in README.md (@psh0502) 31 | - [#386](https://github.com/bda-research/node-crawler/pull/386) Circleci project setup (@mike442144) 32 | - [#374](https://github.com/bda-research/node-crawler/pull/374) Unify formatting in README.md (@rauno56) 33 | 34 | 1.3.0 35 | - [#367](https://github.com/bda-research/node-crawler/pull/367) add http2 functionality (@BeijingProtoHuman) 36 | - [#364](https://github.com/bda-research/node-crawler/pull/364) Fix some typos (@pzmarzly) 37 | - [#363](https://github.com/bda-research/node-crawler/pull/363) Remove stale vendored jQuery version (@pzmarzly) 38 | 39 | 1.2.2 40 | - [#353](https://github.com/bda-research/node-crawler/pull/353) Release automate (@mike442144) 41 | - [#338](https://github.com/bda-research/node-crawler/pull/338) #comment Adding support for Https socks5. 
Agent is imported directly … (@djpavlovic) 42 | - [#336](https://github.com/bda-research/node-crawler/pull/336) Update README.md (@DanielHabenicht) 43 | - [#329](https://github.com/bda-research/node-crawler/pull/329) add support for removeRefererHeader request option to preserve referer during redirects (@petskratt) 44 | - [#314](https://github.com/bda-research/node-crawler/pull/314) docs: fix typo (@Jason-Cooke) 45 | 46 | 1.2.1 47 | * [#310](https://github.com/bda-research/node-crawler/issues/310) Upgrade dependencies' version(@mike442144) 48 | * [#303](https://github.com/bda-research/node-crawler/issues/303) Update seenreq to v3(@mike442144) 49 | * [#304](https://github.com/bda-research/node-crawler/pull/304) Replacement of istanbul with nyc (@kossidts) 50 | * [#300](https://github.com/bda-research/node-crawler/pull/300) Add formData arg to requestArgs (@humandevmode) 51 | * [#280](https://github.com/bda-research/node-crawler/pull/280) 20180611 updatetestwithnock (@Dong-Gao) 52 | 53 | 1.2.0 54 | * [#278](https://github.com/bda-research/node-crawler/pull/278) Added filestream require to download section (@swosko) 55 | * Use `nock` to mock testing instead of httpbin 56 | * Replace jshint by eslint 57 | * Fix code to pass eslint rules 58 | 59 | 1.1.4 60 | * Tolerate incorrect `Content-Type` header [#270](https://github.com/bda-research/node-crawler/pull/270), [#193](https://github.com/bda-research/node-crawler/issues/193) 61 | * Added examples [#272](https://github.com/bda-research/node-crawler/pull/272), [267](https://github.com/bda-research/node-crawler/issues/267) 62 | * Fixed "skipDuplicates" and "retries" config incompatible bug [#261](https://github.com/bda-research/node-crawler/issues/261) 63 | * Fix typo in README [#268](https://github.com/bda-research/node-crawler/pull/268) 64 | 65 | 1.1.3 66 | * Upgraded `request.js` and `lodash` 67 | 68 | 1.1.2 69 | * Recognize all XML MIME types to inject jQuery [#245](https://github.com/bda-research/node-crawler/pull/245) 70 | * Allow options to specify the Agent for Request [#246](https://github.com/bda-research/node-crawler/pull/246) 71 | * Added logo 72 | 73 | 1.1.1 74 | * added a way to replace the global options.headers keys by queuing options.headers [#241](https://github.com/bda-research/node-crawler/issues/241) 75 | * fix bug of using last jar object if current options doesn't contain `jar` option [#240](https://github.com/bda-research/node-crawler/issues/240) 76 | * fix bug of encoding [#233](https://github.com/bda-research/node-crawler/issues/233) 77 | * added seenreq options [#208](https://github.com/bda-research/node-crawler/issues/208) 78 | * added preRequest, setLimiterProperty, direct request functions 79 | 80 | 1.0.5 81 | * fix missing debugging messages [#213](https://github.com/bda-research/node-crawler/issues/213) 82 | * fix bug of 'drain' never called [#210](https://github.com/bda-research/node-crawler/issues/210) 83 | 84 | 1.0.4 85 | * fix bug of charset detecting [#203](https://github.com/bda-research/node-crawler/issues/203) 86 | * keep node version up to date in travis scripts 87 | 88 | 1.0.3 89 | * fix bug, skipDuplicate and rotateUA don't work even if set true 90 | 91 | 1.0.0 92 | * upgrade jsdom up to 9.6.x 93 | * remove 0.10 and 0.12 support [#170](https://github.com/bda-research/node-crawler/issues/170) 94 | * control dependencies version using ^ and ~ [#169](https://github.com/bda-research/node-crawler/issues/169) 95 | * remove node-pool 96 | * notify bottleneck until a task is completed 97 | * replace bottleneck 
by bottleneckp, which has priority 98 | * change default log function 99 | * use event listener on `request` and `drain` instead of global function [#144](https://github.com/bda-research/node-crawler/issues/144) 100 | * set forceUTF8 to true by default 101 | * detect `ESOCKETTIMEDOUT` instead of `ETIMEDOUT` when timeout in test 102 | * add `done` function in callback to avoid async trap 103 | * do not convert response body to string if `encoding` is null [#118](https://github.com/bda-research/node-crawler/issues/118) 104 | * add result document [#68](https://github.com/bda-research/node-crawler/issues/68) [#116](https://github.com/bda-research/node-crawler/issues/116) 105 | * add event `schedule` which is emitted when a task is being added to the scheduler 106 | * in callback, move $ into `res` because of weird API 107 | * change rateLimits to rateLimit 108 | 109 | 0.7.5 110 | * delete entity in options before copy, and assign after; `jar` is one of the typical properties which is an `Entity` with functions [#177](https://github.com/bda-research/node-crawler/issues/177) 111 | * upgrade `request` to version 2.74.0 112 | 113 | 0.7.4 114 | * change `debug` option to instance level instead of `options` 115 | * update README.md to detail error handling 116 | * call `onDrain` with scope of `this` 117 | * upgrade `seenreq` version to 0.1.7 118 | 119 | 0.7.0 120 | * cancel recursion in queue 121 | * upgrade `request` version to v2.67.0 122 | 123 | 0.6.9 124 | * use `bottleneckConcurrent` instead of `maxConnections`, default `10000` 125 | * add debug info 126 | 127 | 0.6.5 128 | * fix a deep bug when initializing Pool that may lead to sequential execution. [#2](https://github.com/bda-research/node-webcrawler/issues/2) 129 | * print log of Pool status 130 | 131 | 0.6.3 132 | * you could also get `result.options` from callback even when some errors occurred [#127](https://github.com/bda-research/node-crawler/issues/127) [#86](https://github.com/bda-research/node-crawler/issues/86) 133 | * add test for `bottleneck` 134 | 135 | 0.6.0 136 | * add `bottleneck` to implement rate limiting; one can set a limit for each connection at the same time.
137 | 138 | 0.5.2 139 | * you can manually terminate all the resources in your pool, when `onDrain` is called, before their timeouts have been reached 140 | * add a read-only property `queueSize` to crawler [#148](https://github.com/bda-research/node-crawler/issues/148) [#76](https://github.com/bda-research/node-crawler/issues/76) [#107](https://github.com/bda-research/node-crawler/issues/107) 141 | 142 | 0.5.1 143 | * remove cache feature, it's useless 144 | * add `localAddress`, `time`, `tunnel`, `proxyHeaderWhiteList`, `proxyHeaderExclusiveList` properties to pass to `request` [#155](https://github.com/bda-research/node-crawler/issues/155) 145 | 146 | 0.5.0 147 | * parse charset from `content-type` in http headers or meta tag in html, then convert 148 | * big5 charset is available as `iconv-lite` already supports it 149 | * enable gzip in request header by default 150 | * remove unzip code in crawler since `request` will do this 151 | * body will return as a Buffer if encoding is null, which is an option in `request` 152 | * remove cache and skip duplicate `request` for `GET`, `POST` (only for type `urlencode`), `HEAD` 153 | * add log feature, you can use `winston` to set `logger:winston`, or crawler will output to console 154 | * rotate user-agent in case some sites ban your requests 155 | 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 MiniAst 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | <!-- project logo (alt text: "Node.js") --> 4 | 5 |

6 | 7 | ------ 8 | 9 | [![npm package](https://nodei.co/npm/crawler.png?downloads=true&downloadRank=true&stars=true)](https://www.npmjs.com/package/crawler/v/2.0.2) 10 | 11 | [![CircleCI](https://circleci.com/gh/bda-research/node-crawler/tree/master.svg?style=svg)](https://circleci.com/gh/bda-research/node-crawler/tree/master) 12 | [![NPM download][download-image]][download-url] 13 | [![Package Quality][quality-image]][quality-url] 14 | 15 | [quality-image]: https://packagequality.com/shield/crawler.svg 16 | [quality-url]: https://packagequality.com/#?package=crawler 17 | [download-image]: https://img.shields.io/npm/dm/crawler.svg?style=flat-square 18 | [download-url]: https://npmjs.org/package/crawler 19 | 20 | Crawler v2: an advanced TypeScript version of [node-crawler](https://www.npmjs.com/package/crawler/v/1.5.0) 21 | 22 | Features: 23 | 24 | - Server-side DOM & automatic jQuery insertion with Cheerio (default), 25 | - Configurable pool size and retries, 26 | - Configurable rate limits, 27 | - Priority queue of requests, 28 | - Automatic charset detection and conversion. 29 | 30 | If you have prior experience with Crawler v1 and want to migrate quickly, please proceed to the section [Differences and Breaking Changes](#differences-and-breaking-changes). 31 | 32 | # Quick start 33 | 34 | ## Install 35 | 36 | Requires Node.js 18 or above. 37 | 38 | **IMPORTANT:** If you are using a Linux OS, we currently recommend sticking with **Node.js version 18** for the time being, rather than opting for higher versions (even if some dependencies suggest 20 or later). Our unit tests have encountered stability issues on Linux with higher versions of Node.js, which may have deeper underlying causes. However, at present, we do not have the resources to address these issues. 39 | 40 | ```sh 41 | $ npm install crawler 42 | ``` 43 | 44 | **Warning:** Given the dependencies involved (especially the migration from `request` to `got`), **Crawler v2** has been designed as a native [ESM](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules) package and no longer offers a CommonJS export. We also recommend that you [convert to ESM](https://gist.github.com/sindresorhus/a39789f98801d908bbc7ff3ecc99d99c); making this transition is generally not too difficult. If you have a large codebase built with Crawler v1, you can upgrade to v2.0.3-beta (using `npm install crawler@beta`), which supports both ESM and CommonJS builds. Please note that code previously using the "body" parameter to send form data in POST requests will need to be updated to use "form", even in the beta version.
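
For reference, a minimal sketch of the two import styles (the CommonJS form only works with the 2.0.3-beta dual build mentioned above):

```js
// ESM — the only style supported by the stable v2 release
import Crawler from "crawler";

// CommonJS — available only on v2.0.3-beta (`npm install crawler@beta`)
// const Crawler = require("crawler");
```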
45 | 46 | ## Usage 47 | 48 | ### Execute asynchronously via custom options 49 | 50 | ```js 51 | import Crawler from "crawler"; 52 | 53 | const c = new Crawler({ 54 | maxConnections: 10, 55 | // This will be called for each crawled page 56 | callback: (error, res, done) => { 57 | if (error) { 58 | console.log(error); 59 | } else { 60 | const $ = res.$; 61 | // $ is Cheerio by default 62 | // a lean implementation of core jQuery designed specifically for the server 63 | console.log($("title").text()); 64 | } 65 | done(); 66 | }, 67 | }); 68 | 69 | // Add just one URL to queue, with default callback 70 | c.add("http://www.amazon.com"); 71 | 72 | // Add a list of URLs 73 | c.add(["http://www.google.com/", "http://www.yahoo.com"]); 74 | 75 | // Add URLs with custom callbacks & parameters 76 | c.add([ 77 | { 78 | url: "http://parishackers.org/", 79 | jQuery: false, 80 | 81 | // The global callback won't be called 82 | callback: (error, res, done) => { 83 | if (error) { 84 | console.log(error); 85 | } else { 86 | console.log("Grabbed", res.body.length, "bytes"); 87 | } 88 | done(); 89 | }, 90 | }, 91 | ]); 92 | 93 | // Add some HTML code directly without grabbing (mostly for tests) 94 | c.add([ 95 | { 96 | html: "<p>This is a <strong>test</strong></p>", 97 | }, 98 | ]); 99 | ``` 100 | 101 | Please refer to [options](#options) for details. 102 | 103 | ## Slow down 104 | 105 | Use `rateLimit` to slow down when you are visiting websites. 106 | 107 | ```js 108 | import Crawler from "crawler"; 109 | 110 | const c = new Crawler({ 111 | rateLimit: 1000, // `maxConnections` will be forced to 1 112 | callback: (err, res, done) => { 113 | console.log(res.$("title").text()); 114 | done(); 115 | }, 116 | }); 117 | 118 | c.add(tasks); // between two tasks, minimum time gap is 1000 (ms) 119 | ``` 120 | 121 | ## Custom parameters 122 | 123 | Sometimes you need to access variables from a previous request/response session; to do so, pass them in **options.userParams**: 124 | 125 | ```js 126 | c.add({ 127 | url: "http://www.google.com", 128 | userParams: { 129 | parameter1: "value1", 130 | parameter2: "value2", 131 | parameter3: "value3", 132 | }, 133 | }); 134 | ``` 135 | 136 | then access them in the callback via `res.options`: 137 | 138 | ```js 139 | console.log(res.options.userParams); 140 | ``` 141 | 142 | ## Raw body 143 | 144 | If you are downloading files like images, PDFs, Word documents, etc., you have to save the raw response body, which means Crawler shouldn't convert it to a string. To make that happen, set `encoding` to `null`: 145 | 146 | ```js 147 | import Crawler from "crawler"; 148 | import fs from "fs"; 149 | 150 | const c = new Crawler({ 151 | encoding: null, 152 | jQuery: false, // set false to suppress warning message. 153 | callback: (err, res, done) => { 154 | if (err) { 155 | console.error(err.stack); 156 | } else { 157 | fs.createWriteStream(res.options.userParams.filename).write(res.body); 158 | } 159 | done(); 160 | }, 161 | }); 162 | 163 | c.add({ 164 | url: "https://raw.githubusercontent.com/bda-research/node-crawler/master/crawler_primary.png", 165 | userParams: { 166 | filename: "crawler.png", 167 | }, 168 | }); 169 | ``` 170 | 171 | ## preRequest 172 | 173 | If you want to do something either synchronously or asynchronously before each request, you can try the code below. Note that direct requests won't trigger preRequest.
174 | 175 | ```js 176 | import Crawler from "crawler"; 177 | 178 | const c = new Crawler({ 179 | preRequest: (options, done) => { 180 | // 'options' here is not the 'options' you pass to 'c.queue', instead, it's the options that is going to be passed to 'request' module 181 | console.log(options); 182 | // when done is called, the request will start 183 | done(); 184 | }, 185 | callback: (err, res, done) => { 186 | if (err) { 187 | console.log(err); 188 | } else { 189 | console.log(res.statusCode); 190 | } 191 | }, 192 | }); 193 | 194 | c.add({ 195 | url: "http://www.google.com", 196 | // this will override the 'preRequest' defined in crawler 197 | preRequest: (options, done) => { 198 | setTimeout(() => { 199 | console.log(options); 200 | done(); 201 | }, 1000); 202 | }, 203 | }); 204 | ``` 205 | 206 | ### Direct request 207 | 208 | Support both Promise and callback 209 | 210 | ```js 211 | import Crawler from "crawler"; 212 | 213 | const crawler = new Crawler(); 214 | 215 | // When using directly "send", the preRequest won't be called and the "Event:request" won't be triggered 216 | const response = await crawler.send("https://github.com/"); 217 | console.log(response.options); 218 | // console.log(response.body); 219 | 220 | crawler.send({ 221 | url: "https://github.com/", 222 | // When calling `send`, `callback` must be defined explicitly, with two arguments `error` and `response` 223 | callback: (error, response) => { 224 | if (error) { 225 | console.error(error); 226 | } else { 227 | console.log("Hello World!"); 228 | } 229 | }, 230 | }); 231 | ``` 232 | 233 | ### 234 | 235 | # Table 236 | 237 | - [Content](#content) 238 | - [Work with Http2](#work-with-http2) 239 | - [Work with rateLimiters](#work-with-ratelimiters) 240 | - [Class: Crawler](#class-crawler) 241 | - [Event: 'schedule'](#event-schedule) 242 | - [Event: 'limiterChange'](#event-limiterchange) 243 | - [Event: 'request'](#event-request) 244 | - [Event: 'drain'](#event-drain) 245 | - [crawler.add(url|options)](#crawleraddurloptions) 246 | - [crawler.queueSize](#crawlerqueuesize) 247 | - [Options](#options) 248 | - [Global only options](#global-only-options) 249 | - [`silence`](#silence) 250 | - [`maxConnections`](#maxconnections) 251 | - [`priorityLevels`](#prioritylevels) 252 | - [`rateLimit`](#ratelimit) 253 | - [`skipDuplicates`](#skipduplicates) 254 | - [`homogeneous`](#homogeneous) 255 | - [`userAgents`](#useragents) 256 | - [Crawler General options](#crawler-general-options) 257 | - [`url | method | headers | body | searchParams...`](#url--method--headers--body--searchparams) 258 | - [`forceUTF8`](#forceutf8) 259 | - [`jQuery`](#jquery) 260 | - [`encoding`](#encoding) 261 | - [`rateLimiterId`](#ratelimiterid) 262 | - [`retries`](#retries) 263 | - [`retryInterval`](#retryinterval) 264 | - [`timeout`](#timeout) 265 | - [`priority`](#priority) 266 | - [`skipEventRequest`](#skipeventrequest) 267 | - [`html`](#html) 268 | - [`proxies`](#proxies) 269 | - [`proxy`](#proxy) 270 | - [`http2`](#http2) 271 | - [`referer`](#referer) 272 | - [`userParams`](#userparams) 273 | - [`preRequest`](#prerequest-1) 274 | - [`Callback`](#callback) 275 | - [Work with Cheerio](#work-with-cheerio) 276 | - [Differences and Breaking Changes](#differences-and-breaking-changes) 277 | - [renaming](#renaming) 278 | - [Crawler Options](#crawler-options) 279 | - [Origin Request Options](#origin-request-options) 280 | - [Behavior Changes](#behavior-changes) 281 | - [How to test](#how-to-test) 282 | 283 | 284 | # Content 285 | 286 | ## Work with Http2 287 
| 288 | Now we offer hassle-free support for HTTP/2: just set `http2` to true, and Crawler will operate as smoothly as with HTTP (including proxies). 289 | 290 | **Note:** As most developers using this library with proxies also work with **Charles**, you are expected to set `rejectUnauthorized` to `false` in order to prevent the so-called **'self-signed certificate'** errors. 291 | 292 | ```js 293 | crawler.send({ 294 | url: "https://nghttp2.org/httpbin/status/200", 295 | method: "GET", 296 | http2: true, 297 | callback: (error, response) => { 298 | if (error) { 299 | console.error(error); 300 | } 301 | console.log(`inside callback`); 302 | console.log(response.body); 303 | }, 304 | }); 305 | ``` 306 | 307 | ## Work with rateLimiters 308 | 309 | Control the rate limit. All tasks submitted to a rateLimiter will abide by the `rateLimit` and `maxConnections` restrictions of that limiter. `rateLimit` is the minimum time gap between two tasks. `maxConnections` is the maximum number of tasks that can be running at the same time. rateLimiters are independent of each other. One common use case is setting different rateLimiters for different proxies. Note that when `rateLimit` is set to a non-zero value, `maxConnections` will be forced to 1. 310 | 311 | ```js 312 | import Crawler from "crawler"; 313 | 314 | const c = new Crawler({ 315 | rateLimit: 2000, 316 | maxConnections: 1, 317 | callback: (error, res, done) => { 318 | if (error) { 319 | console.log(error); 320 | } else { 321 | const $ = res.$; 322 | console.log($("title").text()); 323 | } 324 | done(); 325 | }, 326 | }); 327 | 328 | // if you want to crawl some website with 2000ms gap between requests 329 | c.add("http://www.somewebsite.com/page/1"); 330 | c.add("http://www.somewebsite.com/page/2"); 331 | c.add("http://www.somewebsite.com/page/3"); 332 | 333 | // if you want to crawl some website using proxy with 2000ms gap between requests for each proxy 334 | c.add({ 335 | url: "http://www.somewebsite.com/page/1", 336 | rateLimiterId: 1, 337 | proxy: "proxy_1", 338 | }); 339 | c.add({ 340 | url: "http://www.somewebsite.com/page/2", 341 | rateLimiterId: 2, 342 | proxy: "proxy_2", 343 | }); 344 | c.add({ 345 | url: "http://www.somewebsite.com/page/3", 346 | rateLimiterId: 3, 347 | proxy: "proxy_3", 348 | }); 349 | c.add({ 350 | url: "http://www.somewebsite.com/page/4", 351 | rateLimiterId: 4, 352 | proxy: "proxy_1", 353 | }); 354 | ``` 355 | 356 | Normally, all rateLimiter instances in the crawler's limiter cluster are instantiated with the options specified in the crawler constructor. You can change a property of any rateLimiter with the code below. Currently, only the 'rateLimit' property can be changed. Note that the default rateLimiter can be accessed by `crawler.setLimiter(0, "rateLimit", 1000);`. We strongly recommend that you leave limiters unchanged after their instantiation unless you know clearly what you are doing. 357 | 358 | ```js 359 | const crawler = new Crawler(); 360 | crawler.setLimiter(0, "rateLimit", 1000); 361 | ``` 362 | 363 | ## Class: Crawler 364 | 365 | ### Event: 'schedule' 366 | 367 | - `options` 368 | 369 | Emitted when a task is being added to the scheduler. 370 | 371 | ```js 372 | crawler.on("schedule", options => { 373 | options.proxy = "http://proxy:port"; 374 | }); 375 | ``` 376 | 377 | ### Event: 'limiterChange' 378 | 379 | - `options` 380 | - `rateLimiterId` : `number` 381 | 382 | Emitted when the limiter of a task has been changed.
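
A minimal listener sketch — per the event arguments above, the handler receives the task options and the new limiter id:

```js
crawler.on("limiterChange", (options, rateLimiterId) => {
  console.log(`Task for ${options.url} moved to rate limiter #${rateLimiterId}`);
});
```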
383 | 384 | ### Event: 'request' 385 | 386 | - `options` 387 | 388 | Emitted when crawler is ready to send a request. 389 | 390 | If you want to modify the options at the last stage before the request is sent, listen on this event. 391 | 392 | ```js 393 | crawler.on("request", options => { 394 | options.searchParams.timestamp = new Date().getTime(); 395 | }); 396 | ``` 397 | 398 | ### Event: 'drain' 399 | 400 | Emitted when the queue is empty. 401 | 402 | ```js 403 | crawler.on("drain", () => { 404 | // For example, release a connection to the database. 405 | db.end(); // close connection to MySQL 406 | }); 407 | ``` 408 | 409 | ### crawler.add(url|options) 410 | 411 | - `url | options` 412 | 413 | Add a task to the queue and wait for it to be executed. 414 | 415 | ### crawler.queueSize 416 | 417 | - `Number` 418 | 419 | Size of the queue, read-only. 420 | 421 | ## Options 422 | 423 | You can pass these options to the **Crawler()** constructor if you want them to be global or as 424 | items in the **crawler.add()** calls if you want them to be specific to that item (overwriting global options). 425 | 426 | - For convenience, simply passing a URL string as the options is also accepted. 427 | - Options can also be an array composed of multiple options, in which case multiple tasks will be added at once. 428 | - When constructing options, all native [got options](https://github.com/sindresorhus/got/blob/main/documentation/2-options.md) are accepted and passed through directly. Additionally, the options are tailored to process only those parameters that are identifiable by the Crawler. 429 | 430 | ### Global only options 431 | 432 | #### `silence` 433 | - **Type:** `boolean` 434 | - **Default** : false 435 | - If true, the crawler will mute all warning and error messages. Request errors will still be reported. 436 | 437 | #### `maxConnections` 438 | 439 | - **Type:** `number` 440 | - **Default** : 10 441 | - The maximum number of requests that can be sent simultaneously. If the value is 10, the crawler will send at most 10 requests at the same time. 442 | 443 | #### `priorityLevels` 444 | 445 | - **Type:** `number` 446 | - **Default** : 10 447 | - The number of priority levels. Can only be assigned at instantiation. 448 | 449 | #### `rateLimit` 450 | 451 | - **Type:** `number` 452 | 453 | - **Default** : 0 454 | 455 | - A value of 1000 enforces a minimum delay of 1000 milliseconds between two consecutive requests. 456 | 457 | - **Note:** This option is listed as global-only because it will be set as the "default rateLimit value". This value is bound to a specific rate limiter and can **only be modified** through the `crawler.setLimiter` method. Please avoid passing a redundant rateLimit property in local requests; instead, use `options.rateLimiterId` to specify a particular limiter. 458 | 459 | - **Example:** 460 | 461 | ```js 462 | crawler.on("schedule", options => { 463 | options.rateLimiterId = Math.floor(Math.random() * 15); 464 | }); 465 | ``` 466 | 467 | #### `skipDuplicates` 468 | 469 | - **Type:** `boolean` 470 | - **Default** : false 471 | - If true, the crawler will skip duplicate tasks. If the task is already in the queue, the crawler will not add it again. 472 | 473 | #### `homogeneous` 474 | 475 | - **Type:** `boolean` 476 | - **Default** : false 477 | - If true, the crawler will dynamically reallocate tasks that are blocked at the head of their queue (head-of-line blocking) to other queues; see the sketch below.
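
A sketch of these global-only flags set together on the constructor (the values here are illustrative, not recommendations):

```js
const c = new Crawler({
  maxConnections: 5,
  priorityLevels: 3,    // fixed once the crawler is constructed
  skipDuplicates: true, // drop tasks that were already seen
  homogeneous: true,    // reallocate head-blocked tasks to other queues
});
```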
478 | 479 | #### `userAgents` 480 | 481 | - **Type:** `string | string[]` 482 | - **Default** : undefined 483 | - If passed, the crawler will rotate the user agent for each request. The "userAgents" option must be an array if activated. 484 | 485 | ### Crawler General options 486 | 487 | #### `url | method | headers | body | searchParams...` 488 | 489 | - Same as the options of [options](https://github.com/sindresorhus/got/blob/main/documentation/2-options.md) 490 | 491 | #### `forceUTF8` 492 | 493 | - **Type:** `boolean` 494 | - **Default** : false 495 | - If true, the crawler will detect the charset from the HTTP headers or the meta tag in the HTML and convert it to UTF-8 if necessary. 496 | 497 | #### `jQuery` 498 | 499 | - **Type:** `boolean` 500 | - **Default** : true 501 | - If true, the crawler will use the cheerio library to parse the HTML content. 502 | 503 | #### `encoding` 504 | 505 | - **Type:** `string` 506 | - **Default** : 'utf8' 507 | - The encoding of the response body. 508 | 509 | #### `rateLimiterId` 510 | 511 | - **Type:** `number` 512 | - **Default** : 0 513 | - The rateLimiter ID. 514 | 515 | #### `retries` 516 | 517 | - **Type:** `number` 518 | - **Default** : 2 519 | - The number of retries if the request fails. 520 | 521 | #### `retryInterval` 522 | 523 | - **Type:** `number` 524 | - **Default** : 3000 525 | - The number of milliseconds to wait before retrying. 526 | 527 | #### `timeout` 528 | 529 | - **Type:** `number` 530 | - **Default** : 20000 531 | - The number of milliseconds to wait before the request times out. 532 | 533 | #### `priority` 534 | 535 | - **Type:** `number` 536 | - **Default** : 5 537 | - The priority of the request. 538 | 539 | #### `skipEventRequest` 540 | 541 | - **Type:** `boolean` 542 | - **Default** : false 543 | - If true, the crawler will not trigger the 'request' event. 544 | 545 | #### `html` 546 | 547 | - **Type:** `boolean` 548 | - **Default** : true 549 | - If true, the crawler will parse the response body as HTML. 550 | 551 | #### `proxies` 552 | 553 | - **Type:** `string[]` 554 | - **Default** : [] 555 | - The list of proxies. If passed, the proxy will be rotated by requests. 556 | - **Warning:** It is recommended to avoid the usage of "proxies", better to use the following method instead. (Probably you can understand why...) 557 | 558 | ```js 559 | const ProxyManager = { 560 | index: 0, 561 | proxies: JSON.parse(fs.readFileSync("../proxies.json")), 562 | setProxy: function (options) { 563 | let proxy = this.proxies[this.index]; 564 | this.index = ++this.index % this.proxies.length; 565 | options.proxy = proxy; 566 | options.rateLimiterId = Math.floor(Math.random() * 15); 567 | }, 568 | }; 569 | 570 | crawler.on("schedule", options => { 571 | // options.proxy = "http://127.0.0.1:8000"; 572 | ProxyManager.setProxy(options); 573 | }); 574 | ``` 575 | 576 | #### `proxy` 577 | 578 | - **Type:** `string` 579 | - **Default** : undefined 580 | - The proxy to use. The priority is higher than the "proxies" option. 581 | 582 | #### `http2` 583 | 584 | - **Type:** `boolean` 585 | - **Default** : false 586 | - If true, the request will be sent in the HTTP/2 protocol. 587 | 588 | #### `referer` 589 | 590 | - **Type:** `string` 591 | - **Default** : undefined 592 | - If truthy, sets the HTTP referer header. 593 | 594 | #### `userParams` 595 | 596 | - **Type:** `unknown` 597 | - **Default** : undefined 598 | - The user parameters. You can access them in the callback via `res.options`. 
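
As an illustration, several of the general options above can be combined per task (a sketch; the URL and values are placeholders):

```js
c.add({
  url: "http://www.somewebsite.com/page/1",
  timeout: 10000,      // give up after 10 s instead of the default 20 s
  retries: 1,          // one retry instead of the default 2
  retryInterval: 2000, // wait 2 s before retrying
  priority: 0,         // 0 is dequeued first; the default is 5
  userParams: { label: "page-1" }, // read back via res.options.userParams
});
```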
599 | 600 | #### `preRequest` 601 | 602 | - **Type:** `(options, done) => unknown` 603 | - **Default** : undefined 604 | - The function to be called before each request. Only works for the `crawler.add` method. 605 | 606 | #### `Callback` 607 | 608 | - **Type:** `(error, response, done) => unknown` 609 | - Function that will be called after a request is completed 610 | 611 | - `error`: [Error](https://nodejs.org/api/errors.html) caught by the crawler 612 | - `response` : A standard IncomingMessage response, extended with `$` and `options` 613 | - `response.options`: [Options](#options) of this task 614 | - `response.$`: [jQuery Selector](https://api.jquery.com/category/selectors/) A selector for an HTML or XML document. 615 | - `response.statusCode`: `Number` HTTP status code. E.g. `200` 616 | - `response.body`: `Buffer` | `String` | `JSON` HTTP response content, which could be an HTML page, plain text, or an XML document, for example. 617 | - `response.headers`: HTTP response headers 618 | - `done` : This function must be called when you've finished your work in the callback. It is the only way to tell the crawler that the task is finished. 619 | 620 | ## Work with Cheerio 621 | 622 | Crawler uses [Cheerio](https://github.com/cheeriojs/cheerio) by default. We are temporarily no longer supporting jsdom for certain reasons; it may return later. 623 | 624 | # Differences and Breaking Changes 625 | 626 | ## renaming 627 | 628 | *Options listed here have been renamed, but most of the old names are still supported for backward compatibility.* 629 | 630 | ### Crawler Options 631 | `options.priorityRange` → `options.priorityLevels` 632 | 633 | `options.uri` → `options.url` 634 | 635 | `options.json` → `options.isJson` (Boolean. The "json" option now works completely differently.) 636 | 637 | `options.limiter` → `options.rateLimiterId` 638 | 639 | `options.retryTimeout` → `options.retryInterval` 640 | 641 | `crawler.direct` → `crawler.send` 642 | 643 | `crawler.queue` → `crawler.add` 644 | 645 | `crawler.setLimiterProperty` → `crawler.setLimiter` 646 | 647 | ### Origin Request Options 648 | `incomingEncoding` → `encoding` 649 | 650 | `qs` → `searchParams` 651 | 652 | `strictSSL` → `rejectUnauthorized` 653 | 654 | `gzip` → `decompress` 655 | 656 | `jar` → `cookieJar` (accepts `tough-cookie` jar) 657 | 658 | `jsonReviver` → `parseJson` 659 | 660 | `jsonReplacer` → `stringifyJson` 661 | 662 | ## Behavior Changes 663 | 664 | - default retries: 3 => 2 665 | 666 | **Some practices that were acceptable and often used in version 1 but are not in version 2:** 667 | 668 | - use "body" as the POST form => **Please use "form" instead. For more, see [options](https://github.com/sindresorhus/got/blob/main/documentation/2-options.md).** 669 | - add custom options on request options => **Not allowed. Only options.userParams is passed through to the response.** 670 | - We are temporarily no longer supporting jsdom for certain reasons. 671 | 672 | # How to test 673 | 674 | Crawler uses `nock` to mock HTTP requests, so tests no longer rely on a live HTTP server.
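
As an illustration of the idea (not an actual test from this repo — the hostname and HTML below are made up), `nock` answers the request before it ever leaves the process:

```js
import nock from "nock";
import Crawler from "crawler";

// Any GET to this host is intercepted by nock; no network traffic occurs.
nock("http://test.crawler.com")
    .get("/")
    .reply(200, "<html><head><title>ok</title></head></html>", { "Content-Type": "text/html" });

const c = new Crawler({
    callback: (error, res, done) => {
        console.log(res.$("title").text()); // "ok"
        done();
    },
});
c.add("http://test.crawler.com/");
```

To run the actual suite: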
675 | 676 | ```bash 677 | $ pnpm test 678 | ``` 679 | -------------------------------------------------------------------------------- /crawler_primary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bda-research/node-crawler/5f6219c02925299953918de9d39954895e17b187/crawler_primary.png -------------------------------------------------------------------------------- /eslint.config.js: -------------------------------------------------------------------------------- 1 | import globals from "globals"; 2 | import eslint from "@eslint/js"; 3 | import tseslint from "typescript-eslint"; 4 | 5 | const options = [ 6 | // { languageOptions: { globals: globals.node } }, 7 | eslint.configs.recommended, 8 | ...tseslint.configs.recommended, 9 | { 10 | ignores: ["test/*", "dist/*", "**/*.config.js"], 11 | }, 12 | { 13 | languageOptions: { 14 | globals: { ...globals.node }, 15 | ecmaVersion: 2020, 16 | sourceType: "module", 17 | }, 18 | }, 19 | { 20 | name: "Crawler", 21 | files: ["src/**/*.ts"], 22 | rules: { 23 | "@typescript-eslint/no-explicit-any": "warn", 24 | "@typescript-eslint/no-unused-vars": [ 25 | "error", 26 | { 27 | "argsIgnorePattern": "^_", 28 | "varsIgnorePattern": "^_", 29 | "caughtErrorsIgnorePattern": "^_", 30 | }, 31 | ], 32 | "no-console": "error", 33 | "no-empty": "error", 34 | "quotes": ["error", "double", { "avoidEscape": true }], 35 | "semi": ["error", "always"], 36 | }, 37 | }, 38 | ]; 39 | 40 | export default options; 41 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "crawler", 3 | "version": "2.0.3-beta.2", 4 | "description": "Crawler is a ready-to-use web spider that works with proxies, asynchrony, rate limit, configurable request pools, jQuery, and HTTP/2 support.", 5 | "repository": { 6 | "type": "git", 7 | "url": "https://github.com/bda-research/node-crawler.git" 8 | }, 9 | "exports": { 10 | ".": { 11 | "require": "./dist/index.cjs", 12 | "import": "./dist/index.js" 13 | } 14 | }, 15 | "scripts": { 16 | "build": "tsup src/index.ts --format cjs,esm --clean", 17 | "prepublishOnly": "npm run build", 18 | "test": "NODE_ENV=test ava", 19 | "cover": "NODE_ENV=test c8 ava" 20 | }, 21 | "engines": { 22 | "node": ">=18" 23 | }, 24 | "type": "module", 25 | "keywords": [ 26 | "javascript", 27 | "crawler", 28 | "spider", 29 | "scraper", 30 | "scraping", 31 | "jquery", 32 | "nodejs", 33 | "http", 34 | "https", 35 | "http2", 36 | "got", 37 | "request", 38 | "url", 39 | "network", 40 | "gzip" 41 | ], 42 | "license": "MIT", 43 | "dependencies": { 44 | "cheerio": "1.0.0-rc.12", 45 | "got": "^14.4.2", 46 | "hpagent": "^1.2.0", 47 | "http2-wrapper": "^2.2.1", 48 | "iconv-lite": "^0.6.3", 49 | "seenreq": "^3.0.0", 50 | "tslog": "^4.9.3", 51 | "tsup": "^8.4.0" 52 | }, 53 | "devDependencies": { 54 | "@eslint/js": "^9.8.0", 55 | "@types/got": "^9.6.12", 56 | "@types/node": "^20.14.13", 57 | "ava": "^6.1.3", 58 | "c8": "^10.1.2", 59 | "eslint": "^9.8.0", 60 | "globals": "^15.8.0", 61 | "nock": "^13.5.4", 62 | "sinon": "^18.0.0", 63 | "tough-cookie": "^4.1.4", 64 | "tsx": "^4.16.3", 65 | "typescript": "^5.5.4", 66 | "typescript-eslint": "^8.0.0" 67 | }, 68 | "ava": { 69 | "files": [ 70 | "test/*.js", 71 | "!test/*test.js" 72 | ], 73 | "timeout": "20s", 74 | "extensions": { 75 | "js": true 76 | }, 77 | "verbose": true 78 | }, 79 | "c8": { 80 | "reporter": [ 81 | "lcov", 82 | "html", 
83 | "text" 84 | ], 85 | "clean": true 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/crawler.ts: -------------------------------------------------------------------------------- 1 | import { EventEmitter } from "events"; 2 | import { Cluster } from "./rateLimiter/index.js"; 3 | import { isBoolean, isFunction, setDefaults, flattenDeep, lowerObjectKeys, isNumber } from "./lib/utils.js"; 4 | import { getValidOptions, alignOptions, getCharset, renameOptionParams } from "./options.js"; 5 | import { getLogger } from "./logger.js"; 6 | import type { CrawlerOptions, RequestOptions, RequestConfig, CrawlerResponse } from "./types/crawler.js"; 7 | import { load } from "cheerio"; 8 | import seenreq from "seenreq"; 9 | import iconv from "iconv-lite"; 10 | import { GotInstance, GotFn } from "got"; 11 | // @todo: remove seenreq dependency 12 | const log = getLogger(); 13 | 14 | // 缓存 got 实例 15 | let gotInstance: GotInstance | null = null; 16 | 17 | async function loadGot() { 18 | if (!gotInstance) { 19 | gotInstance = (await import("got")).default; 20 | } 21 | return gotInstance; 22 | } 23 | 24 | class Crawler extends EventEmitter { 25 | private _limiters: Cluster; 26 | private _UAIndex = 0; 27 | private _proxyIndex = 0; 28 | 29 | public options: CrawlerOptions; 30 | public seen: any; 31 | 32 | constructor(options?: CrawlerOptions) { 33 | super(); 34 | const defaultOptions: CrawlerOptions = { 35 | maxConnections: 10, 36 | rateLimit: 0, 37 | priorityLevels: 10, 38 | skipDuplicates: false, 39 | homogeneous: false, 40 | method: "GET", 41 | forceUTF8: false, 42 | jQuery: true, 43 | priority: 5, 44 | retries: 2, 45 | retryInterval: 3000, 46 | timeout: 20000, 47 | isJson: false, 48 | silence: false, 49 | rejectUnauthorized: false, // set to "true" in production environment. 50 | userAgents: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36" 51 | }; 52 | options = renameOptionParams(options); 53 | this.options = { ...defaultOptions, ...options }; 54 | if (this.options.rateLimit! > 0) { 55 | this.options.maxConnections = 1; 56 | } 57 | if (this.options.silence) { 58 | log.settings.minLevel = 7; 59 | } 60 | 61 | this._limiters = new Cluster({ 62 | maxConnections: this.options.maxConnection !== undefined ? this.options.maxConnection : this.options.maxConnections!, 63 | rateLimit: this.options.limiter !== undefined ? 
this.options.limiter : this.options.rateLimit!, 64 | priorityLevels: this.options.priorityLevels!, 65 | defaultPriority: this.options.priority!, 66 | homogeneous: this.options.homogeneous, 67 | }); 68 | 69 | this.seen = new seenreq(this.options.seenreq); 70 | this.seen 71 | .initialize() 72 | .then(() => { 73 | log.debug("seenreq initialized"); 74 | }) 75 | .catch((error: unknown) => { 76 | log.error(error); 77 | }); 78 | this.on("_release", () => { 79 | log.debug(`Queue size: ${this.queueSize}`); 80 | if (this._limiters.empty) this.emit("drain"); 81 | }); 82 | } 83 | 84 | private _detectHtmlOnHeaders = (headers: Record<string, unknown>): boolean => { 85 | const contentType = headers["content-type"] as string; 86 | if (/xml|html/i.test(contentType)) return true; 87 | return false; 88 | }; 89 | 90 | private _schedule = (options: CrawlerOptions): void => { 91 | this.emit("schedule", options); 92 | this._limiters 93 | .getRateLimiter(options.rateLimiterId) 94 | .submit(options.priority as number, (done, rateLimiterId) => { 95 | options.release = () => { 96 | done(); 97 | this.emit("_release"); 98 | }; 99 | options.callback = options.callback || options.release; 100 | 101 | if (rateLimiterId) { 102 | this.emit("limiterChange", options, rateLimiterId); 103 | } 104 | 105 | if (options.html) { 106 | options.url = options.url ?? ""; 107 | this._handler(null, options, { body: options.html, headers: { "content-type": "text/html" } }); 108 | } else { 109 | options.url = options.url ?? options.uri; 110 | if (typeof options.url === "function") { 111 | options.url((url: string) => { 112 | options.url = url; 113 | this._execute(options); 114 | }); 115 | } else { 116 | delete options.uri; 117 | this._execute(options); 118 | } 119 | } 120 | }); 121 | }; 122 | 123 | private _execute = async (options: CrawlerOptions): Promise<CrawlerResponse> => { 124 | if (options.proxy) log.debug(`Using proxy: ${options.proxy}`); 125 | else if (options.proxies) log.debug(`Using proxies: ${options.proxies}`); 126 | 127 | options.headers = options.headers ?? {}; 128 | options.headers = lowerObjectKeys(options.headers); 129 | 130 | if (options.forceUTF8 || options.isJson) options.encoding = "utf8"; 131 | 132 | if (Array.isArray(options.userAgents)) { 133 | this._UAIndex = this._UAIndex % options.userAgents.length; 134 | options.headers["user-agent"] = options.userAgents[this._UAIndex]; 135 | this._UAIndex++; 136 | } else { 137 | options.headers["user-agent"] = options.headers["user-agent"] ??
options.userAgents; 138 | } 139 | 140 | if (!options.proxy && Array.isArray(options.proxies)) { 141 | this._proxyIndex = this._proxyIndex % options.proxies.length; 142 | options.proxy = options.proxies[this._proxyIndex]; 143 | this._proxyIndex++; 144 | } 145 | 146 | const request = async () => { 147 | if (options.skipEventRequest !== true) { 148 | this.emit("request", options); 149 | } 150 | let response: CrawlerResponse; 151 | try { 152 | const got = await loadGot(); // Lazily load the got instance 153 | response = await got(alignOptions(options)); 154 | } catch (error) { 155 | log.debug(error); 156 | return this._handler(error, options); 157 | } 158 | return this._handler(null, options, response); 159 | }; 160 | 161 | if (isFunction(options.preRequest)) { 162 | try { 163 | options.preRequest!(options, async (err?: Error | null) => { 164 | if (err) { 165 | log.debug(err); 166 | return this._handler(err, options); 167 | } 168 | return await request(); 169 | }); 170 | } catch (err) { 171 | log.error(err); 172 | throw err; 173 | } 174 | } else { 175 | return await request(); 176 | } 177 | }; 178 | 179 | private _handler = (error: unknown, options: RequestOptions, response?: CrawlerResponse): CrawlerResponse => { 180 | if (error) { 181 | if (options.retries && options.retries > 0) { 182 | log.warn(`${error} occurred on ${options.url}. ${options.retries ? `(${options.retries} retries left)` : ""}`); 183 | setTimeout(() => { 184 | options.retries!--; 185 | this._execute(options as CrawlerOptions); 186 | }, options.retryInterval); 187 | return; 188 | } else { 189 | log.error(`${error} occurred on ${options.url}. Request failed.`); 190 | if (options.callback && typeof options.callback === "function") { 191 | return options.callback(error, { options }, options.release); 192 | } else { 193 | throw error; 194 | } 195 | } 196 | } 197 | if (!response.body) response.body = ""; 198 | log.debug("Got " + (options.url || "html") + " (" + response.body.length + " bytes)..."); 199 | response.options = options; 200 | 201 | response.charset = getCharset(response.headers); 202 | if (!response.charset) { 203 | const match = response.body.toString().match(/charset=['"]?([\w.-]+)/i); 204 | response.charset = match ? match[1].trim().toLowerCase() : null; 205 | } 206 | log.debug("Charset: " + response.charset); 207 | 208 | if (options.encoding !== null) { 209 | options.encoding = options.encoding ?? response.charset ?? "utf8"; 210 | try { 211 | if (!Buffer.isBuffer(response.body)) response.body = Buffer.from(response.body); 212 | response.body = iconv.decode(response.body, options.encoding as string); 213 | response.body = response.body.toString(); 214 | } catch (err) { 215 | log.error(err); 216 | } 217 | } 218 | 219 | if (options.isJson) { 220 | try { 221 | response.body = JSON.parse(response.body); 222 | } catch (_err) { 223 | log.warn("JSON parsing failed, body is not JSON. Set isJson to false to mute this warning."); 224 | } 225 | } 226 | 227 | if (options.jQuery === true && !options.isJson) { 228 | if (response.body === "" || !this._detectHtmlOnHeaders(response.headers)) { 229 | log.warn("response body is not HTML, skip injecting. Set jQuery to false to mute this warning."); 230 | } else { 231 | try { 232 | response.$ = load(response.body); 233 | } catch (_err) { 234 | log.warn("HTML parsing failed.
Set jQuery to false to mute this warning."); 235 | } 236 | } 237 | } 238 | 239 | if (options.callback && typeof options.callback === "function") { 240 | return options.callback(null, response, options.release); 241 | } 242 | return response; 243 | }; 244 | 245 | public get queueSize(): number { 246 | return 0; 247 | } 248 | 249 | /** 250 | * @param rateLimiterId 251 | * @param property 252 | * @param value 253 | * @description Set the rate limiter property. 254 | * @version 2.0.0 Only support `rateLimit` change. 255 | * @example 256 | * ```js 257 | * const crawler = new Crawler(); 258 | * crawler.setLimiter(0, "rateLimit", 1000); 259 | * ``` 260 | */ 261 | public setLimiter(rateLimiterId: number, property: string, value: unknown): void { 262 | if (!isNumber(rateLimiterId)) { 263 | log.error("rateLimiterId must be a number"); 264 | return; 265 | } 266 | if (property === "rateLimit") { 267 | this._limiters.getRateLimiter(rateLimiterId).setRateLimit(value as number); 268 | } 269 | // @todo other properties 270 | } 271 | /** 272 | * @param options 273 | * @returns if there is a "callback" function in the options, return the result of the callback function. \ 274 | * Otherwise, return a promise, which resolves when the request is successful and rejects when the request fails. 275 | * In the case of the promise, the resolved value will be the response object. 276 | * @description Send a request directly. 277 | * @example 278 | * ```js 279 | * const crawler = new Crawler(); 280 | * crawler.send({ 281 | * url: "https://example.com", 282 | * callback: (error, response, done) => { done(); } 283 | * }); 284 | * await crawler.send("https://example.com"); 285 | * ``` 286 | */ 287 | public send = async (options: RequestConfig): Promise<CrawlerResponse> => { 288 | options = getValidOptions(options); 289 | options.retries = options.retries ?? 0; 290 | setDefaults(options, this.options); 291 | options.skipEventRequest = isBoolean(options.skipEventRequest) ? options.skipEventRequest : true; 292 | delete options.preRequest; 293 | return await this._execute(options); 294 | }; 295 | /** 296 | * @deprecated 297 | * @description Old interface version. It is recommended to use `Crawler.send()` instead. 298 | * @see Crawler.send 299 | */ 300 | public direct = async (options: RequestConfig): Promise<CrawlerResponse> => { 301 | return await this.send(options); 302 | }; 303 | /** 304 | * @param options 305 | * @description Add a request to the queue. 306 | * @example 307 | * ```js 308 | * const crawler = new Crawler(); 309 | * crawler.add({ 310 | * url: "https://example.com", 311 | * callback: (error, response, done) => { done(); } 312 | * }); 313 | * ``` 314 | */ 315 | public add = (options: RequestConfig): void => { 316 | let optionsArray = Array.isArray(options) ? options : [options]; 317 | optionsArray = flattenDeep(optionsArray); 318 | optionsArray.forEach(options => { 319 | try { 320 | options = getValidOptions(options) as RequestOptions; 321 | } catch (err) { 322 | log.warn(err); 323 | return; 324 | } 325 | setDefaults(options, this.options); 326 | options.headers = { ...this.options.headers, ...options.headers }; 327 | if (!this.options.skipDuplicates) { 328 | this._schedule(options as CrawlerOptions); 329 | return; 330 | } 331 | 332 | this.seen 333 | .exists(options, options.seenreq) 334 | .then((rst: any) => { 335 | if (!rst) { 336 | this._schedule(options as CrawlerOptions); 337 | } 338 | }) 339 | .catch((error: unknown) => log.error(error)); 340 | }); 341 | }; 342 | /** 343 | * @deprecated 344 | * @description Old interface version.
It is recommended to use `Crawler.add()` instead. 345 | * @see Crawler.add 346 | */ 347 | public queue = (options: RequestConfig): void => { 348 | return this.add(options); 349 | }; 350 | } 351 | 352 | export default Crawler; -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import Crawler from "./crawler.js"; 2 | if (typeof module !== "undefined" && module.exports) { 3 | module.exports = Crawler; 4 | } 5 | 6 | export default Crawler; -------------------------------------------------------------------------------- /src/lib/index.ts: -------------------------------------------------------------------------------- 1 | import multiPriorityQueue from "./multiPriorityQueue.js"; 2 | import { getType, isNumber } from "./utils.js"; 3 | 4 | export { multiPriorityQueue, getType, isNumber }; 5 | -------------------------------------------------------------------------------- /src/lib/multiPriorityQueue.ts: -------------------------------------------------------------------------------- 1 | import Queue from "./queue.js"; 2 | 3 | class multiPriorityQueue<T> { 4 | private _elements: Queue<T>[] = []; 5 | private _size: number; 6 | 7 | constructor(priorities: number) { 8 | priorities = Math.max(+priorities | 0, 1); 9 | for (let i = 0; i < priorities; i += 1) { 10 | this._elements.push(new Queue<T>()); 11 | } 12 | this._size = 0; 13 | } 14 | 15 | size(): number { 16 | if (this._size) return this._size; 17 | let totalSize = 0; 18 | for (const queue of this._elements) { 19 | totalSize += queue.length; 20 | } 21 | return (this._size = totalSize); 22 | } 23 | 24 | enqueue(value: T, priority: number): void { 25 | priority = (priority && +priority | 0) || 0; 26 | if (priority < 0 || priority >= this._elements.length) { 27 | // out-of-range priorities are rejected rather than clamped 28 | throw new RangeError(`Invalid priority: ${priority} must be between 0 and ${this._elements.length - 1}`); 29 | } 30 | this._elements[priority].enqueue(value); 31 | this._size++; 32 | } 33 | 34 | dequeue(): T | undefined { 35 | for (let i = 0; i < this._elements.length; i++) { 36 | if (this._elements[i].length > 0) { 37 | this._size--; 38 | return this._elements[i].dequeue(); 39 | } 40 | } 41 | throw new ReferenceError("multiPriorityQueue is empty"); 42 | } 43 | } 44 | export default multiPriorityQueue; 45 | -------------------------------------------------------------------------------- /src/lib/queue.ts: -------------------------------------------------------------------------------- 1 | // This part of the code uses a portion of the lago library 2 | // Project: https://github.com/yangshun/lago 3 | export interface AbstractNode { 4 | next?: AbstractNode | null; 5 | prev?: AbstractNode | null; 6 | } 7 | 8 | class Node<T> implements AbstractNode { 9 | public value: T; 10 | public next: AbstractNode | null; 11 | public prev: AbstractNode | null; 12 | 13 | constructor(value: T) { 14 | this.value = value; 15 | this.next = null; 16 | this.prev = null; 17 | } 18 | } 19 | 20 | class DummyHeadNode implements AbstractNode { 21 | public next: AbstractNode | null; 22 | 23 | constructor() { 24 | this.next = null; 25 | } 26 | } 27 | 28 | class DummyTailNode implements AbstractNode { 29 | public prev: AbstractNode | null; 30 | 31 | constructor() { 32 | this.prev = null; 33 | } 34 | } 35 | 36 | class Queue<T> { 37 | private _dummyHead: DummyHeadNode; 38 | private _dummyTail: DummyTailNode; 39 | private _length: number; 40 | 41 | constructor() { 42 |
this._dummyHead = new DummyHeadNode(); 43 | this._dummyTail = new DummyTailNode(); 44 | this._dummyHead.next = this._dummyTail; 45 | this._dummyTail.prev = this._dummyHead; 46 | this._length = 0; 47 | } 48 | 49 | /** 50 | * Adds an element to the back of the Queue. 51 | * @param {*} element 52 | * @return {number} The new length of the Queue. 53 | */ 54 | enqueue(value: T): number { 55 | const node = new Node(value); 56 | const prevLast = this._dummyTail.prev as Node<T> | DummyHeadNode; 57 | prevLast.next = node; 58 | 59 | node.prev = prevLast; 60 | node.next = this._dummyTail; 61 | this._dummyTail.prev = node; 62 | this._length++; 63 | return this._length; 64 | } 65 | 66 | /** 67 | * Removes the element at the front of the Queue. 68 | * @return {*} The element at the front of the Queue. 69 | */ 70 | dequeue(): T | undefined { 71 | if (this.isEmpty()) { 72 | return undefined; 73 | } 74 | 75 | const node = this._dummyHead.next as Node<T>; 76 | const newFirst = node.next as Node<T> | DummyTailNode; 77 | this._dummyHead.next = newFirst; 78 | newFirst.prev = this._dummyHead; 79 | node.next = null; 80 | this._length--; 81 | return node.value; 82 | } 83 | 84 | /** 85 | * Returns true if the Queue has no elements. 86 | * @return {boolean} Whether the Queue has no elements. 87 | */ 88 | isEmpty(): boolean { 89 | return this._length === 0; 90 | } 91 | 92 | /** 93 | * Returns the element at the front of the Queue. 94 | * @return {*} The element at the front of the Queue. 95 | */ 96 | front(): T | undefined { 97 | if (this.isEmpty()) { 98 | return undefined; 99 | } 100 | 101 | return (this._dummyHead.next as Node<T>).value; 102 | } 103 | 104 | /** 105 | * Returns the element at the back of the Queue. 106 | * @return {*} The element at the back of the Queue. 107 | */ 108 | back(): T | undefined { 109 | if (this.isEmpty()) { 110 | return undefined; 111 | } 112 | 113 | return (this._dummyTail.prev as Node<T>).value; 114 | } 115 | 116 | /** 117 | * Returns the number of elements in the Queue. 118 | * @return {number} Number of elements in the Queue. 119 | */ 120 | get length(): number { 121 | return this._length; 122 | } 123 | /** 124 | * Returns the number of elements in the Queue (same as length). 125 | * @return {number} Number of elements in the Queue. 126 | */ 127 | get size(): number { 128 | return this._length; 129 | } 130 | } 131 | 132 | export default Queue; -------------------------------------------------------------------------------- /src/lib/utils.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @returns type of param, a lower case string 3 | */ 4 | export const getType = (value: unknown): string => 5 | Object.prototype.toString.call(value).slice(8, -1).toLocaleLowerCase(); 6 | 7 | export const isNumber = (value: unknown): boolean => getType(value) === "number" && !isNaN(value as number); 8 | 9 | export const isFunction = (value: unknown): boolean => getType(value) === "function"; 10 | 11 | export const isBoolean = (value: unknown): boolean => getType(value) === "boolean"; 12 | /** 13 | * @param target 14 | * @param source 15 | * @returns target with source's properties added if they don't exist in target 16 | * @description 17 | * This function is used to set default values for an object. 18 | * Add properties from source to target if they don't exist in target.
19 | */ 20 | export const setDefaults = (target: Record, source: Record) => { 21 | for (const key in source) { 22 | if (target[key] === undefined) { 23 | target[key] = source[key]; 24 | } 25 | } 26 | return target; 27 | }; 28 | 29 | export const isValidUrl = (url: string): boolean => { 30 | try { 31 | new URL(url); 32 | return true; 33 | } catch (_e) { 34 | return false; 35 | } 36 | }; 37 | 38 | // export function flattenDeep(array: T[]): T[]; 39 | /** 40 | * 41 | * @param array 42 | * @returns a flattened array 43 | * @description 44 | * Flattens an array of arrays recursively. 45 | * 46 | */ 47 | export function flattenDeep(array: (T | T[])[]): T[] { 48 | const result: T[] = []; 49 | array.forEach(element => { 50 | if (Array.isArray(element)) { 51 | result.push(...flattenDeep(element)); 52 | } else { 53 | result.push(element); 54 | } 55 | }); 56 | return result; 57 | } 58 | 59 | export function pick(target: T, keys: K[]): Pick { 60 | const result = {} as Pick; 61 | keys.forEach(key => { 62 | if (target[key] !== undefined) { 63 | result[key] = target[key]; 64 | } 65 | }); 66 | return result; 67 | } 68 | /** 69 | * 70 | * @param obj 71 | * @returns a cleaned object 72 | * @description 73 | * Removes all undefined and null values from an object, this will be done recursively. 74 | * But it will not remove empty objects. (i.e. {}) 75 | */ 76 | export const cleanObject = (obj: Record): Record => { 77 | Object.keys(obj).forEach(key => { 78 | if (getType(obj[key]) === "object") { 79 | obj[key] = cleanObject(obj[key] as Record); 80 | } 81 | if (obj[key] === undefined || obj[key] === null) { 82 | delete obj[key]; 83 | } 84 | }); 85 | return obj; 86 | }; 87 | /** 88 | * 89 | * @param obj 90 | * @returns an object with all keys in lowercase 91 | * @description 92 | * Converts all keys of an object to lowercase. 93 | */ 94 | export const lowerObjectKeys = (obj: Record): Record => { 95 | const result: Record = {}; 96 | Object.keys(obj).forEach(key => { 97 | result[key.toLowerCase()] = obj[key]; 98 | }); 99 | return result; 100 | }; -------------------------------------------------------------------------------- /src/logger.ts: -------------------------------------------------------------------------------- 1 | import { Logger } from "tslog"; 2 | 3 | const logLevelsByEnv: Record = { 4 | "debug": 0, 5 | "production": 3, 6 | "test": 7, 7 | }; 8 | 9 | export const logOptions = { 10 | type: "pretty" as "json" | "pretty" | "hidden", 11 | name: "Crawler", 12 | hideLogPositionForProduction: true, 13 | prettyLogTemplate: "{{name}} {{logLevelName}} ", 14 | prettyLogStyles: { 15 | logLevelName: { 16 | SILLY: ["bold", "white"], 17 | TRACE: ["bold", "whiteBright"], 18 | DEBUG: ["bold", "green"], 19 | INFO: ["bold", "blue"], 20 | WARN: ["bold", "yellow"], 21 | ERROR: ["bold", "red"], 22 | FATAL: ["bold", "redBright"], 23 | }, 24 | name: ["bold", "green"], 25 | dateIsoStr: "white", 26 | filePathWithLine: "white", 27 | nameWithDelimiterPrefix: ["white", "bold"], 28 | nameWithDelimiterSuffix: ["white", "bold"], 29 | errorName: ["bold", "bgRedBright", "whiteBright"], 30 | fileName: ["yellow"], 31 | }, 32 | minLevel: 0, 33 | }; 34 | 35 | logOptions.minLevel = process.env.NODE_ENV ? 
logLevelsByEnv[process.env.NODE_ENV] : 3; 36 | 37 | export const getLogger = () => new Logger(logOptions); -------------------------------------------------------------------------------- /src/options.ts: -------------------------------------------------------------------------------- 1 | import { GotUrl } from "got"; 2 | import { HttpProxyAgent, HttpsProxyAgent } from "hpagent"; 3 | import http2Wrapper from "http2-wrapper"; 4 | import { cleanObject, getType, isValidUrl } from "./lib/utils.js"; 5 | import { RequestConfig, RequestOptions, CrawlerOptions } from "./types/crawler.js"; 6 | 7 | export const globalOnlyOptions = [ 8 | "maxConnections", 9 | "rateLimit", 10 | "priorityLevels", 11 | "skipDuplicates", 12 | "homogeneous", 13 | "userAgents", 14 | "silence", 15 | ]; 16 | export const crawlerOnlyOptions = [ 17 | "rateLimiterId", 18 | "forceUTF8", 19 | "jQuery", 20 | "retryInterval", 21 | "priority", 22 | "proxy", 23 | "retries", 24 | "preRequest", 25 | "callback", 26 | "release", 27 | "isJson", 28 | "referer", 29 | "rejectUnauthorized", 30 | "userParams", 31 | ].concat(globalOnlyOptions); 32 | export const deprecatedOptions = [ 33 | "uri", 34 | "qs", 35 | "strictSSL", 36 | "incomingEncoding", 37 | "gzip", 38 | "jar", 39 | "jsonReviver", 40 | "jsonReplacer", 41 | "skipEventRequest", 42 | "logger", 43 | "debug", 44 | "time", 45 | "limiter", 46 | "gene", 47 | "jquery", 48 | "userAgent", 49 | ]; 50 | 51 | export const getCharset = (headers: Record): null | string => { 52 | let charset = null; 53 | const contentType = headers["content-type"] as string; 54 | if (contentType) { 55 | const match = contentType.match(/charset=['"]?([\w.-]+)/i); 56 | if (match) { 57 | charset = match[1].trim().toLowerCase(); 58 | } 59 | } 60 | return charset; 61 | }; 62 | 63 | export const getValidOptions = (options: RequestConfig): RequestOptions => { 64 | const type = getType(options); 65 | if (type === "string") { 66 | try { 67 | if (isValidUrl(options as string)) return { url: options } as RequestOptions; 68 | options = JSON.parse(options as string); 69 | return options as object; 70 | } catch (_err) { 71 | throw new TypeError(`Invalid options: ${JSON.stringify(options)}`); 72 | } 73 | } else if (type === "object") { 74 | const prototype = Object.getPrototypeOf(options); 75 | if (prototype === Object.prototype || prototype === null) return options as object; 76 | } 77 | throw new TypeError(`Invalid options: ${JSON.stringify(options)}`); 78 | }; 79 | 80 | export const renameOptionParams = (options: CrawlerOptions | undefined): CrawlerOptions | undefined => { 81 | if (options == undefined) { 82 | return undefined; 83 | } 84 | const renamedOptions: CrawlerOptions = { 85 | ...options, 86 | url: options.uri ?? options.url, 87 | searchParams: options.qs ?? options.searchParams, 88 | rejectUnauthorized: options.strictSSL ?? options.rejectUnauthorized, 89 | encoding: options.incomingEncoding ?? options.encoding, 90 | decompress: options.gzip ?? options.decompress, 91 | cookieJar: options.jar ?? options.cookieJar, 92 | parseJson: options.jsonReviver ?? options.parseJson, 93 | stringifyJson: options.jsonReplacer ?? options.stringifyJson, 94 | rateLimit: options.limiter ?? options.rateLimit, 95 | userParams: options.gene ?? options.userParams, 96 | jQuery: options.jquery ?? options.JQuery ?? 
options.jQuery, 97 | }; 98 | return renamedOptions; 99 | }; 100 | 101 | export const alignOptions = (options: RequestOptions): GotUrl => { 102 | const gotOptions = { 103 | ...options, 104 | timeout: { request: options.timeout }, 105 | } as any; 106 | 107 | const sslConfig = options.rejectUnauthorized; 108 | if (sslConfig !== undefined) { 109 | if (gotOptions.https === undefined) { 110 | gotOptions.https = { rejectUnauthorized: sslConfig }; 111 | } 112 | else { 113 | gotOptions.https.rejectUnauthorized = sslConfig; 114 | } 115 | } 116 | 117 | const defaultagent = options["proxy"] ? { 118 | https: new HttpsProxyAgent({ proxy: options["proxy"] }), 119 | http: new HttpProxyAgent({ proxy: options["proxy"] }), 120 | } : undefined; 121 | 122 | // http2 proxy 123 | if (options.http2 === true && options.proxy) { 124 | const { proxies: Http2Proxies } = http2Wrapper; 125 | const protocol = options.proxy.startsWith("https") ? "https" : "http"; 126 | const http2Agent = 127 | protocol === "https" 128 | ? new Http2Proxies.Http2OverHttps({ 129 | proxyOptions: { url: options.proxy }, 130 | }) 131 | : new Http2Proxies.Http2OverHttp({ 132 | proxyOptions: { url: options.proxy }, 133 | }); 134 | gotOptions.agent = { http2: http2Agent }; 135 | } else { 136 | gotOptions.agent = gotOptions.agent ?? (options.proxy ? defaultagent : undefined); 137 | } 138 | 139 | /** 140 | * @deprecated The support of incomingEncoding will be removed in the next major version. 141 | */ 142 | gotOptions.responseType = "buffer"; 143 | 144 | const invalidOptions = crawlerOnlyOptions.concat(deprecatedOptions); 145 | invalidOptions.forEach(key => { 146 | if (key in gotOptions) { 147 | delete gotOptions[key]; 148 | } 149 | }); 150 | 151 | const headers = gotOptions.headers; 152 | cleanObject(gotOptions); 153 | gotOptions.headers = headers; 154 | 155 | if (!gotOptions.headers.referer) { 156 | if (options.referer) { 157 | gotOptions.headers.referer = options.referer; 158 | } 159 | else { 160 | const domain = gotOptions.url.match(/^(\w+):\/\/([^/]+)/); 161 | if (domain) gotOptions.headers.referer = domain[0]; 162 | } 163 | } 164 | 165 | gotOptions.retry = { limit: 0 }; 166 | return gotOptions; 167 | }; 168 | -------------------------------------------------------------------------------- /src/rateLimiter/cluster.ts: -------------------------------------------------------------------------------- 1 | import RateLimiter, { RateLimiterOptions, TaskWrapper } from "./rateLimiter.js"; 2 | 3 | export type ClusterOptions = RateLimiterOptions & { 4 | homogeneous?: boolean; 5 | }; 6 | 7 | class Cluster { 8 | private _rateLimiters: Record; 9 | private _homogeneous: boolean; 10 | private _interval: NodeJS.Timeout | null = null; 11 | 12 | public globalMaxConnections: number; 13 | public globalRateLimit: number; 14 | public globalpriorityLevels: number; 15 | public globalDefaultPriority: number; 16 | 17 | constructor({ maxConnections, rateLimit, priorityLevels, defaultPriority, homogeneous }: ClusterOptions) { 18 | this.globalMaxConnections = maxConnections; 19 | this.globalRateLimit = rateLimit; 20 | this.globalpriorityLevels = priorityLevels; 21 | this.globalDefaultPriority = defaultPriority; 22 | 23 | this._homogeneous = homogeneous || false; 24 | this._rateLimiters = {}; 25 | } 26 | /** 27 | * Alternative to Old Cluster.prototype.key 28 | */ 29 | getRateLimiter(id?: number): RateLimiter { 30 | id = id ?? 
0; 31 | if (!this._rateLimiters[id]) { 32 | this._rateLimiters[id] = new RateLimiter({ 33 | "maxConnections": this.globalMaxConnections, 34 | "rateLimit": this.globalRateLimit, 35 | "priorityLevels": this.globalpriorityLevels, 36 | "defaultPriority": this.globalDefaultPriority, 37 | "cluster": this._homogeneous ? this : void 0, 38 | }); 39 | this._rateLimiters[id].setId(id); 40 | return this._rateLimiters[id]; 41 | } else { 42 | return this._rateLimiters[id]; 43 | } 44 | } 45 | 46 | hasRateLimiter(id: number): boolean { 47 | return !!this._rateLimiters[id]; 48 | } 49 | 50 | deleteRateLimiter(id: number): boolean { 51 | id = id ?? 0; 52 | return delete this._rateLimiters[id]; 53 | } 54 | 55 | /** 56 | * @deprecated use waitingSize instead 57 | */ 58 | get waitingClients(): number { 59 | return this.waitingSize; 60 | } 61 | get waitingSize(): number { 62 | return Object.values(this._rateLimiters).reduce( 63 | (waitingCount, rateLimiter) => waitingCount + rateLimiter.waitingSize, 64 | 0 65 | ); 66 | } 67 | 68 | /** 69 | * @deprecated use unfinishedSize instead 70 | */ 71 | get unfinishedClients(): number { 72 | return this.unfinishedSize; 73 | } 74 | get unfinishedSize(): number { 75 | return Object.values(this._rateLimiters).reduce( 76 | (unfinishedCount, rateLimiter) => unfinishedCount + rateLimiter.runningSize + rateLimiter.waitingSize, 77 | 0 78 | ); 79 | } 80 | 81 | hasWaitingTasks(): boolean { 82 | return Object.values(this._rateLimiters).some(rateLimiter => rateLimiter.hasWaitingTasks()); 83 | } 84 | 85 | dequeue(): TaskWrapper | undefined { 86 | for (const rateLimiter of Object.values(this._rateLimiters)) { 87 | if (rateLimiter.waitingSize) { 88 | return { 89 | "next": rateLimiter.directDequeue(), 90 | "rateLimiterId": rateLimiter.id, 91 | }; 92 | } else { 93 | // @todo The logic design of the code is not up to the mark. 
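                // A possible shape for that cleanup (a sketch, not current behavior):
                // collect the drained limiter ids first, then delete after the loop, so
                // the Object.values() iteration above is never mutated mid-flight, e.g.
                //     const drained = Object.values(this._rateLimiters)
                //         .filter(r => r.runningSize === 0 && r.waitingSize === 0)
                //         .map(r => r.id as number);
                //     drained.forEach(id => this.deleteRateLimiter(id));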
94 |                 // this.deleteRateLimiter(rateLimiter.id as number);
95 |             }
96 |         }
97 |         return void 0;
98 |     }
99 | 
100 |     get status(): string {
101 |         const status: string[] = [];
102 |         Object.keys(this._rateLimiters).forEach(key => {
103 |             const id = Number(key);
104 |             status.push(
105 |                 [
106 |                     "Id: " + id,
107 |                     "running: " + this._rateLimiters[id].runningSize,
108 |                     "waiting: " + this._rateLimiters[id].waitingSize,
109 |                 ].join()
110 |             );
111 |         });
112 |         return status.join(";");
113 |     }
114 | 
115 |     // startCleanup(): void {
116 |     //     clearInterval(this._interval as NodeJS.Timeout);
117 |     //     const base = (this._interval = setInterval(() => {
118 |     //         const time = Date.now();
119 |     //         Object.keys(this._rateLimiters).forEach(key => {
120 |     //             const id = Number(key);
121 |     //             const rateLimiter = this._rateLimiters[id];
122 |     //             if (rateLimiter.nextRequestTime + 1000 * 60 * 5 < time) {
123 |     //                 this.deleteRateLimiter(id);
124 |     //             }
125 |     //         });
126 |     //     }, 1000 * 30));
127 |     //     if (typeof base.unref === "function") {
128 |     //         base.unref();
129 |     //     }
130 |     // }
131 | 
132 |     get empty(): boolean {
133 |         return this.unfinishedSize === 0;
134 |     }
135 | }
136 | export default Cluster;
137 | 
--------------------------------------------------------------------------------
/src/rateLimiter/index.ts:
--------------------------------------------------------------------------------
1 | import RateLimiter from "./rateLimiter.js";
2 | import Cluster from "./cluster.js";
3 | 
4 | export { RateLimiter, Cluster };
5 | 
--------------------------------------------------------------------------------
/src/rateLimiter/rateLimiter.ts:
--------------------------------------------------------------------------------
1 | import { multiPriorityQueue } from "../lib/index.js";
2 | import Cluster from "./cluster.js";
3 | 
4 | export type Task = {
5 |     (done: () => void, limiter?: number): void;
6 | };
7 | 
8 | export type TaskWrapper = {
9 |     next: Task;
10 |     rateLimiterId?: number;
11 | };
12 | 
13 | export type RateLimiterOptions = {
14 |     maxConnections: number;
15 |     rateLimit: number;
16 |     priorityLevels: number;
17 |     defaultPriority: number;
18 |     cluster?: Cluster;
19 | };
20 | 
21 | class RateLimiter {
22 |     private _waitingTasks: multiPriorityQueue<Task>;
23 |     private _cluster?: Cluster;
24 | 
25 |     public id?: number;
26 |     public maxConnections: number;
27 |     public nextRequestTime: number;
28 |     public rateLimit: number;
29 |     public runningSize: number;
30 |     public priorityLevels: number;
31 |     public defaultPriority: number;
32 | 
33 |     constructor({ maxConnections, rateLimit, priorityLevels = 1, defaultPriority = 0, cluster }: RateLimiterOptions) {
34 |         if (!Number.isInteger(maxConnections) || !Number.isInteger(rateLimit) || !Number.isInteger(priorityLevels)) {
35 |             throw new Error("maxConnections, rateLimit and priorityLevels must be integers");
36 |         }
37 |         this.maxConnections = maxConnections;
38 |         this.priorityLevels = priorityLevels;
39 |         // Clamp the default priority into [0, priorityLevels - 1]; fall back to the middle level.
40 |         this.defaultPriority = Number.isInteger(defaultPriority) ?
41 |             Math.min(Math.max(defaultPriority, 0), priorityLevels - 1) : Math.floor(priorityLevels / 2);
42 |         this.nextRequestTime = Date.now();
43 | 
44 |         this._waitingTasks = new multiPriorityQueue<Task>(priorityLevels);
45 |         this._cluster = cluster;
46 | 
47 |         this.rateLimit = rateLimit;
48 |         this.runningSize = 0;
49 |     }
50 | 
51 |     get waitingSize(): number {
52 |         return this._waitingTasks.size();
53 |     }
54 | 
55 |     hasWaitingTasks(): boolean {
56 |         return this.waitingSize > 0 || (this._cluster !== void 0 && this._cluster.hasWaitingTasks());
57 |     }
58 | 
59 |     setId(id: number) {
60 |         this.id = id;
61 |     }
62 | 
63 |     setRateLimit(rateLimit: number): void {
64 |         if (!Number.isInteger(rateLimit) || rateLimit < 0) {
65 |             throw new Error("rateLimit must be a non-negative integer");
66 |         }
67 |         this.rateLimit = rateLimit;
68 |         if (this.rateLimit > 0) this.maxConnections = 1;
69 |     }
70 | 
71 |     submit(options: { priority: number } | number, task: Task): void {
72 |         let priority = typeof options === "number" ? options : options.priority;
73 |         priority = Number.isInteger(priority) ? priority : this.defaultPriority;
74 |         priority = Math.min(priority, this.priorityLevels - 1);
75 |         this._waitingTasks.enqueue(task, priority);
76 |         this._schedule();
77 |     }
78 | 
79 |     private _schedule(): void {
80 |         if (this.runningSize < this.maxConnections && this.hasWaitingTasks()) {
81 |             ++this.runningSize;
82 |             const delay = Math.max(this.nextRequestTime - Date.now(), 0);
83 |             this.nextRequestTime = Date.now() + delay + this.rateLimit;
84 | 
85 |             const { next, rateLimiterId } = this.dequeue() as TaskWrapper;
86 |             setTimeout(() => {
87 |                 const done = () => {
88 |                     --this.runningSize;
89 |                     this._schedule();
90 |                 };
91 |                 next(done, rateLimiterId);
92 |             }, delay);
93 |         }
94 |     }
95 | 
96 |     directDequeue(): Task {
97 |         return this._waitingTasks.dequeue() as Task;
98 |     }
99 | 
100 |     dequeue(): TaskWrapper | undefined {
101 |         if (this.waitingSize) {
102 |             return {
103 |                 next: this._waitingTasks.dequeue() as Task,
104 |                 rateLimiterId: undefined,
105 |             };
106 |         }
107 |         return this._cluster?.dequeue();
108 |     }
109 | }
110 | 
111 | export default RateLimiter;
--------------------------------------------------------------------------------
/src/types/crawler.ts:
--------------------------------------------------------------------------------
1 | export type GlobalOnlyOptions = {
2 |     /**
3 |      * Global Only option.
4 |      * @default 10
5 |      * @description The maximum number of requests that can be sent simultaneously.
6 |      * @example If the value is 10, the crawler will send at most 10 requests at the same time.
7 |      * Note: maxConnections (> 1) takes effect only while the global rateLimit is 0.
8 |      */
9 |     maxConnections: number;
10 |     /**
11 |      * Global Only option.
12 |      * @default 10
13 |      * @description The number of levels of priority. Can only be assigned at the beginning.
14 |      */
15 |     priorityLevels: number;
16 |     /**
17 |      * Global Only option.
18 |      * @default 0
19 |      * @description The minimum time gap, in milliseconds, enforced between two consecutive tasks. Can only be assigned at the beginning.
20 |      * @example 1000 means a delay of 1000 milliseconds after each request.
21 |      */
22 |     rateLimit: number;
23 |     /**
24 |      * Global Only option.
25 |      * @default false
26 |      * @description If true, the crawler will skip duplicate tasks.
27 |      * @example If the task is already in the queue, the crawler will not add it again.
28 |      */
29 |     skipDuplicates: boolean;
30 |     /**
31 |      * Global Only option.
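     * (A hypothetical sketch of the effect: with { homogeneous: true }, a task queued via
     * crawler.add({ url: "http://example.com/slow", rateLimiterId: 1 }) that sits behind
     * limiter 1's backlog may be dequeued by an idle limiter instead; the URL and ids here
     * are illustrative only.)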
32 | * @default false 33 | * @description If true, the crawler will dynamically reallocate the tasks within the queue blocked due to header blocking to other queues. 34 | */ 35 | homogeneous: boolean; 36 | /** 37 | * Global Only option. 38 | * @default undefined 39 | * @description If passed, the crawler will rotate the user agent for each request. The "userAgents" option must be an array if activated. 40 | */ 41 | userAgents?: string | string[]; 42 | /** 43 | * Global Only option. 44 | * @default false 45 | * @description If true, the crawler will mute all warning and error messages. The request error will be still thrown. 46 | */ 47 | silence?: boolean; 48 | /** 49 | * @deprecated Please use "maxConnections" instead. 50 | */ 51 | maxConnection: number; 52 | /** 53 | * @deprecated Please use "rateLimit" instead. 54 | */ 55 | limiter?: number; 56 | }; 57 | 58 | export type RequestOptions = { 59 | forceUTF8?: boolean; 60 | /** 61 | * crawlerOption 62 | * @default true 63 | * @description If true, the crawler will use the cheerio library to parse the HTML content. 64 | * @see cheerio.load() 65 | * @example If inject successfully, the response object will have "$" property, which is a function to use jQuery. 66 | */ 67 | jQuery?: boolean; 68 | /** 69 | * @deprecated 70 | * @description Please use "jQuery" instead. 71 | */ 72 | jquery?: boolean; 73 | /** 74 | * @deprecated 75 | * @description Please use "jQuery" instead. 76 | */ 77 | JQuery?: boolean; 78 | /** 79 | * @deprecated 80 | * @description Please use "encoding" instead. 81 | */ 82 | incomingEncoding?: string | null; 83 | /** 84 | * @default "utf8" 85 | * @description The encoding of the response body. 86 | */ 87 | encoding?: string | null; 88 | /** 89 | * @default 0 90 | * @description rateLimiter ID 91 | */ 92 | rateLimiterId?: number; 93 | /** 94 | * @default 2 95 | * @description The retry count of the request. 96 | */ 97 | retries?: number; 98 | /** 99 | * @default 3000 100 | * @description The interval between retries in milliseconds. 101 | */ 102 | retryInterval?: number; 103 | /** 104 | * @default 20000 105 | * @description The global timeout of the request in milliseconds. 106 | */ 107 | timeout?: number; 108 | priority?: number; 109 | seenreq?: any; 110 | 111 | method?: string; 112 | skipEventRequest?: boolean; 113 | html?: boolean; 114 | proxies?: string[]; 115 | proxy?: string; 116 | http2?: boolean; 117 | body?: string | Record; 118 | headers?: Record; 119 | agent?: any; 120 | /** 121 | * @deprecated Please do not set 'logger' anymore. 122 | */ 123 | logger?: any; 124 | /** 125 | * @deprecated Please do not set 'debug' anymore. 126 | */ 127 | debug?: any; 128 | /** 129 | * @deprecated Please do not set 'time' anymore. 130 | */ 131 | time?: any; 132 | /** 133 | * @deprecated Please use "url" instead. 134 | */ 135 | uri?: string | ((urlFn: (url: string) => void) => void); 136 | url?: string | ((urlFn: (url: string) => void) => void); 137 | 138 | /** 139 | * @deprecated Please use "searchParams" instead. 140 | */ 141 | qs?: Record; 142 | searchParams?: Record; 143 | 144 | /** 145 | * @deprecated Please use "rejectUnauthorized" instead. 146 | */ 147 | strictSSL?: boolean; 148 | /** 149 | * @description If false, the crawler will ignore SSL certificate errors. 150 | * @default true 151 | */ 152 | rejectUnauthorized?: boolean; 153 | 154 | /** 155 | * @deprecated Please use "decompress" instead. 156 | */ 157 | gzip?: boolean; 158 | decompress?: boolean; 159 | 160 | /** 161 | * @deprecated Please use "cookieJar" instead. 
162 |      * @see tough-cookie https://github.com/sindresorhus/got/blob/main/documentation/migration-guides/request.md
163 |      */
164 |     jar?: object;
165 |     cookieJar?: object;
166 | 
167 |     /**
168 |      * @default false
169 |      *
170 |      * If true, the crawler will parse the response body as JSON.
171 |      * This will set 'jQuery' to false.
172 |      */
173 |     isJson?: boolean;
174 | 
175 |     referer?: string;
176 |     /**
177 |      * @deprecated
178 |      * @description Please use "userParams" instead.
179 |      */
180 |     gene?: string | Record<string, unknown>;
181 |     userParams?: unknown;
182 |     /**
183 |      * @deprecated Please use "parseJson" instead.
184 |      */
185 |     jsonReviver?: (text: string) => unknown;
186 |     parseJson?: (text: string) => unknown;
187 | 
188 |     /**
189 |      * @deprecated Please use "stringifyJson" instead.
190 |      */
191 |     jsonReplacer?: (object: unknown) => string;
192 |     stringifyJson?: (object: unknown) => string;
193 | 
194 |     preRequest?: (options: RequestOptions, done?: (error?: Error | null) => void) => void;
195 |     release?: () => void;
196 |     callback?: (error: unknown, response: CrawlerResponse, done?: unknown) => void;
197 | };
198 | 
199 | export type RequestConfig = string | RequestOptions | RequestOptions[];
200 | export type CrawlerOptions = Partial<GlobalOnlyOptions> & RequestOptions;
201 | export type CrawlerResponse = any;
202 | 
--------------------------------------------------------------------------------
/src/types/index.ts:
--------------------------------------------------------------------------------
1 | declare module "seenreq";
--------------------------------------------------------------------------------
/test/binaryDataStream.test.js:
--------------------------------------------------------------------------------
1 | import test from 'ava';
2 | import Crawler from '../dist/index.js';
3 | import nock from 'nock';
4 | import { testCb } from "./lib/avaTestCb.js";
5 | 
6 | 
7 | const binaryData = Buffer.from('Hello, World!', 'utf-8');
8 | 
9 | test.beforeEach(t => {
10 |     nock('http://example.com')
11 |         .get('/binary-data')
12 |         .reply(200, binaryData, {
13 |             'Content-Type': 'application/octet-stream',
14 |         });
15 | 
16 |     t.context.crawler = new Crawler({
17 |         encoding: null,
18 |         callback: (err, res, done) => {
19 |             if (err) {
20 |                 console.error(err.stack);
21 |                 return done(err);
22 |             }
23 | 
24 |             const buffers = [];
25 |             res.body.on('data', chunk => buffers.push(chunk));
26 |             res.body.on('end', () => {
27 |                 const result = Buffer.concat(buffers);
28 |                 t.is(result.toString(), 'Hello, World!', 'The binary stream should match the expected content');
29 |                 done();
30 |             });
31 |         },
32 |     });
33 | });
34 | 
35 | 
36 | testCb(test, 'should correctly handle and process a binary data stream', async t => {
37 |     t.context.crawler.send({
38 |         url: 'http://example.com/binary-data',
39 |         callback: (error, res) => {
40 |             t.is(error, null);
41 |             t.is(res.statusCode, 200);
42 |             t.end();
43 |         },
44 |     });
45 | });
--------------------------------------------------------------------------------
/test/cacheOptions.js:
--------------------------------------------------------------------------------
1 | import test from "ava";
2 | import { testCb } from "./lib/avaTestCb.js";
3 | import nock from "nock";
4 | import Crawler from "../dist/index.js";
5 | 
6 | test.beforeEach(t => {
7 |     t.context.scope = nock("http://target.com");
8 | });
9 | test.afterEach(t => {
10 |     t.context.c = {};
11 | });
12 | 
13 | testCb(test, "Shouldn't skip a single url if duplicates are active.", async t => {
14 |     t.context.scope.get("/").reply(200);
15 |     t.context.c = new Crawler({
16 |         // 
silence: true, 17 | skipDuplicates: true, 18 | callback: (error, result, done) => { 19 | t.is(error, null); 20 | t.is(result.statusCode, 200); 21 | t.true(t.context.scope.isDone()); 22 | t.end(); 23 | }, 24 | }); 25 | t.context.c.add("http://target.com"); 26 | }); 27 | 28 | testCb(test, "Should notify the callback when an error occurs and 'retries' is disabled.", async t => { 29 | t.context.scope.get("/").replyWithError("Bad request."); 30 | t.context.c = new Crawler({ 31 | // silence: true, 32 | jQuery: false, 33 | skipDuplicates: true, 34 | retries: 0, 35 | callback: (error, result, done) => { 36 | t.truthy(error); 37 | t.true(t.context.scope.isDone()); 38 | t.end(); 39 | }, 40 | }); 41 | t.context.c.add("http://target.com"); 42 | }); 43 | 44 | testCb(test, "Should retry and notify the callback when an error occurs and 'retries' is enabled.", async t => { 45 | t.context.scope.get("/").replyWithError("Bad request.").persist(); 46 | t.context.c = new Crawler({ 47 | jQuery: false, 48 | skipDuplicates: true, 49 | retries: 1, 50 | retryInterval: 10, 51 | callback: (error, result, done) => { 52 | t.truthy(error); 53 | t.true(t.context.scope.isDone()); 54 | t.context.scope.persist(false); 55 | t.end(); 56 | }, 57 | }); 58 | t.context.c.add("http://target.com"); 59 | }); 60 | 61 | testCb(test, "Should skip previously crawled urls when 'skipDuplicates' is active.", async t => { 62 | t.context.scope.get("/").reply(200).persist(); 63 | t.plan(3); 64 | t.context.c = new Crawler({ 65 | jQuery: false, 66 | skipDuplicates: true, 67 | callback: (error, result, done) => { 68 | t.is(error, null); 69 | t.is(result.statusCode, 200); 70 | t.true(t.context.scope.isDone()); 71 | t.context.c.add("http://target.com"); 72 | done(); 73 | }, 74 | }); 75 | t.context.c.add("http://target.com"); 76 | t.context.c.on("drain", () => { 77 | t.end(); 78 | }); 79 | }); 80 | -------------------------------------------------------------------------------- /test/callback.js: -------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import nock from "nock"; 4 | import Crawler from "../dist/index.js"; 5 | 6 | const url = "http://www.whatever.com"; 7 | test.before(t => { 8 | nock.cleanAll(); 9 | }); 10 | test.beforeEach(t => { 11 | t.context.crawler = new Crawler({ 12 | // silence: true, 13 | retryInterval: 0, 14 | retries: 0, 15 | timeout: 100, 16 | }); 17 | }); 18 | test.afterEach(t => { 19 | t.context.crawler = null; 20 | }); 21 | 22 | testCb(test, "should end as expected without callback", async t => { 23 | t.context.scope = nock(url).get("/get").reply(200, "", { 24 | "Content-Type": "text/html", 25 | }); 26 | t.context.crawler.on("drain", () => { 27 | t.true(t.context.scope.isDone()); 28 | t.end(); 29 | }); 30 | t.context.crawler.add(`${url}/get`); 31 | }); 32 | 33 | testCb(test, "should end as expected without callback when timedout", async t => { 34 | t.context.scope = nock(url).get("/delay").delayBody(500).reply(200, "", { 35 | "Content-Type": "text/html", 36 | }); 37 | t.context.crawler.on("drain", () => { 38 | t.true(t.context.scope.isDone()); 39 | t.end(); 40 | }); 41 | t.context.crawler.add(`${url}/delay`); 42 | }); 43 | -------------------------------------------------------------------------------- /test/cookieJar.js: -------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import nock from "nock"; 4 | import 
Crawler from "../dist/index.js"; 5 | import { CookieJar } from "tough-cookie"; 6 | 7 | test.before(t => { 8 | nock.cleanAll(); 9 | nock("http://test.crawler.com/").get("/setCookie").reply(function () { 10 | let response = [200, "ok", 11 | { 12 | "Set-Cookie": `ping=pong; Domain=.crawler.com; Expires=${new Date( 13 | Date.now() + 86400000 14 | ).toUTCString()}; Path=/`, 15 | }, 16 | ]; 17 | return response; 18 | }).persist(); 19 | nock("http://test.crawler.com/").get("/getCookie").reply(200, function () { 20 | return this.req.headers.cookie; 21 | }).persist(); 22 | const jar = new CookieJar(); 23 | jar.setCookieSync("foo=bar", "http://test.crawler.com"); 24 | t.context.jar = jar; 25 | t.context.crawler = new Crawler({ 26 | // silence: true, 27 | jQuery: false, 28 | jar: t.context.jar, 29 | }); 30 | }); 31 | 32 | testCb(test, "should send with cookie when setting jar options", async t => { 33 | t.context.crawler.add({ 34 | url: "http://test.crawler.com/getCookie", 35 | callback: (error, response, done) => { 36 | t.is(error, null); 37 | t.is(response.body, t.context.jar.getCookieStringSync("http://test.crawler.com")); 38 | done(); 39 | t.end(); 40 | } 41 | }); 42 | }); 43 | 44 | testCb(test, "should set cookie when response set-cookie headers exist", async t => { 45 | t.context.crawler.add({ 46 | url: "http://test.crawler.com/setCookie", 47 | callback: (error, response, done) => { 48 | t.is(error, null); 49 | t.true(t.context.jar.getCookieStringSync("http://test.crawler.com").includes("ping=pong")); 50 | done(); 51 | t.end(); 52 | } 53 | }); 54 | }); 55 | -------------------------------------------------------------------------------- /test/direct.js: -------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import nock from "nock"; 4 | import Crawler from "../dist/index.js"; 5 | import sinon from "sinon"; 6 | 7 | test.before(t => { 8 | nock.cleanAll(); 9 | nock("http://test.crawler.com").get("/").reply(200, "ok").persist(); 10 | }); 11 | test.beforeEach(t => { 12 | t.context.cb = sinon.spy(); 13 | t.context.crawler = new Crawler({ 14 | // silence: true, 15 | jQuery: false, 16 | rateLimit: 100, 17 | preRequest: (options, done) => { 18 | t.context.cb("preRequest"); 19 | done(); 20 | }, 21 | callback: (err, res, done) => { 22 | if (err) { 23 | t.context.cb("error"); 24 | } else { 25 | t.context.cb("callback"); 26 | } 27 | done(); 28 | }, 29 | }); 30 | t.context.crawler.on("request", () => { 31 | t.context.cb("Event:request"); 32 | }); 33 | }); 34 | test.afterEach(t => { 35 | t.context.crawler = null; 36 | }); 37 | 38 | testCb(test, "should not trigger preRequest or callback of crawler instance", async t => { 39 | t.context.crawler.send({ 40 | url: "http://test.crawler.com/", 41 | callback: (error, res) => { 42 | t.is(error, null); 43 | t.is(res.statusCode, 200); 44 | t.is(res.body, "ok"); 45 | t.false(t.context.cb.called); 46 | t.end(); 47 | }, 48 | }); 49 | }); 50 | 51 | testCb(test, "should be sent directly regardless of current queue of crawler", async t => { 52 | t.context.crawler.add({ 53 | url: "http://test.crawler.com/", 54 | callback: (error, res, done) => { 55 | t.is(error, null); 56 | t.context.crawler.send({ 57 | url: "http://test.crawler.com/", 58 | callback: () => { 59 | t.is(t.context.cb.getCalls().length, 2); 60 | t.context.cb("direct"); 61 | }, 62 | }); 63 | done(); 64 | }, 65 | }); 66 | t.context.crawler.add("http://test.crawler.com/"); 67 | 
t.context.crawler.add("http://test.crawler.com/"); 68 | t.context.crawler.add({ 69 | url: "http://test.crawler.com/", 70 | callback: (error, res, done) => { 71 | t.is(error, null); 72 | const seq = [ 73 | "preRequest", 74 | "Event:request", 75 | "direct", 76 | "preRequest", 77 | "Event:request", 78 | "callback", 79 | "preRequest", 80 | "Event:request", 81 | "callback", 82 | "preRequest", 83 | "Event:request", 84 | ]; 85 | t.deepEqual( 86 | t.context.cb.args.map(args => args[0]), 87 | seq 88 | ); 89 | t.end(); 90 | }, 91 | }); 92 | }); 93 | 94 | testCb(test, "should not trigger Event:request by default.", async t => { 95 | t.context.crawler.send({ 96 | url: "http://test.crawler.com/", 97 | callback: (error, res) => { 98 | t.is(error, null); 99 | t.is(res.statusCode, 200); 100 | t.is(res.body, "ok"); 101 | t.false(t.context.cb.calledWith("Event:request")); 102 | t.end(); 103 | }, 104 | }); 105 | }); 106 | 107 | testCb(test, "should trigger Event:request if set.", async t => { 108 | t.context.crawler.send({ 109 | url: "http://test.crawler.com/", 110 | skipEventRequest: false, 111 | callback: (error, res) => { 112 | t.is(error, null); 113 | t.is(res.statusCode, 200); 114 | t.is(res.body, "ok"); 115 | t.true(t.context.cb.calledWith("Event:request")); 116 | t.end(); 117 | }, 118 | }); 119 | }); 120 | -------------------------------------------------------------------------------- /test/encoding.js: -------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import nock from "nock"; 4 | import Crawler from "../dist/index.js"; 5 | 6 | const origin = "http://czyborra.com"; 7 | const encodingFileName = "iso8859.html"; 8 | const charsetName = "ISO-8859-1"; 9 | const path = `/charsets/${encodingFileName}`; 10 | const url = `${origin}${path}`; 11 | const pathWithoutCharsetHeader = `/charsets-noheader/${encodingFileName}`; 12 | const urlWithoutCharsetHeader = `${origin}${pathWithoutCharsetHeader}`; 13 | 14 | test.before(t => { 15 | nock.cleanAll(); 16 | }); 17 | test.beforeEach(t => { 18 | t.context.crawler = new Crawler({ 19 | retries: 0 20 | }); 21 | 22 | nock(origin).get(path).replyWithFile(200, `test/lib/${encodingFileName}`, { "Content-Type": `text/html;charset=${charsetName}` }); 23 | nock(origin).get(pathWithoutCharsetHeader).replyWithFile(200, `test/lib/${encodingFileName}`, { "Content-Type": "text/html" }); 24 | }); 25 | test.afterEach(t => { 26 | t.context.crawler = null; 27 | }); 28 | 29 | testCb(test, "should parse latin-1", async t => { 30 | t.context.crawler.add({ 31 | url, 32 | callback: (error, result) => { 33 | t.is(error, null); 34 | t.is(result.charset, charsetName.toLowerCase()); 35 | t.true(result.body.indexOf("Jörg") > 0); 36 | t.end(); 37 | } 38 | }); 39 | }); 40 | 41 | testCb(test, "should return buffer if encoding = null", async t => { 42 | t.context.crawler.add({ 43 | url, 44 | encoding: null, 45 | callback: (error, result) => { 46 | t.is(error, null); 47 | t.true(result.body instanceof Buffer); 48 | t.end(); 49 | } 50 | }); 51 | }); 52 | 53 | testCb(test, "should parse latin-1 if encoding = ISO-8859-1", async t => { 54 | t.context.crawler.add({ 55 | url, 56 | encoding: charsetName, 57 | callback: (error, result) => { 58 | t.is(error, null); 59 | t.is(result.charset, charsetName.toLowerCase()); 60 | t.true(result.body.indexOf("Jörg") > 0); 61 | t.end(); 62 | } 63 | }); 64 | }); 65 | 66 | testCb(test, "could not parse latin-1 if encoding = gb2312", async t => { 67 | 
t.context.crawler.add({
68 |         url,
69 |         encoding: "gb2312",
70 |         callback: (error, result) => {
71 |             t.is(error, null);
72 |             t.is(result.body.indexOf("Jörg"), -1);
73 |             t.end();
74 |         }
75 |     });
76 | });
77 | 
78 | testCb(test, "should parse charset from header", async t => {
79 |     t.context.crawler.add({
80 |         url,
81 |         callback: (error, result) => {
82 |             t.is(error, null);
83 |             t.is(result.charset, charsetName.toLowerCase());
84 |             t.true(result.body.indexOf("Jörg") > 0);
85 |             t.end();
86 |         }
87 |     });
88 | });
89 | 
90 | testCb(test, "should parse charset from meta tag in html if header does not contain content-type key", async t => {
91 |     t.context.crawler.add({
92 |         url: urlWithoutCharsetHeader,
93 |         callback: (error, result) => {
94 |             t.is(error, null);
95 |             t.is(result.charset, charsetName.toLowerCase());
96 |             t.true(result.body.indexOf("Jörg") > 0);
97 |             t.end();
98 |         }
99 |     });
100 | });
--------------------------------------------------------------------------------
/test/errorHandling.js:
--------------------------------------------------------------------------------
1 | import test from "ava";
2 | import { testCb } from "./lib/avaTestCb.js";
3 | import nock from "nock";
4 | import Crawler from "../dist/index.js";
5 | 
6 | test.before(t => {
7 |     nock.cleanAll();
8 |     nock("http://test.crawler.com").get("/delay/1").delay(1000).reply(200, "ok").persist();
9 |     nock("http://test.crawler.com").get("/status/400").reply(400, "Bad Request").persist();
10 |     nock("http://test.crawler.com").get("/status/401").reply(401, "Unauthorized").persist();
11 |     nock("http://test.crawler.com").get("/status/403").reply(403, "Forbidden").persist();
12 |     nock("http://test.crawler.com").get("/status/404").reply(404, "Not Found").persist();
13 |     nock("http://test.crawler.com").get("/status/500").reply(500, "Internal Error").persist();
14 |     nock("http://test.crawler.com").get("/status/204").reply(204, "").persist();
15 | });
16 | test.beforeEach(t => {
17 |     t.context.crawler = new Crawler({
18 |         // silence: true,
19 |         timeout: 500,
20 |         retryInterval: 500,
21 |         retries: 2,
22 |         jQuery: false,
23 |     });
24 | });
25 | test.afterEach(t => {
26 |     t.context.crawler = null;
27 | });
28 | 
29 | testCb(test, "should retry after timeout", async t => {
30 |     let options = {
31 |         url: "http://test.crawler.com/delay/1",
32 |         callback: (error, response, done) => {
33 |             t.truthy(error);
34 |             t.is(response.options.retries, 0);
35 |             t.end();
36 |         },
37 |     };
38 |     t.context.crawler.add(options);
39 |     t.is(options.retries, 2);
40 | });
41 | 
42 | testCb(test, "should return a timeout error after ~2sec", async t => {
43 |     t.context.crawler.add({
44 |         url: "http://test.crawler.com/delay/1",
45 |         callback: (error, response, done) => {
46 |             t.truthy(error);
47 |             t.true(error.code === "ETIMEDOUT" || error.code === "ESOCKETTIMEDOUT");
48 |             t.end();
49 |         },
50 |     });
51 | });
52 | 
53 | testCb(test, "should not fail on an empty response", async t => {
54 |     t.context.crawler.add({
55 |         url: "http://test.crawler.com/status/204",
56 |         callback: (error, response, done) => {
57 |             t.falsy(error);
58 |             t.is(response.statusCode, 204);
59 |             t.end();
60 |         },
61 |     });
62 | });
63 | 
64 | testCb(test, "should not fail on malformed html if jQuery is false", async t => {
65 |     t.context.crawler.add({
66 |         html: "<p>hello<div>dude</p>
", 67 | callback: (error, response, done) => { 68 | t.falsy(error); 69 | t.truthy(response); 70 | t.end(); 71 | }, 72 | }); 73 | }); 74 | -------------------------------------------------------------------------------- /test/examples.js: -------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import nock from "nock"; 4 | import Crawler from "../dist/index.js"; 5 | import sinon from "sinon"; 6 | 7 | test.before(t => { 8 | nock.cleanAll(); 9 | }); 10 | test.beforeEach(t => { 11 | nock("http://nockhost") 12 | .get(url => url.indexOf("status") >= 0) 13 | .times(20) 14 | .reply(200, "Yes"); 15 | t.context.crawler = new Crawler({ 16 | // silence: true, 17 | maxConnections: 10, 18 | jQuery: false, 19 | }); 20 | }); 21 | test.afterEach(t => { 22 | t.context.crawler = null; 23 | t.context.cb = null; 24 | }); 25 | 26 | testCb(test, "should run the first readme examples.", async t => { 27 | t.context.crawler.add({ 28 | url: "http://github.com", 29 | callback: (err, res, done) => { 30 | t.falsy(err); 31 | t.is(typeof res.body, "string"); 32 | t.end(); 33 | }, 34 | }); 35 | }); 36 | 37 | testCb(test, "should run the readme examples.", async t => { 38 | t.context.crawler = new Crawler({ 39 | // silence: true, 40 | maxConnections: 10, 41 | jQuery: false, 42 | callback: (err, res, done) => { 43 | t.falsy(err); 44 | done(); 45 | }, 46 | }); 47 | t.context.cb = sinon.spy(t.context.crawler, "add"); 48 | t.context.crawler.add("http://nockhost/status/200"); 49 | t.context.crawler.add("http://nockhost/status/200"); 50 | t.context.crawler.on("drain", () => { 51 | t.true(t.context.cb.calledTwice); 52 | t.end(); 53 | }); 54 | }); 55 | 56 | testCb(test, "should run the with an array queue.", async t => { 57 | t.context.crawler.add([ 58 | { 59 | url: "http://www.github.com", 60 | jQuery: true, 61 | callback: (err, res, done) => { 62 | t.falsy(err); 63 | t.truthy(res.$); 64 | t.is(typeof res.body, "string"); 65 | done(); 66 | }, 67 | }, 68 | ]); 69 | t.context.crawler.on("drain", () => { 70 | t.end(); 71 | }); 72 | }); 73 | -------------------------------------------------------------------------------- /test/http2ErrorHanding.js: -------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import Crawler from "../dist/index.js"; 4 | 5 | test.before(t => { 6 | t.context.crawler = new Crawler({ 7 | // silence: true, 8 | timeout: 1000, 9 | retryInterval: 0, 10 | retries: 2, 11 | jQuery: false, 12 | http2: true, 13 | }); 14 | }); 15 | 16 | testCb(test, "http2: should retry after timeout.", async t => { 17 | const options = { 18 | url: "https://nghttp2.org/httpbin/delay/4", 19 | callback: (error, response, done) => { 20 | t.truthy(error); 21 | t.is(response.options.retries, 0); 22 | done(); 23 | t.end(); 24 | }, 25 | }; 26 | t.context.crawler.add(options); 27 | t.is(options.retries, 2); 28 | }); 29 | 30 | testCb(test, "http2: should return a timeout error after ~3sec.", async t => { 31 | t.context.crawler.add({ 32 | url: "https://nghttp2.org/httpbin/delay/4", 33 | callback: (error, response, done) => { 34 | t.truthy(error); 35 | t.true(error.code === "ETIMEDOUT" || error.code === "ESOCKETTIMEDOUT"); 36 | done(); 37 | t.end(); 38 | }, 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /test/http2Response.js: 
-------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import Crawler from "../dist/index.js"; 4 | 5 | test.afterEach(t => { 6 | t.context.crawler = null; 7 | }); 8 | 9 | testCb(test, "response statusCode.", async t => { 10 | t.context.crawler = new Crawler({ 11 | // silence: true, 12 | timeout: 10000, 13 | retryInterval: 1000, 14 | retries: 2, 15 | jQuery: false, 16 | http2: true, 17 | }); 18 | t.context.crawler.add({ 19 | url: "https://nghttp2.org/httpbin/status/200", 20 | callback: (error, response, done) => { 21 | t.is(response.statusCode, 200); 22 | done(); 23 | t.end(); 24 | }, 25 | }); 26 | }); 27 | 28 | testCb(test, "response headers.", async t => { 29 | t.context.crawler = new Crawler({ 30 | // silence: true, 31 | retryInterval: 1000, 32 | retries: 2, 33 | jQuery: false, 34 | http2: true, 35 | }); 36 | t.context.crawler.add({ 37 | url: "https://nghttp2.org/httpbin/status/200", 38 | callback: (error, response, done) => { 39 | t.truthy(response.headers); 40 | t.is(typeof response.headers, "object"); 41 | t.is(response.headers["content-type"], "text/html; charset=utf-8"); 42 | done(); 43 | t.end(); 44 | }, 45 | }); 46 | }); 47 | 48 | testCb(test, "html response body.", async t => { 49 | t.context.crawler = new Crawler({ 50 | // silence: true, 51 | retryInterval: 1000, 52 | retries: 2, 53 | jQuery: true, 54 | http2: true, 55 | }); 56 | t.context.crawler.add({ 57 | url: "https://nghttp2.org/httpbin/html", 58 | callback: (error, response, done) => { 59 | t.truthy(response.$); 60 | t.is(typeof response.$, "function"); 61 | t.is(response.$("body").length, 1); 62 | done(); 63 | t.end(); 64 | }, 65 | }); 66 | }); 67 | -------------------------------------------------------------------------------- /test/lib/avaTestCb.js: -------------------------------------------------------------------------------- 1 | export const testCbAsync = (test, description, assertions) => { 2 | test(description, async t => { 3 | await new Promise(resolve => { 4 | // eslint-disable-next-linse @typescript-eslint/no-explicit-any 5 | t.end = () => { 6 | resolve(undefined); 7 | }; 8 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 9 | assertions(t); 10 | }); 11 | }); 12 | }; 13 | export const testCbSync = (test, description, assertions) => { 14 | test.serial(description, async t => { 15 | await new Promise(resolve => { 16 | // eslint-disable-next-linse @typescript-eslint/no-explicit-any 17 | t.end = () => { 18 | resolve(undefined); 19 | }; 20 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 21 | assertions(t); 22 | }); 23 | }); 24 | }; 25 | 26 | // @todo: add async test version 27 | export const testCb = testCbSync; 28 | -------------------------------------------------------------------------------- /test/lib/iso8859.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bda-research/node-crawler/5f6219c02925299953918de9d39954895e17b187/test/lib/iso8859.html -------------------------------------------------------------------------------- /test/limiter.js: -------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import nock from "nock"; 4 | import Crawler from "../dist/index.js"; 5 | 6 | test.before(t => { 7 | nock.cleanAll(); 8 | }); 9 | test.beforeEach(t => { 10 | nock("http://nockHost") 11 | .get(url => 
url.indexOf("status") >= 0) 12 | .times(5) 13 | .reply(200, "Yes"); 14 | t.context.crawler = new Crawler({ 15 | // silence: true, 16 | jQuery: false, 17 | rateLimit: 500, 18 | retries: 0, 19 | callback: (err, result, done) => { 20 | t.is(err, null); 21 | t.is(result.statusCode, 200); 22 | done(); 23 | }, 24 | }); 25 | t.context.tsArrs = []; 26 | t.context.crawler.on("request", () => { 27 | t.context.tsArrs.push(Date.now()); 28 | }); 29 | }); 30 | test.afterEach(t => { 31 | t.context.crawler = null; 32 | t.context.tsArrs = []; 33 | }); 34 | 35 | testCb(test, "One limiter, tasks should execute one by one", async t => { 36 | for (let i = 0; i < 5; i++) { 37 | t.context.crawler.add({ url: "http://nockHost/status/200" }); 38 | } 39 | t.context.crawler.on("drain", () => { 40 | t.is(t.context.tsArrs.length, 5); 41 | // setTimeout in nodejs is delayed 42 | // 4 rateLimit +- 50ms = 4 * 500 +- 50 43 | t.true(t.context.tsArrs[4] - t.context.tsArrs[0] >= 1950); 44 | t.true(t.context.tsArrs[4] - t.context.tsArrs[0] <= 2050); 45 | t.end(); 46 | }); 47 | }); 48 | 49 | testCb(test, "Multiple limiters, tasks should execute in parallel", async t => { 50 | for (let i = 0; i < 5; i++) { 51 | t.context.crawler.add({ url: "http://nockHost/status/200", rateLimiterId: i }); 52 | } 53 | t.context.crawler.on("drain", () => { 54 | t.is(t.context.tsArrs.length, 5); 55 | // setTimeout in nodejs is delayed 56 | // request sent almost at same time 57 | t.true(t.context.tsArrs[4] - t.context.tsArrs[0] <= 50); 58 | t.end(); 59 | }); 60 | }); 61 | 62 | testCb(test, "Multiple limiters are mutual independent", async t => { 63 | for (let i = 0; i < 5; i++) { 64 | const limiter = i === 4 ? "second" : "default"; 65 | t.context.crawler.add({ url: "http://nockHost/status/200", rateLimiterId: limiter }); 66 | } 67 | t.context.crawler.on("drain", () => { 68 | t.is(t.context.tsArrs.length, 5); 69 | // setTimeout in nodejs is delayed 70 | // 3 rateLimit +- 50ms = 3 * 500 +- 50 71 | t.true(t.context.tsArrs[4] - t.context.tsArrs[0] >= 1450); 72 | t.true(t.context.tsArrs[4] - t.context.tsArrs[0] <= 1550); 73 | t.end(); 74 | }); 75 | }); 76 | 77 | testCb(test, "should modify maxConnections when rateLimit is set", async t => { 78 | nock.cleanAll(); 79 | nock("http://nockHost").get(url => url.indexOf("status") >= 0).times(1).reply(200, "Yes"); 80 | t.context.crawler.add({ 81 | url: "http://nockHost/status/200", 82 | callback: (err, result, done) => { 83 | t.is(err, null); 84 | t.is(result.statusCode, 200); 85 | done(); 86 | }, 87 | }); 88 | t.context.crawler.on("drain", () => { 89 | t.is(t.context.crawler.options.maxConnections, 1); 90 | t.end(); 91 | }); 92 | }); 93 | -------------------------------------------------------------------------------- /test/preRequest.js: -------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import nock from "nock"; 4 | import Crawler from "../dist/index.js"; 5 | import sinon from "sinon"; 6 | 7 | test.before(t => { 8 | nock.cleanAll(); 9 | nock("http://test.crawler.com").get("/").reply(200, "ok").persist(); 10 | }); 11 | test.beforeEach(t => { 12 | t.context.cb = sinon.spy(); 13 | }); 14 | 15 | testCb(test, "Should do preRequest before request when preRequest defined in crawler options.", async t => { 16 | t.context.crawler = new Crawler({ 17 | // silence: true, 18 | jQuery: false, 19 | preRequest: (options, done) => { 20 | setTimeout(() => { 21 | t.context.cb("preRequest"); 22 | done(); 23 | }, 50); 
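            // done() resumes the crawler once the async preRequest work finishes;
            // per the last test in this file, calling done(new Error(...)) here
            // instead would make the crawler retry the task (twice by default).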
24 | }, 25 | }); 26 | t.context.crawler.add({ 27 | url: "http://test.crawler.com/", 28 | callback: (error, response, done) => { 29 | t.is(error, null); 30 | t.is(t.context.cb.getCalls().length, 1); 31 | t.is(t.context.cb.getCalls()[0].args[0], "preRequest"); 32 | done(); 33 | t.end(); 34 | }, 35 | }); 36 | }); 37 | 38 | testCb(test, "Should do preRequest before request when preRequest defined in add options.", async t => { 39 | t.context.crawler = new Crawler({ 40 | // silence: true, 41 | jQuery: false 42 | }); 43 | t.context.crawler.add({ 44 | url: "http://test.crawler.com/", 45 | preRequest: (options, done) => { 46 | setTimeout(() => { 47 | t.context.cb("preRequest"); 48 | done(); 49 | }, 50); 50 | }, 51 | callback: (error, response, done) => { 52 | t.is(error, null); 53 | t.is(t.context.cb.getCalls().length, 1); 54 | t.is(t.context.cb.getCalls()[0].args[0], "preRequest"); 55 | done(); 56 | t.end(); 57 | }, 58 | }); 59 | }); 60 | 61 | testCb(test, "preRequest should be executed the same times as request.", async t => { 62 | t.context.crawler = new Crawler({ 63 | // silence: true, 64 | jQuery: false, 65 | rateLimit: 50, 66 | preRequest: (options, done) => { 67 | t.context.cb("preRequest"); 68 | done(); 69 | }, 70 | callback: (error, response, done) => { 71 | t.is(error, null); 72 | t.context.cb("callback"); 73 | done(); 74 | }, 75 | }); 76 | const seq = []; 77 | for (let i = 0; i < 5; i++) { 78 | t.context.crawler.add("http://test.crawler.com/"); 79 | seq.push("preRequest"); 80 | seq.push("callback"); 81 | } 82 | t.context.crawler.add({ 83 | url: "http://test.crawler.com/", 84 | preRequest: (options, done) => done(), 85 | callback: (error, response, done) => { 86 | t.is(error, null); 87 | t.deepEqual( 88 | t.context.cb.getCalls().map(call => call.args[0]), 89 | seq 90 | ); 91 | done(); 92 | t.end(); 93 | }, 94 | }); 95 | }); 96 | 97 | testCb(test, "when preRequest fail, should retry two times by default.", async t => { 98 | t.context.crawler = new Crawler({ 99 | // silence: true, 100 | jQuery: false, 101 | rateLimit: 20, 102 | retryInterval: 0, 103 | preRequest: (options, done) => { 104 | t.context.cb("preRequest"); 105 | done(new Error("error")); 106 | }, 107 | callback: (error, response, done) => { 108 | t.truthy(error instanceof Error); 109 | t.is(t.context.cb.getCalls().length, 3); 110 | t.deepEqual( 111 | t.context.cb.getCalls().map(call => call.args[0]), 112 | ["preRequest", "preRequest", "preRequest"] 113 | ); 114 | done(); 115 | t.end(); 116 | }, 117 | }); 118 | t.context.crawler.add("http://test.crawler.com/"); 119 | }); 120 | -------------------------------------------------------------------------------- /test/priority.js: -------------------------------------------------------------------------------- 1 | import test from "ava"; 2 | import { testCb } from "./lib/avaTestCb.js"; 3 | import nock from "nock"; 4 | import Crawler from "../dist/index.js"; 5 | 6 | test.before(t => { 7 | nock.cleanAll(); 8 | nock('http://nockHost').get(url => url.indexOf('links') >= 0).times(4).reply(200, 'Yes'); 9 | t.context.crawler = new Crawler({ jQuery: false, maxConnections: 1 }); 10 | }); 11 | 12 | testCb(test, "should execute requests in the correct order", async t => { 13 | t.context.spf = []; 14 | let cnt = 0; 15 | t.context.crawler.add([{ 16 | url: 'http://nockHost/links/0', 17 | priority: 4, 18 | callback: (error, result, done) => { 19 | t.context.spf[cnt++] = 0; 20 | done(); 21 | } 22 | }]) 23 | t.context.crawler.add([{ 24 | url: 'http://nockHost/links/1', 25 | priority: 3, 26 | callback: 
(error, result, done) => {
27 |             t.context.spf[cnt++] = 1;
28 |             done();
29 |         }
30 |     }])
31 |     t.context.crawler.add([{
32 |         url: 'http://nockHost/links/2',
33 |         priority: 2,
34 |         callback: (error, result, done) => {
35 |             t.context.spf[cnt++] = 2;
36 |             done();
37 |         }
38 |     }])
39 |     t.context.crawler.add([{
40 |         url: 'http://nockHost/links/3',
41 |         priority: 1,
42 |         callback: (error, result, done) => {
43 |             t.context.spf[cnt++] = 3;
44 |             done();
45 |         }
46 |     }])
47 |     t.context.crawler.on("drain", () => {
48 |         t.deepEqual(t.context.spf, [0, 3, 2, 1]);
49 |         t.end();
50 |     });
51 | });
52 | 
--------------------------------------------------------------------------------
/test/rateLimit.js:
--------------------------------------------------------------------------------
1 | import test from "ava";
2 | import { testCb } from "./lib/avaTestCb.js";
3 | import nock from "nock";
4 | import Crawler from "../dist/index.js";
5 | 
6 | test.before(t => {
7 |     nock.cleanAll();
8 | });
9 | test.beforeEach(t => {
10 |     t.context.c = new Crawler({
11 |         // silence: true,
12 |         retries: 0,
13 |         rateLimit: 500,
14 |         callback: (err, res, done) => {
15 |             t.is(err, null);
16 |             t.is(res.statusCode, 200);
17 |             done();
18 |         }
19 |     });
20 |     t.context.c.on('request', () => t.context.tsArrs.push(Date.now()));
21 |     t.context.tsArrs = [];
22 | });
23 | test.afterEach(t => {
24 |     nock.cleanAll();
25 |     t.context.c = {};
26 |     t.context.tsArrs = [];
27 | });
28 | 
29 | testCb(test, "Interval of two requests should be no less than 500ms", async t => {
30 |     nock('http://nockHost').get(url => url.includes('status')).times(2).delay(500).reply(200, 'Yes');
31 |     t.context.c.add({ url: 'http://nockHost/status/200' });
32 |     t.context.c.add({
33 |         url: 'http://nockHost/status/200',
34 |         callback: (err, res, done) => {
35 |             t.is(err, null);
36 |             t.is(res.statusCode, 200);
37 |             t.is(t.context.tsArrs.length, 2);
38 |             t.true(t.context.tsArrs[1] - t.context.tsArrs[0] >= 500);
39 |             // done() must be called exactly once per task.
40 |             done();
41 |         }
42 |     });
43 |     t.context.c.on("drain", t.end);
44 | });
45 | 
46 | testCb(test, "request speed should abide by rateLimit", async t => {
47 |     nock('http://nockHost').get(url => url.includes('status')).times(5).reply(200, 'Yes');
48 |     for (let i = 0; i < 5; i++) {
49 |         t.context.c.add('http://nockHost/status/200');
50 |     }
51 |     t.context.c.on("drain", () => {
52 |         t.is(t.context.tsArrs.length, 5);
53 |         for (let i = 1; i < 5; i++) {
54 |             const interval = t.context.tsArrs[i] - t.context.tsArrs[i - 1];
55 |             t.true(Math.abs(interval - 500) < 30);
56 |         }
57 |         t.end();
58 |     });
59 | });
60 | 
61 | testCb(test, "should be able to change rateLimit", async t => {
62 |     nock('http://nockHost').get(url => url.includes('status')).times(5).reply(200, 'Yes');
63 |     t.context.c.setLimiter(0, 'rateLimit', 300);
64 |     for (let i = 0; i < 5; i++) {
65 |         t.context.c.add('http://nockHost/status/200');
66 |     }
67 |     t.context.c.on("drain", () => {
68 |         t.is(t.context.tsArrs.length, 5);
69 |         for (let i = 1; i < 5; i++) {
70 |             const interval = t.context.tsArrs[i] - t.context.tsArrs[i - 1];
71 |             t.true(Math.abs(interval - 300) < 30);
72 |         }
73 |         t.end();
74 |     });
75 | });
76 | 
--------------------------------------------------------------------------------
/test/requests.js:
--------------------------------------------------------------------------------
1 | import test from "ava";
2 | import { testCb } from "./lib/avaTestCb.js";
3 | import nock from "nock";
4 | import Crawler from "../dist/index.js";
5 | 
6 | const origin = 'http://www.whatever.com';
7 | const path = '/get';
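// Note: the crawler below is built with isJson: true, which per src/types/crawler.ts
// parses response bodies as JSON (and turns jQuery off) — so res.body['user-agent']
// in the user-agent tests reads a field of the parsed headers object, not a raw string.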
/test/requests.js:
--------------------------------------------------------------------------------
1 | import test from "ava";
2 | import { testCb } from "./lib/avaTestCb.js";
3 | import nock from "nock";
4 | import Crawler from "../dist/index.js";
5 | 
6 | const origin = 'http://www.whatever.com';
7 | const path = '/get';
8 | const headerPath = '/header';
9 | test.before(t => {
10 |     nock.cleanAll();
11 | });
12 | test.beforeEach(t => {
13 |     t.context.crawler = new Crawler({
14 |         // silence: true,
15 |         retries: 0,
16 |         isJson: true,
17 |         callback: (err, res, done) => {
18 |             t.is(err, null);
19 |             t.is(res.statusCode, 200);
20 |             done();
21 |         }
22 |     });
23 |     t.context.scope = nock(origin).get(path).reply(200).persist();
24 |     nock(origin).get(headerPath).reply(function () {
25 |         return [200, this.req.headers, { 'Content-Type': 'application/json' }];
26 |     });
27 | });
28 | test.afterEach(t => {
29 |     t.context.scope.persist(false);
30 |     t.context.crawler = null;
31 | });
32 | 
33 | testCb(test, "should crawl one request", async t => {
34 |     t.context.crawler.add({
35 |         url: `${origin}${path}`, callback: (error, res, done) => {
36 |             t.is(error, null);
37 |             t.is(res.statusCode, 200);
38 |             done();
39 |             t.end();
40 |         }
41 |     });
42 | });
43 | 
44 | testCb(test, "should crawl two requests and emit the drain event.", async t => {
45 |     const callback = function (error, res, next) {
46 |         t.is(error, null);
47 |         t.is(res.statusCode, 200);
48 |         next();
49 |     };
50 | 
51 |     t.context.crawler.on('drain', t.end);
52 | 
53 |     t.context.crawler.add({
54 |         url: `${origin}${path}`,
55 |         callback: callback
56 |     });
57 | 
58 |     t.context.crawler.add({
59 |         url: `${origin}${path}`,
60 |         callback: callback
61 |     });
62 | });
63 | 
64 | testCb(test, "should use the provided user-agent", async t => {
65 |     const userAgent = 'test/1.2';
66 |     t.context.crawler.add({
67 |         url: `${origin}${path}`,
68 |         headers: { "user-agent": userAgent },
69 |         callback: (error, res, done) => {
70 |             t.is(error, null);
71 |             t.is(res.statusCode, 200);
72 |             t.is(res.options.headers['user-agent'], userAgent);
73 |             done();
74 |             t.end();
75 |         }
76 |     });
77 | });
78 | 
79 | testCb(test, "should replace the global default user-agent", async t => {
80 |     t.context.crawler = new Crawler({
81 |         // silence: true,
82 |         isJson: true,
83 |         headers: { "user-agent": "test/1.2" },
84 |         callback: (err, res, done) => {
85 |             t.is(err, null);
86 |             t.is(res.body['user-agent'], "foo/bar");
87 |             done();
88 |             t.end();
89 |         }
90 |     });
91 |     t.context.crawler.add({
92 |         url: `${origin}${headerPath}`,
93 |         headers: { "user-agent": "foo/bar" }
94 |     });
95 | });
96 | 
97 | testCb(test, "should spoof the referrer", async t => {
98 |     const referer = 'http://spoofed.com';
99 |     t.context.crawler.add({
100 |         url: `${origin}${path}`,
101 |         referer: referer,
102 |         callback: (error, res, done) => {
103 |             t.is(error, null);
104 |             t.is(res.options.headers.referer, referer);
105 |             done();
106 |             t.end();
107 |         }
108 |     });
109 | });
--------------------------------------------------------------------------------
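The header tests above establish a precedence rule: headers passed to add() override the crawler-wide defaults for that one request, and the referer option fills in the referer header. A sketch of that rule, assuming the published crawler package; the URLs and user-agent strings are placeholders:

import Crawler from "crawler";

const crawler = new Crawler({
    headers: { "user-agent": "default/1.0" }, // crawler-wide default
    callback: (error, response, done) => {
        // response.options reflects the options the request was actually sent with
        if (!error) console.log(response.options.headers["user-agent"]);
        done();
    },
});

crawler.add({
    url: "http://example.com/",
    headers: { "user-agent": "override/2.0" }, // wins over the default for this request only
    referer: "http://spoofed.example/",        // ends up as the referer header
});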
/test/urlOptions.js:
--------------------------------------------------------------------------------
1 | import test from "ava";
2 | import { testCb } from "./lib/avaTestCb.js";
3 | import nock from "nock";
4 | import Crawler from "../dist/index.js";
5 | import sinon from "sinon";
6 | 
7 | test.before(t => {
8 |     nock.cleanAll();
9 |     nock('http://test.crawler.com').get('/').reply(200, 'ok').persist();
10 |     t.context.crawler = new Crawler({
11 |         // silence: true,
12 |         jQuery: false
13 |     });
14 | });
15 | 
16 | testCb(test, "should work if the url is a string", t => {
17 |     t.context.crawler.add({
18 |         url: 'http://test.crawler.com/',
19 |         callback: (error, response, done) => {
20 |             t.is(error, null);
21 |             done();
22 |             t.end();
23 |         }
24 |     });
25 | });
26 | 
27 | testCb(test, "should work if the url is a function", t => {
28 |     function urlFn(onUrl) {
29 |         onUrl('http://test.crawler.com/');
30 |     }
31 |     t.context.crawler.add({
32 |         url: urlFn,
33 |         callback: (error, response, done) => {
34 |             t.is(error, null);
35 |             done();
36 |             t.end();
37 |         }
38 |     });
39 | });
40 | 
41 | testCb(test, "should skip if the url is undefined, null, or an empty array", t => {
42 |     const push = sinon.spy(t.context.crawler, '_schedule');
43 |     t.context.crawler.add([undefined, null, []]);
44 |     t.context.crawler.add({
45 |         url: 'http://test.crawler.com/',
46 |         callback: (error, response, done) => {
47 |             t.true(push.calledOnce);
48 |             done();
49 |             t.end();
50 |         }
51 |     });
52 | });
--------------------------------------------------------------------------------
/test/userAgent.js:
--------------------------------------------------------------------------------
1 | import test from "ava";
2 | import { testCb } from "./lib/avaTestCb.js";
3 | import nock from "nock";
4 | import Crawler from "../dist/index.js";
5 | 
6 | test.before(t => {
7 |     nock.cleanAll();
8 |     nock("http://nockhost").get(url => url.indexOf("status") >= 0).times(20).reply(200, "Yes");
9 |     t.context.calledAgents = [];
10 |     t.context.crawler = new Crawler({
11 |         // silence: true,
12 |         jQuery: false,
13 |         userAgents: [
14 |             "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
15 |             "Googlebot/2.1 (+http://www.google.com/bot.html)",
16 |             "test/1.0",
17 |             "test/2.0"
18 |         ],
19 |         callback: (error, res, done) => {
20 |             t.context.calledAgents.push(res.request.options.headers["user-agent"]);
21 |             done();
22 |         }
23 |     });
24 | });
25 | 
26 | testCb(test, "should rotate user agents if userAgents is set.", async t => {
27 |     t.context.crawler.add([
28 |         "http://nockhost/status1",
29 |         "http://nockhost/status2",
30 |         "http://nockhost/status3",
31 |         "http://nockhost/status4",
32 |         "http://nockhost/status1",
33 |     ]);
34 |     t.context.crawler.on("drain", () => {
35 |         t.deepEqual(t.context.calledAgents, [
36 |             "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
37 |             "Googlebot/2.1 (+http://www.google.com/bot.html)",
38 |             "test/1.0",
39 |             "test/2.0",
40 |             "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
41 |         ]);
42 |         t.end();
43 |     });
44 | });
45 | 
46 | 
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |     "compilerOptions": {
3 |         /* Visit https://aka.ms/tsconfig to read more about this file */
4 | 
5 |         /* Projects */
6 |         // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
7 |         // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
8 |         // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
9 |         // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
10 |         // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
11 |         // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
12 | 
13 |         /* Language and Environment */
14 |         "target": "es2020" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
15 |         // "lib": [
16 |         //     "ESNext",
17 |         //     "DOM"
18 |         // ] /* Specify a set of bundled library declaration files that describe the target runtime environment. */,
19 |         // "jsx": "preserve", /* Specify what JSX code is generated. */
20 |         // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
21 |         // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
22 |         // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
23 |         // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
24 |         // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
25 |         // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
26 |         // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
27 |         // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
28 |         // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
29 | 
30 |         /* Modules */
31 |         "module": "es2020" /* Specify what module code is generated. */,
32 |         // "rootDir": "./", /* Specify the root folder within your source files. */
33 |         "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
34 |         // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
35 |         // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
36 |         // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
37 |         // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
38 |         // "types": [], /* Specify type package names to be included without being referenced in a source file. */
39 |         // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
40 |         // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
41 |         // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
42 |         // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
43 |         // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
44 |         // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
45 |         // "resolveJsonModule": true, /* Enable importing .json files. */
46 |         // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
47 |         // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
48 | 
49 |         /* JavaScript Support */
50 |         // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
51 |         // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
52 |         // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
53 | 
54 |         /* Emit */
55 |         "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
56 |         "declarationMap": true, /* Create sourcemaps for d.ts files. */
57 |         // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
58 |         "sourceMap": true /* Create source map files for emitted JavaScript files. */,
59 |         // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
60 |         // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
61 |         "outDir": "dist" /* Specify an output folder for all emitted files. */,
62 |         // "removeComments": true, /* Disable emitting comments. */
63 |         // "noEmit": true, /* Disable emitting files from a compilation. */
64 |         // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
65 |         // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */
66 |         // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
67 |         // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
68 |         // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
69 |         // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
70 |         // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
71 |         // "newLine": "crlf", /* Set the newline character for emitting files. */
72 |         // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
73 |         // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
74 |         // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
75 |         // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
76 |         // "declarationDir": "./", /* Specify the output directory for generated declaration files. */
77 |         // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */
78 | 
79 |         /* Interop Constraints */
80 |         // "isolatedModules": false /* Ensure that each file can be safely transpiled without relying on other imports. */,
81 |         // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
82 |         // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
83 |         "esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */,
84 |         // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
85 |         "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,
86 | 
87 |         /* Type Checking */
88 |         "strict": true /* Enable all strict type-checking options. */,
89 |         // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
90 |         // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
91 |         // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
92 |         // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
93 |         // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
94 |         // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
95 |         // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
96 |         // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
97 |         // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
98 |         // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
99 |         // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
100 |         // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
101 |         // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
102 |         // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
103 |         // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
104 |         // "noPropertyAccessFromIndexSignature": true /* Enforces using indexed accessors for keys declared using an indexed type. */,
105 |         // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
106 |         // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
107 | 
108 |         /* Completeness */
109 |         // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
110 |         "skipLibCheck": false /* Skip type checking all .d.ts files. */
111 |     },
112 |     "include": ["src/**/*"],
113 |     "exclude": ["node_modules", "dist"]
114 | }
115 | 
--------------------------------------------------------------------------------