├── .gitignore
├── .jshintrc
├── .travis.yml
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── LICENSE.txt
├── README.md
├── lib
│   ├── crawler.js
│   └── debug.js
├── package.json
├── tests
│   ├── cacheOption.test.js
│   ├── encoding.test.js
│   ├── errorHandling.test.js
│   ├── examples.test.js
│   ├── jqueryOption.test.js
│   ├── linksResolving.test.js
│   ├── memoryLeaks.test.js
│   ├── rateLimitsOption.test.js
│   ├── requests.test.js
│   └── uriOption.test.js
├── travis_scripts
│   └── before_install.sh
└── vendor
    ├── jquery-1.11.1.min.js
    └── jquery-2.1.1.min.js
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .DS_Store
3 | .idea/
4 | npm-debug.log
--------------------------------------------------------------------------------
/.jshintrc:
--------------------------------------------------------------------------------
1 | {
2 | "bitwise" : true,
3 | "camelcase" : true,
4 | "curly" : true,
5 | "eqeqeq" : true,
6 | "freeze" : true,
7 | "forin" : true,
8 | "immed" : false,
9 | "indent" : 4,
10 | "latedef" : false,
11 | "newcap" : false,
12 | "noarg" : true,
13 | "noempty" : true,
14 | "nonbsp" : true,
15 | "nonew" : false,
16 | "plusplus" : false,
17 | "quotmark" : "single",
18 | "undef" : true,
19 | "unused" : true,
20 | "strict" : true,
21 | "maxparams" : false,
22 | "maxdepth" : false,
23 | "maxstatements" : false,
24 | "maxcomplexity" : false,
25 | "maxlen" : 120,
26 |
27 | "asi" : false,
28 | "boss" : false,
29 | "debug" : false,
30 | "eqnull" : false,
31 | "es5" : false,
32 | "esnext" : false,
33 | "moz" : false,
34 | "evil" : false,
35 | "expr" : true,
36 | "funcscope" : false,
37 | "globalstrict" : false,
38 | "iterator" : false,
39 | "lastsemic" : false,
40 | "laxbreak" : false,
41 | "laxcomma" : false,
42 | "loopfunc" : false,
43 | "multistr" : false,
44 | "noyield" : false,
45 | "notypeof" : false,
46 | "proto" : false,
47 | "scripturl" : false,
48 | "shadow" : false,
49 | "sub" : false,
50 | "supernew" : false,
51 | "validthis" : false,
52 |
53 | "devel" : true,
54 | "mocha" : true,
55 | "node" : true,
56 |
57 | "globals" : {}
58 | }
59 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | sudo: required
3 | node_js:
4 | - 0.10
5 | - 0.12
6 | os:
7 | - linux
8 | - osx
9 | before_install:
10 | - sh ./travis_scripts/before_install.sh
11 |
12 | matrix:
13 | allow_failures:
14 | - os: osx
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | node-webcrawler ChangeLog
2 | -------------------------
3 |
4 | 0.5.0 version changelog:
5 | * parse charset from `content-type` in http headers or meta tag in html, then convert
6 | * big5 charset is available as `iconv-lite` already supports it
7 | * default enable gzip in request header
8 | * remove unzip code in crawler since `request` will do this
9 | * body will return as a Buffer if encoding is null which is an option in `request`
10 | * remove cache and skip duplicate `request` for `GET`, `POST`(only for type `urlencode`), `HEAD`
11 | * add log feature, you can use `winston` to set `logger:winston`, or crawler will output to console
12 | * rotate user-agent in case some sites ban your requests
13 |
14 | 0.5.1 version changelog:
15 | * remove cache feature, it's useless
16 | * add `localAddress`, `time`, `tunnel`, `proxyHeaderWhiteList`, `proxyHeaderExclusiveList` properties to pass to `request`
17 |
18 | 0.5.2 version changelog:
19 | * you can manually terminate all the resources in your pool when `onDrain` is called, before their timeouts have been reached
20 | * add a read-only property `queueSize` to crawler
21 |
22 | 0.6.0 version changelog:
23 | * add `bottleneck` to implement rate limiting; a limit can be set for each connection at the same time.
24 |
25 | 0.6.3 version changelog:
26 | * you can also get `result.options` from the callback even when an error occurred
27 | * add test for `bottleneck`
28 |
29 | 0.6.5
30 | * fix a deep bug when initializing the Pool that could lead to sequential execution. [issue](https://github.com/bda-research/node-webcrawler/issues/2)
31 | * print log of Pool status
32 |
33 | 0.6.9
34 | * use `bottleneckConcurrent` instead of `maxConnections`, default `10000`
35 | * add debug info
36 |
37 | 0.7.0
38 | * cancel recursion in queue
39 | * upgrade `request` version to v2.67.0
40 |
41 | 0.7.4
42 | * change `debug` option to instance level instead of `options`
43 | * update README.md to detail error handling
44 | * call `onDrain` with scope of `this`
45 | * upgrade `seenreq` version to 0.1.7
46 |
47 | 0.7.5
48 | * delete entity properties in options before copying and re-assign them afterwards; `jar` is a typical such property, being an entity with functions
49 | * upgrade `request` to version 2.74.0
50 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use the official Docker images
2 | # https://registry.hub.docker.com/_/node/
3 | #
4 | FROM node:0.12.7
5 |
6 | RUN apt-get update
7 |
8 | RUN apt-get install -y python python-pip
9 |
10 | RUN pip install httpbin gunicorn
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Mike Chen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010 Sylvain Zimmer
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/bda-research/node-webcrawler.svg)](https://travis-ci.org/bda-research/node-webcrawler)
2 | This repo has been merged into [node-crawler](https://github.com/bda-research/node-crawler), please use that package instead.
3 | ------------
4 |
5 | Features:
6 | * server-side DOM & automatic jQuery insertion with Cheerio (default) or JSDOM
7 | * Configurable pool size and retries
8 | * Priority of requests
9 | * forceUTF8 mode to let the crawler handle charset detection and conversion for you
10 |
11 | Here is the [CHANGELOG](https://github.com/bda-research/node-webcrawler/blob/master/CHANGELOG.md)
12 |
13 | Help & Forks welcomed!
14 |
15 | How to install
16 | --------------
17 |
18 | $ npm install node-webcrawler
19 |
20 | Crash course
21 | ------------
22 |
23 | ```javascript
24 | var Crawler = require("node-webcrawler");
25 | var url = require('url');
26 |
27 | var c = new Crawler({
28 | maxConnections : 10,
29 | // This will be called for each crawled page
30 | callback : function (error, result, $) {
31 | // $ is Cheerio by default
32 | //a lean implementation of core jQuery designed specifically for the server
33 | if(error){
34 | console.log(error);
35 | }else{
36 | console.log($("title").text());
37 | }
38 | }
39 | });
40 |
41 | // Queue just one URL, with default callback
42 | c.queue('http://www.amazon.com');
43 |
44 | // Queue a list of URLs
45 | c.queue(['http://www.google.com/','http://www.yahoo.com']);
46 |
47 | // Queue URLs with custom callbacks & parameters
48 | c.queue([{
49 | uri: 'http://parishackers.org/',
50 | jQuery: false,
51 |
52 | // The global callback won't be called
53 | callback: function (error, result) {
54 | if(error){
55 | console.log(error);
56 | }else{
57 | console.log('Grabbed', result.body.length, 'bytes');
58 | }
59 | }
60 | }]);
61 |
62 | // Queue some HTML code directly without grabbing (mostly for tests)
63 | c.queue([{
64 | html: '<p>This is a test</p>'
65 | }]);
66 | ```
67 |
68 | Work with `bottleneck`
69 | --------------------
70 | Control rate limits for each connection; usually used with proxies.
71 |
72 | ```javascript
73 | var Crawler = require("node-webcrawler");
74 |
75 | var c = new Crawler({
76 | maxConnections : 3,
77 | rateLimits:2000,
78 | callback : function (error, result, $) {
79 | if(error){
80 | console.error(error);
81 | }else{
82 | console.log($('title').text());
83 | }
84 | }
85 | });
86 |
87 | c.queue({
88 | uri:"http://www.google.com",
89 | limiter:"key1",// for connection of 'key1'
90 | proxy:"http://user:pass@127.0.0.1:8080"
91 | });
92 |
93 | c.queue({
94 | uri:"http://www.google.com",
95 | limiter:"key2", // for connection of 'key2'
96 | proxy:"http://user:pass@127.0.0.1:8082"
97 | });
98 |
99 | c.queue({
100 | uri:"http://www.google.com",
101 | limiter:"key3", // for connection of 'key3'
102 | proxy:"http://user:pass@127.0.0.1:8081"
103 | });
104 |
105 | ```
106 |
107 | Options reference
108 | -----------------
109 |
110 | You can pass these options to the Crawler() constructor if you want them to be global, or as
111 | items in the queue() calls if you want them to be specific to that item (overriding global options).
112 |
113 | This options list is a strict superset of [mikeal's request options](https://github.com/mikeal/request#requestoptions-callback) and will be directly passed to
114 | the request() method.
115 |
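For example, here is a minimal sketch (the URL is a placeholder) of global options being overridden by a single queued item:

```javascript
var Crawler = require('node-webcrawler');

var c = new Crawler({
    jQuery: false,   // global default: no DOM injection
    timeout: 15000,  // global default timeout
    callback: function (error, result) {
        if (error) { return console.error(error); }
        console.log(result.statusCode);
    }
});

// This item overrides the global options for itself only
c.queue({
    uri: 'http://example.com/',
    jQuery: true,    // inject Cheerio just for this page
    timeout: 30000,  // allow a slower response here
    callback: function (error, result, $) {
        if (error) { return console.error(error); }
        console.log($('title').text());
    }
});
```
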
116 | Basic request options:
117 |
118 | * `uri`: String, the URL you want to crawl
119 | * `timeout` : Number, in milliseconds (Default 15000)
120 | * [All mikeal's requests options are accepted](https://github.com/mikeal/request#requestoptions-callback)
121 |
122 | Callbacks:
123 |
124 | * `callback(error, result, $)`: A request was completed
125 | * `onDrain(pool)`: There are no more queued requests. Call `pool.destroyAllNow()` if you want to release the pool's resources before their timeouts are reached; if you still have follow-up tasks to queue, you can ignore it.
126 |
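As an illustration, a minimal sketch (the URL is a placeholder) of releasing the pool from `onDrain` once the queue is empty:

```javascript
var Crawler = require('node-webcrawler');

var c = new Crawler({
    maxConnections: 10,
    callback: function (error, result, $) {
        if (error) { console.error(error); }
    },
    // called once there are no more queued requests
    onDrain: function (pool) {
        console.log('All requests done');
        pool.destroyAllNow(); // release pool resources before their idle timeouts
    }
});

c.queue('http://example.com/');
```
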
127 | Pool options:
128 |
129 | * `maxConnections`: Number, Size of the worker pool (Default 10),
130 | * `priorityRange`: Number, Range of acceptable priorities starting from 0 (Default 10),
131 | * `priority`: Number, Priority of this request (Default 5),
132 |
133 | Retry options:
134 |
135 | * `retries`: Number of retries if the request fails (Default 3),
136 | * `retryTimeout`: Number of milliseconds to wait before retrying (Default 10000),
137 |
138 | Server-side DOM options:
139 |
140 | * `jQuery`: true, false or ConfObject (Default true)
141 |
142 | Charset encoding:
143 |
144 | * `forceUTF8`: Boolean, if true will get charset from HTTP headers or meta tag in html and convert it to UTF8 if necessary. Never worry about encoding anymore! (Default false),
145 | * `incomingEncoding`: String, with forceUTF8: true to set encoding manually (Default null)
146 | `incomingEncoding : 'windows-1255'` for example
147 |
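A minimal sketch of both options together (the URL and charset are illustrative):

```javascript
var Crawler = require('node-webcrawler');

var c = new Crawler({
    forceUTF8: true,                  // detect charset and convert the body to UTF-8
    incomingEncoding: 'windows-1255', // optional: skip detection and decode as this charset
    callback: function (error, result, $) {
        if (error) { return console.error(error); }
        console.log(result.body.substring(0, 100)); // body is now a UTF-8 string
    }
});

c.queue('http://example.com/');
```
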
148 | Cache:
149 |
150 | * `cache`: Boolean, if true stores requests' results in memory (Default false). Not recommended if you are crawling a huge number of pages, as the process will exhaust memory
151 | * `skipDuplicates`: Boolean, if true skips URIs that were already crawled, without even calling callback() (Default false)
152 |
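For instance, a small sketch of `skipDuplicates` (the URL is a placeholder):

```javascript
var Crawler = require('node-webcrawler');

var c = new Crawler({
    jQuery: false,
    skipDuplicates: true, // a second queue() of the same URI is silently dropped
    callback: function (error, result) {
        if (error) { return console.error(error); }
        console.log('Fetched', result.options.uri);
    }
});

c.queue('http://example.com/');
c.queue('http://example.com/'); // skipped, callback is not called again
```
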
153 | Other:
154 | * `rotateUA`: Boolean, if true, `userAgent` should be an array and will be rotated (Default false)
155 | * `userAgent`: String or Array; if `rotateUA` is false but `userAgent` is an array, only the first entry is used.
156 | * `referer`: String, if truthy sets the HTTP referer header
157 | * `rateLimits`: Number of milliseconds to delay between each request (Default 0)
158 |
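A sketch combining these options (the user-agent strings and URLs are just placeholders):

```javascript
var Crawler = require('node-webcrawler');

var c = new Crawler({
    rotateUA: true,
    // with rotateUA true, userAgent must be an array; it is rotated on every request
    userAgent: [
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/537.36'
    ],
    referer: 'http://example.com/',
    rateLimits: 1000, // wait 1000ms between requests
    callback: function (error, result, $) {
        if (error) { return console.error(error); }
        console.log($('title').text());
    }
});

c.queue(['http://example.com/a', 'http://example.com/b']);
```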
159 |
160 | Class:Crawler
161 | -------------
162 |
163 | Instance of Crawler
164 |
165 | __crawler.queue(uri|options)__
166 | * `uri`: String, or `options`: an [Options](#options-reference) object
167 |
168 | __crawler.queueSize__
169 |
170 | Size of queue, read-only
171 |
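A short sketch of reading `queueSize` (URLs are placeholders):

```javascript
var Crawler = require('node-webcrawler');

var c = new Crawler({
    jQuery: false,
    callback: function (error, result) {
        if (error) { console.error(error); }
    }
});

c.queue(['http://example.com/a', 'http://example.com/b']);
console.log(c.queueSize); // read-only count of requests still waiting in the pool
```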
172 |
173 | Working with Cheerio or JSDOM
174 | -----------------------------
175 |
176 | Crawler uses [Cheerio](https://github.com/cheeriojs/cheerio) by default instead of [Jsdom](https://github.com/tmpvar/jsdom). Jsdom is more robust but can be hard to install (especially on Windows) because of [contextify](https://github.com/tmpvar/jsdom#contextify).
177 | That is why, if you want to use jsdom, you have to install it yourself and `require('jsdom')` in your own script before passing it to the crawler. This avoids forcing Cheerio users to build jsdom when installing the crawler.
178 |
179 | ### Working with Cheerio
180 | ```javascript
181 | jQuery: true //(default)
182 | //OR
183 | jQuery: 'cheerio'
184 | //OR
185 | jQuery: {
186 | name: 'cheerio',
187 | options: {
188 | normalizeWhitespace: true,
189 | xmlMode: true
190 | }
191 | }
192 | ```
193 | These parsing options are taken directly from [htmlparser2](https://github.com/fb55/htmlparser2/wiki/Parser-options), therefore any options that can be used in `htmlparser2` are valid in cheerio as well. The default options are:
194 |
195 | ```js
196 | {
197 | normalizeWhitespace: false,
198 | xmlMode: false,
199 | decodeEntities: true
200 | }
201 | ```
202 |
203 | For a full list of options and their effects, see [this](https://github.com/fb55/DomHandler) and
204 | [htmlparser2's options](https://github.com/fb55/htmlparser2/wiki/Parser-options).
205 | [source](https://github.com/cheeriojs/cheerio#loading)
206 |
207 | ### Working with JSDOM
208 |
209 | In order to work with JSDOM you will have to install it in your project folder (`npm install jsdom`), deal with [compiling C++](https://github.com/tmpvar/jsdom#contextify) and pass it to the crawler.
210 | ```javascript
211 | var jsdom = require('jsdom');
212 | var Crawler = require('node-webcrawler');
213 |
214 | var c = new Crawler({
215 | jQuery: jsdom
216 | });
217 | ```
218 |
219 | How to test
220 | -----------
221 |
222 | ### Install and run Httpbin
223 |
224 | node-webcrawler uses a local httpbin instance for testing purposes. You can install httpbin as a library from PyPI and run it as a WSGI app. For example, using Gunicorn:
225 |
226 | $ pip install httpbin
227 | // launch httpbin as a daemon with 6 workers on localhost
228 | $ gunicorn httpbin:app -b 127.0.0.1:8000 -w 6 --daemon
229 |
230 | // Finally
231 | $ npm install && npm test
232 |
233 | ### Alternative: Docker
234 |
235 | After [installing Docker](http://docs.docker.com/), you can run:
236 |
237 | // Builds the local test environment
238 | $ docker build -t node-webcrawler .
239 |
240 | // Runs tests
241 | $ docker run node-webcrawler sh -c "gunicorn httpbin:app -b 127.0.0.1:8000 -w 6 --daemon && npm install && npm test"
242 |
243 | // You can also ssh into the container for easier debugging
244 | $ docker run -i -t node-webcrawler bash
245 |
246 |
247 | [![Build Status](https://travis-ci.org/bda-research/node-webcrawler.svg)](https://travis-ci.org/bda-research/node-webcrawler)
248 |
249 | Rough todolist
250 | --------------
251 |
252 | * Using bottleneck to deal with rate limits
253 | * Introducing zombie to deal with page with complex ajax
254 | * Refactoring the code to be more maintainable, it's spaghetti code in there!
255 | * Proxy feature
256 | * This issue: https://github.com/sylvinus/node-crawler/issues/118
257 | * Make Sizzle tests pass (jsdom bug? https://github.com/tmpvar/jsdom/issues#issue/81)
258 | * More crawling tests
259 | * Document the API more (+ the result object)
260 | * Option to wait for callback to finish before freeing the pool resource (via another callback like next())
261 |
262 |
263 | ChangeLog
264 | ---------
265 |
266 | See https://github.com/bda-research/node-webcrawler/releases
267 |
--------------------------------------------------------------------------------
/lib/crawler.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | var path = require('path'),
4 | util = require('util'),
5 | EventEmitter = require('events').EventEmitter,
6 | request = require('request'),
7 | _ = require('lodash'),
8 | cheerio = require('cheerio'),
9 | fs = require('fs'),
10 | Pool = require('generic-pool').Pool,
11 | charsetParser = require('charset-parser'),
12 | Bottleneck = require('bottleneck'),
13 | seenreq = require('seenreq');
14 |
15 | var logger = null;
16 | // Fallback on iconv-lite if we didn't succeed compiling iconv
17 | // https://github.com/sylvinus/node-crawler/pull/29
18 | var iconv, iconvLite;
19 | try {
20 | iconv = require('iconv').Iconv;
21 | } catch (e) {}
22 |
23 | if (!iconv) {
24 | iconvLite = require('iconv-lite');
25 | }
26 |
27 | function useCache (options) {
28 | return (options.uri || options.url) && options.cache;
29 | }
30 |
31 | function checkJQueryNaming (options) {
32 | if ('jquery' in options) {
33 | options.jQuery = options.jquery;
34 | delete options.jquery;
35 | }
36 | return options;
37 | }
38 |
39 | function readJqueryUrl (url, callback) {
40 | if (url.match(/^(file\:\/\/|\w+\:|\/)/)) {
41 | fs.readFile(url.replace(/^file\:\/\//,''),'utf-8', function(err,jq) {
42 | callback(err, jq);
43 | });
44 | } else {
45 | callback(null, url);
46 | }
47 | }
48 |
49 | function Crawler (options) {
50 | var self = this;
51 | self.init(options);
52 | }
53 | // augment the prototype for node events using util.inherits
54 | util.inherits(Crawler, EventEmitter);
55 |
56 | Crawler.prototype.init = function init (options) {
57 | var self = this;
58 |
59 | var defaultOptions = {
60 | autoWindowClose: true,
61 | cache: false,
62 | forceUTF8: false,
63 | gzip: true,
64 | incomingEncoding: null, //TODO remove or optimize
65 | jQuery: true,
66 | maxConnections: 10,
67 | bottleneckConcurrent: 10000,
68 | method: 'GET',
69 | onDrain: false,
70 | priority: 5,
71 | priorityRange: 10,
72 | rateLimits: 0,
73 | referer: false,
74 | retries: 3,
75 | retryTimeout: 10000,
76 | timeout: 15000,
77 | skipDuplicates: false,
78 | rotateUA: false
79 | };
80 |
81 | //return defaultOptions with overriden properties from options.
82 | self.options = _.extend(defaultOptions, options);
83 |
84 | // you can use jquery or jQuery
85 | self.options = checkJQueryNaming(self.options);
86 |
87 | if (self.options.rateLimits !== 0 && self.options.maxConnections == 1) {
88 | //self.options.limiter = "default";
89 | } else {
90 | //self.limiters = null;//self.options.maxConnections = 1;
91 | }
92 |
93 | // Don't make these options persist to individual queries
94 | self.globalOnlyOptions = ['maxConnections', 'priorityRange', 'onDrain'];
95 |
96 | //Setup a worker pool w/ https://github.com/coopernurse/node-pool
97 | self.pool = Pool({
98 | name : 'crawler',
99 | max : self.options.maxConnections,
100 | min : self.options.minConnections,
101 | log : self.options.debug && self.options.logger && function(){self.options.logger.log(arguments[1],arguments[0]);},
102 | priorityRange: self.options.priorityRange,
103 | create : function(callback) {
104 | callback(new Object());
105 | },
106 | destroy : function() {}
107 | });
108 |
109 | self.limiters = new Bottleneck.Cluster(self.options.bottleneckConcurrent,self.options.rateLimits);
110 | self.plannedQueueCallsCount = 0;
111 | self.queueItemSize = 0;
112 |
113 | self.cache = {};
114 | self.seen = new seenreq();
115 | self.debug = self.options.debug || false;
116 | self.mapEntity = Object.create(null);
117 | self.entityList = ["jar"];
118 | logger = self.options.logger || console;
119 |
120 | self.on('pool:release', function(options) {
121 | self._release(options);
122 | });
123 |
124 | self.on("request",function(options){
125 | if(_.isFunction(self.options.preRequest)){
126 | self.options.preRequest(options);
127 | }
128 | });
129 |
130 | self.on('pool:drain', function() {
131 | if (self.options.onDrain) {
132 | self.options.onDrain.call(self, self.pool);
133 | }
134 | });
135 | };
136 |
137 | Crawler.prototype._release = function _release (options) {
138 | var self = this;
139 |
140 | self.queueItemSize--;
141 | if (options._poolReference) {
142 | if(self.debug){
143 | logger.info("Releasing resource, limiter:%s", options.limiter || "default");
144 | }
145 | self.pool.release(options._poolReference);
146 | }
147 |
148 | // Pool stats are behaving weird - have to implement our own counter
149 | if (self.queueItemSize + self.plannedQueueCallsCount === 0) {
150 | self.emit('pool:drain');
151 | }
152 | };
153 |
154 | Crawler.prototype._inject = function _inject (response, options, callback) {
155 | var $;
156 | var self = this;
157 |
158 | if (options.jQuery === 'cheerio' || options.jQuery.name === 'cheerio' || options.jQuery === true) {
159 | var defaultCheerioOptions = {
160 | normalizeWhitespace: false,
161 | xmlMode: false,
162 | decodeEntities: true
163 | };
164 | var cheerioOptions = options.jQuery.options || defaultCheerioOptions;
165 | $ = cheerio.load(response.body, cheerioOptions);
166 |
167 | callback(null, $);
168 | }
169 |
170 | else if (options.jQuery.jsdom) {
171 | var jsdom = options.jQuery.jsdom;
172 | var scriptLocation = path.resolve(__dirname, '../vendor/jquery-2.1.1.min.js');
173 |
174 | //Use promises
175 | readJqueryUrl(scriptLocation, function(err, jquery) {
176 | try {
177 | jsdom.env({
178 | url: options.uri,
179 | html: response.body,
180 | src: [jquery],
181 | done: function (errors, window) {
182 | $ = window.jQuery;
183 | callback(errors, $);
184 |
185 | try {
186 | window.close();
187 | window = null;
188 | } catch (err) {
189 | logger.error(err);
190 | }
191 |
192 | }
193 | });
194 | } catch (e) {
195 | options.callback(e);
196 | self.emit('pool:release', options);
197 | }
198 | });
199 | }
200 | // jQuery is set to false or not set
201 | else {
202 | callback(null);
203 | }
204 | };
205 |
206 | Crawler.prototype.queue = function queue (options) {
207 | var self = this;
208 |
209 | // Did you get a single object or string? Make it compatible.
210 | options = _.isArray(options) ? options : [options];
211 |
212 | options = _.flattenDeep(options);
213 |
214 | for(var i = 0; i < options.length; ++i) {
215 | if(_.isNull(options[i]) || _.isUndefined(options[i]) || (!_.isString(options[i]) && !_.isPlainObject(options[i]))) {
216 | if(self.debug) {
217 | logger.warn("Illegal queue option: ", JSON.stringify(options[i]));
218 | }
219 | continue;
220 | }
221 | self._pushToQueue(
222 | _.isString(options[i]) ? {uri: options[i]} : options[i]
223 | );
224 | }
225 | };
226 |
227 | Crawler.prototype._pushToQueue = function _pushToQueue (options) {
228 | var self = this;
229 | self.queueItemSize++;
230 |
231 | // you can use jquery or jQuery
232 | options = checkJQueryNaming(options);
233 |
234 | _.defaults(options, self.options);
235 |
236 | // Remove all the global options from our options
237 | // TODO we are doing this for every _pushToQueue, find a way to avoid this
238 | _.each(self.globalOnlyOptions, function(globalOnlyOption) {
239 | delete options[globalOnlyOption];
240 | });
241 |
242 | // If duplicate skipping is enabled, avoid queueing entirely for URLs we already crawled
243 | if (options.skipDuplicates && self.seen.exists(options)) {
244 | return self.emit('pool:release', options);
245 | }
246 |
247 | // acquire connection - callback function is called
248 | // once a resource becomes available
249 | // self.pool.acquire(
250 | var acquired = function(error, poolReference) {
251 | options._poolReference = poolReference;
252 |
253 | // this is an operation error
254 | if (error) {
255 | logger.error(error);
256 | options.callback(error);// need release
257 | return self.emit('pool:release',options);
258 | }
259 |
260 | if(self.debug){
261 | logger.info("Acquired resource, limiter:%s, uri:%s", options.limiter || "default", options.uri);
262 | logger.info("pool queue size:%s, bottleneck '%s' queue size:%s", self.waitingCount, options.limiter||"default", self.limiters.key(options.limiter||"default")._queue.length);
263 | }
264 |
265 | //Static HTML was given, skip request
266 | if (options.html) {
267 | self._onContent(null, options, {body:options.html});
268 | } else if (typeof options.uri === 'function') {
269 | options.uri(function(uri) {
270 | options.uri = uri;
271 | self._makeCrawlerRequest(options);
272 | });
273 | } else {
274 | self._makeCrawlerRequest(options);
275 | }
276 | }//, options.priority);
277 |
278 | var acquireWrapped = function(priority,cb){
279 | if(self.debug){
280 | logger.info("Called by bottleneck, limiter:%s, uri:%s", options.limiter || "default", options.uri);
281 | }
282 |
283 | return self.pool.acquire(cb,priority);
284 | }
285 |
286 | self.limiters.key(options.limiter||"default").submit(acquireWrapped,options.priority,acquired);
287 | };
288 |
289 | Crawler.prototype._makeCrawlerRequest = function _makeCrawlerRequest (options) {
290 | var self = this;
291 | //var cacheData = self.cache[self.seen.normalize(options)];
292 |
293 | // if(useCache(options) && cacheData){
294 | // if(self.debug){
295 | // logger.info("using cache.");
296 | // }
297 |
298 | // self._onContent(null, options, cacheData, true);
299 | // return;
300 | // }
301 |
302 | // if (typeof options.rateLimits === 'number' && options.rateLimits !== 0) {
303 | // setTimeout(function() {
304 | // self._buildHttpRequest(options);
305 | // }, options.rateLimits);
306 | // } else {
307 | self._buildHttpRequest(options);
308 | // }
309 | };
310 |
311 | Crawler.prototype._deleteEntity = function _deleteEntity(options){
312 | var self = this;
313 | this.entityList.forEach(function(name){
314 | if(typeof options[name] == "object"){
315 | self.mapEntity[name] = options[name];
316 | delete options[name];
317 | }
318 | })
319 | }
320 |
321 | Crawler.prototype._attachEntity = function _attachEntity(options){
322 | var self = this;
323 | return this.entityList.reduce(function(target,name){
324 | if(typeof self.mapEntity[name] == "object")
325 | target[name] = self.mapEntity[name];
326 |
327 | return target;
328 | }, options);
329 | }
330 |
331 |
332 | Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
333 | var self = this;
334 |
335 | if (self.debug) {
336 | logger.info(options.method+' '+options.uri);
337 | if(options.proxy)
338 | logger.info("Use proxy: %s", options.proxy);
339 | }
340 |
341 | // Cloning keeps the opts parameter clean:
342 | // - some versions of "request" apply the second parameter as a
343 | // property called "callback" to the first parameter
344 | // - keeps the query object fresh in case of a retry
345 | // Doing parse/stringify instead of _.clone will do a deep clone and remove functions
346 |
347 | self._deleteEntity(options);
348 | var ropts = JSON.parse(JSON.stringify(options));
349 | self._attachEntity(ropts);
350 |
351 | if (!ropts.headers) { ropts.headers={}; }
352 | if (ropts.forceUTF8) {
353 | // if (!ropts.headers['Accept-Charset'] && !ropts.headers['accept-charset']) {
354 | // ropts.headers['Accept-Charset'] = 'utf-8;q=0.7,*;q=0.3';
355 | // }
356 |
357 | ropts.encoding=null;
358 | }
359 |
360 | if (ropts.userAgent) {
361 | if(ropts.rotateUA && _.isArray(ropts.userAgent)){
362 | ropts.headers['User-Agent'] = ropts.userAgent[0];
363 | // If "rotateUA" is true, rotate User-Agent
364 | options.userAgent.push(options.userAgent.shift());
365 | }else{
366 | ropts.headers['User-Agent'] = ropts.userAgent;
367 | }
368 | if(self.debug){
369 | logger.info(ropts.headers['User-Agent']);
370 | }
371 | }
372 | if (ropts.referer) {
373 | ropts.headers.Referer = ropts.referer;
374 | }
375 | if (ropts.proxies && ropts.proxies.length) {
376 | ropts.proxy = ropts.proxies[0];
377 | }
378 |
379 | this.emit("request",ropts);
380 |
381 | var requestArgs = ['uri','url','qs','method','headers','body','form','json','multipart','followRedirect',
382 | 'followAllRedirects', 'maxRedirects','encoding','pool','timeout','proxy','auth','oauth','strictSSL',
383 | 'jar','aws','gzip','time','tunnel','proxyHeaderWhiteList','proxyHeaderExclusiveList','localAddress','forever'];
384 |
385 | var req = request(_.pick.apply(this,[ropts].concat(requestArgs)), function(error,response) {
386 | if (error) {
387 | return self._onContent(error, options);
388 | }
389 |
390 | response.uri = response.request.href;
391 | self._onContent(error,options,response);
392 | });
393 | };
394 |
395 | Crawler.prototype._onContent = function _onContent (error, options, response, fromCache) {
396 | var self = this;
397 |
398 | if (error) {
399 | if (self.debug) {
400 | logger.error('Error '+error+' when fetching '+
401 | options.uri+(options.retries?' ('+options.retries+' retries left)':''));
402 | }
403 | if (options.retries) {
404 | self.plannedQueueCallsCount++;
405 | setTimeout(function() {
406 | options.retries--;
407 | self.plannedQueueCallsCount--;
408 |
409 | // If there is a "proxies" option, rotate it so that we don't keep hitting the same one
410 | // if (options.proxies) {
411 | // options.proxies.push(options.proxies.shift());
412 | // }
413 | self.queue(options);
414 | },options.retryTimeout);
415 |
416 | } else if (options.callback) {
417 | options.callback(error,{options:options});
418 | }
419 |
420 | return self.emit('pool:release', options);
421 | }
422 |
423 | if (!response.body) { response.body=''; }
424 |
425 | if (self.debug) {
426 | logger.info('Got '+(options.uri||'html')+' ('+response.body.length+' bytes)...');
427 | }
428 |
429 | if(!fromCache){
430 | try{
431 | self._doEncoding(options,response);
432 | }catch(e){
433 | logger.error(e);
434 | if(options.callback){
435 | options.callback(e);
436 | }
437 | return self.emit('pool:release',options);
438 | }
439 | }
440 |
441 | // if(useCache(options)){
442 | // self.cache[self.seen.normalize(options)] = response;
443 | // }
444 |
445 | if (!options.callback) {
446 | return self.emit('pool:release', options);
447 | }
448 |
449 | response.options = options;
450 |
451 | // This could definitely be improved by *also* matching content-type headers
452 | var isHTML = _.isString(response.body) && response.body.match(/^\s*</);
453 |
454 | if (isHTML && options.jQuery && options.method !== 'HEAD') {
455 | self._inject(response, options, function(errors, $) {
456 | self._onInject(errors, options, response, $);
457 | });
458 | } else {
459 | options.callback(null,response);
460 | self.emit('pool:release', options);
461 | }
462 | };
463 |
464 | Crawler.prototype._doEncoding = function(options,response){
465 | var self = this;
466 |
467 | if(options.encoding === null){
468 | return;
469 | }
470 |
471 | if (options.forceUTF8) {
472 | var iconvObj;
473 | var charset = options.incomingEncoding || self._parseCharset(response);
474 |
475 | if (self.debug) {
476 | logger.info('Charset ' + charset);
477 | }
478 |
479 | if (charset !== 'utf-8' && charset !== 'ascii') {
480 | if (iconv) {
481 | iconvObj = new iconv(charset, 'UTF-8//TRANSLIT//IGNORE');
482 | response.body = iconvObj.convert(response.body).toString();
483 | } else{
484 | response.body = iconvLite.decode(response.body, charset);
485 | }
486 | }
487 | }
488 |
489 | //if charset = 'utf-8', call toString() ;
490 | response.body = response.body.toString();
491 | }
492 |
493 | Crawler.prototype._onInject = function _onInject (errors, options, response, $) {
494 | var self = this;
495 |
496 | options.callback(errors, response, $);
497 | self.emit('pool:release', options);
498 | };
499 |
500 | Crawler.prototype._parseCharset = function(res){
501 | var ctt = res.headers['content-type']||res.headers['Content-Type']||'';
502 | var body = res.body instanceof Buffer?res.body.toString():res.body;
503 | var charset = charsetParser(ctt,body,'utf-8');
504 |
505 | return charset;
506 | }
507 |
508 | Object.defineProperty(Crawler.prototype,'queueSize',{
509 | get:function(){
510 | return this.pool.waitingClientsCount();
511 | }
512 | })
513 |
514 | Object.defineProperty(Crawler.prototype,'waitingCount',{
515 | get:function(){
516 | return this.pool.waitingClientsCount();
517 | }
518 | })
519 |
520 | Object.defineProperty(Crawler.prototype,'availableCount',{
521 | get:function(){
522 | return this.pool.availableObjectsCount();
523 | }
524 | })
525 |
526 | Object.defineProperty(Crawler.prototype,'size',{
527 | get:function(){
528 | return this.pool.getPoolSize();
529 | }
530 | })
531 |
532 | module.exports = Crawler;
533 | module.exports.VERSION = '0.7.2';
534 |
--------------------------------------------------------------------------------
/lib/debug.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | var util = require('util');
4 | var crawler = require('./crawler');
5 | module.exports = function debug() {
6 | if (crawler.debug) {
7 | console.error('CRAWLER %s', util.format.apply(util, arguments))
8 | }
9 | };
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "node-webcrawler",
3 | "version": "0.7.5",
4 | "description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously",
5 | "main": "./lib/crawler.js",
6 | "directories": {
7 | "test": "tests"
8 | },
9 | "scripts": {
10 | "test": "./node_modules/mocha/bin/mocha --reporter spec --bail --timeout 10000 tests/*.js"
11 | },
12 | "repository": {
13 | "type": "git",
14 | "url": "https://github.com/bda-research/node-webcrawler.git"
15 | },
16 | "dependencies": {
17 | "charset-parser": "^0.2.0",
18 | "cheerio": "0.19.0",
19 | "generic-pool": "2.2.0",
20 | "iconv": "2.1.7",
21 | "iconv-lite": "0.4.8",
22 | "lodash": "3.8.0",
23 | "request": "2.74.0",
24 | "seenreq": "^0.1.7",
25 | "bottleneck":"1.9.1"
26 | },
27 | "optionalDependencies": {
28 | "iconv": "*"
29 | },
30 | "devDependencies": {
31 | "chai": "2.3.0",
32 | "mocha": "2.2.5",
33 | "mocha-testdata": "1.1.0",
34 | "sinon": "1.14.1",
35 | "jsdom": "3.1.2"
36 | },
37 | "keywords": [
38 | "dom",
39 | "javascript",
40 | "crawling",
41 | "spider",
42 | "scraper",
43 | "scraping",
44 | "jquery",
45 | "crawler"
46 | ],
47 | "author": "Mike Chen",
48 | "license": "ISC",
49 | "bugs": {
50 | "url": "https://github.com/bda-research/node-webcrawler/issues"
51 | },
52 | "homepage": "https://github.com/bda-research/node-webcrawler"
53 | }
54 |
--------------------------------------------------------------------------------
/tests/cacheOption.test.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | var Crawler = require('../lib/crawler');
4 | var expect = require('chai').expect;
5 | var sinon = require('sinon');
6 | var httpbinHost = 'localhost:8000';
7 | var c;
8 |
9 | describe('Cache features tests', function() {
10 | describe('Cache', function() {
11 | afterEach(function () {
12 | c = {};
13 | });
14 | it.skip('should crawl one url', function (done) {
15 | c = new Crawler({
16 | maxConnections:1,
17 | debug:true,
18 | cache: true,
19 | jquery: false,
20 | onDrain: function () //noinspection BadExpressionStatementJS,BadExpressionStatementJS
21 | {
22 | expect(spy.calledOnce).to.be.true;
23 | done();
24 | },
25 | callback: function (error, result) {
26 | expect(error).to.be.null;
27 | expect(result.statusCode).to.equal(200);
28 | }
29 | });
30 | var spy = sinon.spy(c, '_buildHttpRequest');
31 | c.queue(['http://'+httpbinHost, 'http://' + httpbinHost, 'http://' + httpbinHost, 'http://' + httpbinHost]);
32 | });
33 | });
34 |
35 | describe('Skip Duplicate active', function() {
36 | afterEach(function () {
37 | c = {};
38 | });
39 |
40 | it('should not skip one single url', function (done) {
41 | c = new Crawler({
42 | jquery: false,
43 | skipDuplicates: true,
44 | callback: function (error, result) {
45 | expect(error).to.be.null;
46 | expect(result.statusCode).to.equal(200);
47 | done();
48 | },
49 | });
50 |
51 | c.queue('http://' + httpbinHost + '/status/200');
52 | });
53 |
54 | //it('should skip previous crawled urls', function (done) {});
55 | });
56 | });
57 |
58 |
--------------------------------------------------------------------------------
/tests/encoding.test.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | var Crawler = require('../lib/crawler');
4 | var expect = require('chai').expect;
5 | var c;
6 |
7 | describe('Encoding', function() {
8 | beforeEach(function() {
9 | c = new Crawler({
10 | forceUTF8: true
11 | });
12 | });
13 | it('should parse latin-1', function(done) {
14 | this.timeout(5000);
15 | c.queue([{
16 | uri: 'http://czyborra.com/charsets/iso8859.html',
17 | callback: function(error, result) //noinspection BadExpressionStatementJS,BadExpressionStatementJS
18 | {
19 | expect(error).to.be.null;
20 | expect(result.body.indexOf('Jörg')).to.be.above(0);
21 | done();
22 | }
23 | }]);
24 | });
25 | it('should return buffer if encoding = null', function(done) {
26 | this.timeout(5000);
27 | c.queue([{
28 | uri: 'http://czyborra.com/charsets/iso8859.html',
29 | encoding:null,
30 | callback: function(error, result) //noinspection BadExpressionStatementJS,BadExpressionStatementJS
31 | {
32 | expect(error).to.be.null;
33 | expect(result.body instanceof Buffer).to.be.true;
34 | done();
35 | }
36 | }]);
37 | });
38 | });
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/tests/errorHandling.test.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | var Crawler = require('../lib/crawler');
4 | var expect = require('chai').expect;
5 | var jsdom = require('jsdom');
6 | var httpbinHost = 'localhost:8000';
7 |
8 | describe('Errors', function() {
9 | describe('timeout', function() {
10 | var c = new Crawler({
11 | timeout : 1500,
12 | retryTimeout : 1000,
13 | retries : 2,
14 | jquery : false
15 | });
16 | it('should return a timeout error after ~5sec', function(done) {
17 |
18 | // override default mocha test timeout of 2000ms
19 | this.timeout(10000);
20 |
21 | c.queue({
22 | uri : 'http://'+httpbinHost+'/delay/15',
23 | callback : function(error, response) //noinspection BadExpressionStatementJS,BadExpressionStatementJS
24 | {
25 | expect(error).not.to.be.null;
26 | expect(error.code).to.equal("ETIMEDOUT");
27 | //expect(response).to.be.undefined;
28 | done();
29 | }
30 | });
31 | });
32 | it('should retry after a first timeout', function(done) {
33 |
34 | // override default mocha test timeout of 2000ms
35 | this.timeout(15000);
36 |
37 | c.queue({
38 | uri : 'http://'+httpbinHost+'/delay/1',
39 | callback : function(error, response) {
40 | expect(error).to.be.null;
41 | expect(response.body).to.be.ok;
42 | done();
43 | }
44 | });
45 | });
46 | });
47 |
48 | describe('error status code', function() {
49 | var c = new Crawler({
50 | jQuery : false
51 | });
52 | it('should not return an error on status code 400 (Bad Request)', function(done) {
53 | c.queue({
54 | uri: 'http://' + httpbinHost + '/status/400',
55 | callback: function(error, response, $){
56 | expect(error).to.be.null;
57 | expect(response.statusCode).to.equal(400);
58 | done();
59 | }
60 | });
61 | });
62 | it('should not return an error on status code 401 (Unauthorized)', function(done) {
63 | c.queue({
64 | uri: 'http://' + httpbinHost + '/status/401',
65 | callback: function(error, response, $){
66 | expect(error).to.be.null;
67 | expect(response.statusCode).to.equal(401);
68 | done();
69 | }
70 | });
71 | });
72 | it('should not return an error on status code 403 (Forbidden)', function(done) {
73 | c.queue({
74 | uri: 'http://' + httpbinHost + '/status/403',
75 | callback: function(error, response, $){
76 | expect(error).to.be.null;
77 | expect(response.statusCode).to.equal(403);
78 | done();
79 | }
80 | });
81 | });
82 | it('should not return an error on a 404', function(done) {
83 | c.queue({
84 | uri : 'http://'+httpbinHost+'/status/404',
85 | callback : function(error, response) {
86 | expect(error).to.be.null;
87 | expect(response.statusCode).to.equal(404);
88 | done();
89 | }
90 | });
91 | });
92 | it('should not return an error on a 500', function(done) {
93 | c.queue({
94 | uri : 'http://'+httpbinHost+'/status/500',
95 | callback : function(error, response) {
96 | expect(error).to.be.null;
97 | expect(response.statusCode).to.equal(500);
98 | done();
99 | }
100 | });
101 | });
102 | it('should not fail on an empty response', function(done) {
103 | c.queue({
104 | uri : 'http://'+httpbinHost+'/status/204',
105 | callback : function(error) {
106 | expect(error).to.be.null;
107 | done();
108 | }
109 | });
110 | });
111 | it('should not fail on a malformed html if jquery is false', function(done) {
112 | c.queue({
113 | html : '<p>hello <div>dude</p>',
114 | callback : function(error, response) {
115 | expect(error).to.be.null;
116 | expect(response).not.to.be.null;
117 | done();
118 | }
119 | });
120 | });
121 | it('should not return an error on a malformed html if jQuery is jsdom', function(done) {
122 | c.queue({
123 | html : '