├── .gitignore
├── .npmignore
├── screenshots
    ├── debug.jpg
    └── performance.jpg
├── package.json
├── examples
    ├── sina.news.js
    └── basic.js
├── lib
    ├── spider.js
    ├── util
    │   ├── debug.js
    │   ├── crawler.js
    │   └── helper.js
    └── cluster
    │   ├── spider.js
    │   └── nest.js
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea
3 | .gitignore
4 | screenshot


--------------------------------------------------------------------------------
/screenshots/debug.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjatse/spider2/HEAD/screenshots/debug.jpg


--------------------------------------------------------------------------------
/screenshots/performance.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjatse/spider2/HEAD/screenshots/performance.jpg


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "spider2",
 3 |   "version": "0.0.7",
 4 |   "description": "A 2nd generation spider to crawl any article site, automatic reading title and content.",
 5 |   "main": "./lib/spider.js",
 6 |   "scripts": {
 7 |   },
 8 |   "repository": {
 9 |     "type": "git",
10 |     "url": "git://github.com/Tjatse/spider2.git"
11 |   },
12 |   "keywords": [
13 |     "crawl",
14 |     "crawling",
15 |     "spider",
16 |     "spidering",
17 |     "readability",
18 |     "scrape"
19 |   ],
20 |   "author": "Tjatse",
21 |   "license": "Apache, Version 2.0",
22 |   "bugs": {
23 |     "url": "https://github.com/Tjatse/spider2/issues"
24 |   },
25 |   "engines": [
26 |     "node >= 0.8.0"
27 |   ],
28 |   "dependencies": {
29 |     "cheerio":"~0.19.0",
30 |     "lodash":"~3.10.1",
31 |     "read-art":"^0.4.2",
32 |     "req-fast": "^0.2.9",
33 |     "async":"~1.5.0",
34 |     "chalk": "~1.1.1",
35 |     "urijs":"~1.17.0"
36 |   },
37 |   "devDependencies": {
38 |     "mocha": "1.21.4",
39 |     "chai": "1.9.1"
40 |   },
41 |   "readmeFilename": "README.md"
42 | }
43 | 


--------------------------------------------------------------------------------
/examples/sina.news.js:
--------------------------------------------------------------------------------
 1 | var _      = require('lodash'),
 2 |     Spider = require('../');
 3 | 
 4 | var spider = Spider({
 5 |   debug      : true,
 6 |   concurrency: 5
 7 | });
 8 | spider.on('error', function(err, req){
 9 |   if (req.worker) {
10 |     console.error('worker #', req.worker, 'has an error:', err.message);
11 |   } else {
12 |     console.error(req.uri, err.message);
13 |   }
14 | });
15 | spider.on('data', function(req, res){
16 |   if (req._type == Spider.type.LINK) {
17 |     var links = _.filter(res, validLink);
18 |     console.log('length of articles:',  links.length);
19 |     spider.read(links);
20 |   } else if (req._type == Spider.type.ARTICLE) {
21 |     console.log(req.uri, res.title);
22 |   }
23 | });
24 | spider.on('end', function(){
25 |   console.log('[END]');
26 | });
27 | 
28 | spider.crawl([
29 |   'http://news.sina.com.cn'
30 | ]);
31 | 
32 | function validLink(ele){
33 |   if (!ele.uri || !ele.title) {
34 |     return false;
35 |   }
36 |   /**
37 |    * must be:
38 |    * 1. uri must have 4 digital at least
39 |    * 2. uri can not be a bitmap
40 |    * 3. uri can not have no path
41 |    * 4. length of title must greater than 5
42 |    */
43 |   var qsi, uri = ele.uri;
44 |   if ((qsi = uri.indexOf('?')) > 0) {
45 |     uri = uri.substr(0, qsi);
46 |   }
47 |   return uri.match(/\d{4,}/i) && !uri.match(/\.(jpg|png|jpeg|pdf)/i) && uri.indexOf('/') != uri.length - 1 && ele.title.length >= 5;
48 | }


--------------------------------------------------------------------------------
/lib/spider.js:
--------------------------------------------------------------------------------
 1 | var Debug = require('./util/debug'),
 2 |   _ = require('lodash'),
 3 |   chalk = require('chalk'),
 4 |   Nest = require('./cluster/nest'),
 5 |   helper = require('./util/helper'),
 6 |   EventEmitter = require('events').EventEmitter;
 7 | 
 8 | module.exports = Spider;
 9 | 
10 | function Spider(options) {
11 |   if (!(this instanceof Spider)) {
12 |     return new Spider(options);
13 |   }
14 |   this.options = _.defaults(options || {}, {
15 |     debug: process.env.SP_DEBUG,
16 |     domain: true
17 |     // workers: numCPUs
18 |     // concurrency: 1
19 |   });
20 | 
21 |   this._log = Debug({
22 |     namespace: 'spider2',
23 |     debug: this.options.debug
24 |   });
25 | 
26 |   this.nest = Nest(this.options);
27 |   this.nest.on('error', this._handleError.bind(this));
28 |   this.nest.on('data', this._processData.bind(this));
29 |   this.nest.on('end', this.emit.bind(this, 'end'));
30 | };
31 | 
32 | Spider.prototype.__proto__ = EventEmitter.prototype,
33 | 
34 | _.assign(Spider.prototype, {
35 |   crawl: function (urls) {
36 |     this.nest.forage(urls, helper.SPIDER_TYPE.LINK);
37 |   },
38 |   read: function (urls) {
39 |     this.nest.forage(urls, helper.SPIDER_TYPE.ARTICLE);
40 |   },
41 |   destroy: function () {
42 |     this._log.w('destroy', 'graceful exit');
43 |     this.nest.destroy();
44 |   },
45 |   ping: function () {
46 |     return this.nest.ping();
47 |   },
48 |   _handleError: function (err, data) {
49 |     this.emit('error', err, data);
50 |   },
51 |   _processData: function (data, body) {
52 |     this.emit('data', data, body);
53 |   }
54 | });
55 | 
56 | Spider.type = helper.SPIDER_TYPE;


--------------------------------------------------------------------------------
/lib/util/debug.js:
--------------------------------------------------------------------------------
 1 | var chalk = require('chalk'),
 2 |   _ = require('lodash');
 3 | 
 4 | module.exports = Debug;
 5 | 
 6 | /**
 7 |  * Simple debug tool.
 8 |  * @param {Object} options
 9 |  * @returns {Debug}
10 |  * @constructor
11 |  */
12 | function Debug(options) {
13 |   if (!(this instanceof Debug)) {
14 |     return new Debug(options);
15 |   }
16 |   if (typeof options == 'string') {
17 |     options = {
18 |       namespace: options
19 |     };
20 |   }
21 |   this.options = _.defaults(options || {}, {
22 |     namespace: 'SP2',
23 |     timestamp: true,
24 |     debug: false
25 |   });
26 | }
27 | Debug.prototype._l = function (level, args) {
28 |   if(!this.options.debug){
29 |     return;
30 |   }
31 |   args = _.values(args);
32 | 
33 |   var prints = [chalk.bgBlack.grey(this.options.namespace)];
34 |   var prefix, color;
35 |   switch (level) {
36 |     case 'e':
37 |       prefix = 'ERR!', color = 'red';
38 |       break;
39 |     case 'w':
40 |       prefix = 'warn', color = 'yellow';
41 |       break;
42 |     case 'd':
43 |       if(this.options.timestamp){
44 |         prints.push(chalk.underline.dim((new Date()).toISOString()))
45 |       }
46 |       break;
47 |     default :
48 |       prefix = args.splice(0, 1), color = 'green';
49 |       break;
50 |   }
51 |   if(prefix && color){
52 |     prints.splice(2, 0, chalk.bgBlack[color](prefix));
53 |   }
54 |   prints.push(args.join(' '));
55 |   console.log.apply(null, prints);
56 | };
57 | 
58 | /**
59 |  * Loggers: info, error, debug, log, warn.
60 |  */
61 | ['i', 'e', 'd', 'l', 'w'].forEach(function(s){
62 |   Debug.prototype[s] = function(){
63 |     this._l.call(this, s, arguments);
64 |   };
65 | });


--------------------------------------------------------------------------------
/examples/basic.js:
--------------------------------------------------------------------------------
 1 | var _      = require('lodash'),
 2 |     Spider = require('../');
 3 | 
 4 | var spider = Spider({
 5 |   debug      : true,
 6 |   workers    : 7,
 7 |   concurrency: 1
 8 | });
 9 | spider.on('error', function(err, req){
10 |   if (req.worker) {
11 |     console.error('worker #', req.worker, 'has an error:', err.message);
12 |   } else {
13 |     console.error(req.uri, err.message);
14 |   }
15 | });
16 | spider.on('data', function(req, res){
17 |   if (req._type == Spider.type.LINK) {
18 |     spider.read(_.filter(res, validLink));
19 |   } else if (req._type == Spider.type.ARTICLE) {
20 |     console.log(req.uri, res.title);
21 |   }
22 | });
23 | spider.on('end', function(){
24 |   console.log('[END]');
25 | });
26 | 
27 | spider.crawl([
28 |   'http://www.sina.com.cn',
29 |   'http://www.163.com',
30 |   'http://www.autohome.com.cn',
31 |   'http://www.sohu.com'
32 | ]);
33 | 
34 | /*
35 |  setTimeout(function(){
36 |  spider.crawl([
37 |  'http://getbootstrap.com/components/',
38 |  'https://lodash.com/docs#compact',
39 |  'https://www.npmjs.org/package/read-art'
40 |  ]);
41 |  }, 200);*/
42 | 
43 | /*
44 |  setTimeout(function(){
45 |  spider.destroy();
46 |  }, 500);*/
47 | 
48 | setTimeout(function(){
49 |   var pong = spider.ping();
50 |   console.log(pong);
51 | }, 20000);
52 | 
53 | function validLink(ele){
54 |   if (!ele.uri || !ele.title) {
55 |     return false;
56 |   }
57 |   /**
58 |    * must be:
59 |    * 1. uri must have 4 digital at least
60 |    * 2. uri can not be a bitmap
61 |    * 3. uri can not have no path
62 |    * 4. length of title must greater than 5
63 |    */
64 |   var qsi, uri = ele.uri;
65 |   if ((qsi = uri.indexOf('?')) > 0) {
66 |     uri = uri.substr(0, qsi);
67 |   }
68 |   return uri.match(/\d{4,}/i) && !uri.match(/\.(jpg|png|jpeg|pdf)/i) && uri.indexOf('/') != uri.length - 1 && ele.title.length >= 5;
69 | }


--------------------------------------------------------------------------------
/lib/util/crawler.js:
--------------------------------------------------------------------------------
 1 | var _ = require('lodash'),
 2 |   chalk = require('chalk'),
 3 |   req = require('req-fast'),
 4 |   Debug = require('./debug');
 5 | 
 6 | module.exports = Crawler;
 7 | 
 8 | function Crawler(options) {
 9 |   if (!(this instanceof Crawler)) {
10 |     return new Crawler(options);
11 |   }
12 | 
13 |   this.options = _.defaults(options || {}, {
14 |     method: 'GET',
15 |     timeout: 10000,
16 |     debug: false
17 |   });
18 | 
19 |   this._log = Debug({
20 |     namespace: 'crawler',
21 |     debug: this.options.debug
22 |   });
23 | }
24 | 
25 | _.assign(Crawler.prototype, {
26 |   crawl: function (url, fn) {
27 |     if (_.isArray(url)) {
28 |       for (var i = 0; i < url.length; i++) {
29 |         this.crawl(url[i], fn);
30 |       }
31 |       return;
32 |     }
33 |     if (typeof url == 'string') {
34 |       url = {uri: url};
35 |     }
36 |     if (!url || !url.uri) {
37 |       return fn(new Error('`uri` is required'), url);
38 |     }
39 |     _.defaults(url, this.options);
40 |     this._request(url, fn);
41 |   },
42 |   destroy: function () {
43 |     this._destroyed = true;
44 |   },
45 |   _request: function (url, fn) {
46 |     this._log.i('http', chalk.magenta('GET'), chalk.underline.grey(url.uri));
47 |     var pickedKeys = ['uri', 'method', 'timeout', 'dataType', 'data', 'agent', 'charset', 'disableRedirect', 'maxRedirects', 'disableGzip', 'trackCookie', 'headers', 'cookies', 'proxy'],
48 |         options = _.pick(url, pickedKeys);
49 |     options.__data = _.omit(url, pickedKeys);
50 | 
51 |     req(options, function (callback, error, resp) {
52 |       if (this._destroyed) {
53 |         error = new Error('request was destroyed.');
54 |       }
55 |       // handle error.
56 |       if (!error && !resp) {
57 |         error = new Error('No response from server.');
58 |       } else if (!error && resp && !resp.body) {
59 |         error = new Error('No body has been found.');
60 |       }
61 |       this._processHTML(error, options, resp, callback);
62 |     }.bind(this, fn));
63 |   },
64 |   _processHTML: function (error, data, resp, fn) {
65 |     var uriArg = chalk.underline.grey(data.uri);
66 | 
67 |     // handle error.
68 |     if (error) {
69 |       this._log.e(error.message, uriArg);
70 |       return fn(error, data);
71 |     }
72 | 
73 |     this._log.i('http', chalk.magenta(resp.statusCode), uriArg);
74 | 
75 |     var body = resp.body;
76 | 
77 |     // handle JSON.
78 |     if (typeof body == 'object') {
79 |       var errMsg = 'Body is a type of JSON, can not be crawled.';
80 |       this._log.w(errMsg, uriArg);
81 |       return fn(new Error(errMsg), data);
82 |     }
83 | 
84 |     // make sure response body is string.
85 |     if (typeof body != 'string') {
86 |       this._log.w('Body is not a type of String, try to decode by UTF-8 encoding.', uriArg);
87 |       body = body.toString('utf-8');
88 |     }
89 | 
90 |     this._log.i('http', chalk.magenta('FIN'), uriArg);
91 |     fn(null, data, body);
92 |   }
93 | });


--------------------------------------------------------------------------------
/lib/util/helper.js:
--------------------------------------------------------------------------------
  1 | var URI = require('urijs');
  2 | 
  3 | var helper = module.exports = {
  4 |   SPIDER_TYPE: {
  5 |     LINK: 'link',
  6 |     ARTICLE: 'article'
  7 |   }
  8 | };
  9 | 
 10 | /**
 11 |  * Analyze href of the specific anchor
 12 |  * @param {Object} options, including:
 13 |  *                  {URIjs} baseURI
 14 |  *                  {Cheerio} ele anchor
 15 |  *                  {Function} predication verification
 16 |  * @returns {*}
 17 |  */
 18 | helper.analyzeHref = function(options){
 19 |   var href = options.ele.attr('href'), parent;
 20 |   // if href attribute does not exist.
 21 |   if(!href && (parent = options.ele.parent()) && parent.length > 0){
 22 |     var onclick;
 23 |     // try to get href from onclick if onclick like `window.open('/link_to.html')`.
 24 |     if(onclick = parent.attr('onclick')){
 25 |       var link;
 26 |       if(link = onclick.match(/['"]([^'"]+)['"]/)){
 27 |         href = link[1];
 28 |       }
 29 |     }
 30 |   }
 31 |   // if href still does not exist, returns nothing.
 32 |   if(!href){
 33 |     return;
 34 |   }
 35 | 
 36 |   // trim href.
 37 |   href = href.trim();
 38 | 
 39 |   // if href only contains hash, returns nothing.
 40 |   if(href.indexOf('#') == 0){
 41 |     return;
 42 |   }
 43 | 
 44 |   var uri;
 45 | 
 46 |   // try to parse href as URIjs object.
 47 |   try{
 48 |     uri = URI(href);
 49 |     if(uri.is('relative')){
 50 |       // make sure it is an absolute url.
 51 |       uri = uri.absoluteTo(options.baseURI);
 52 |     }
 53 |   }catch(err){
 54 |     return;
 55 |   }
 56 |   // both of them must in a same domain.
 57 |   if(options.domain && uri.domain().toLowerCase() != options.baseURI.domain().toLowerCase()){
 58 |     return;
 59 |   }
 60 | 
 61 |   // expose returning value.
 62 |   var retVal = {
 63 |     uri: uri.href(),
 64 |     title: (options.ele.attr('title') || options.ele.text() || '').trim().replace(/[\r\n\t]/g, ' ')
 65 |   };
 66 | 
 67 |   return retVal;
 68 | };
 69 | 
 70 | /**
 71 |  * Analyze links on the specific element (including children-anchors).
 72 |  * @param {Object} options
 73 |  *          {String} url basic url
 74 |  *          {Cheerio} ele body cheerio element
 75 |  *          {Function} predication verification
 76 |  *          {Function} onProcess process data
 77 |  * @returns {*}
 78 |  */
 79 | helper.analyzeLinks = function(options){
 80 |   var baseURI;
 81 |   try{
 82 |     // parse base url as URIjs object.
 83 |     baseURI = URI(options.url).normalize();
 84 | 
 85 |     // the `base` tag in `head` takes top priority as base href.
 86 |     var baseEle = options.ele('head>base'), baseHref;
 87 |     if(baseEle && baseEle.length > 0 && (baseHref = baseEle.eq(0).attr('href'))){
 88 |       var baseHrefURI = URI(baseHref);
 89 |       if(baseHrefURI.is('absolute')){
 90 |         baseURI = baseHrefURI;
 91 |       }else{
 92 |         baseURI = baseHrefURI.absoluteTo(baseURI);
 93 |       }
 94 |     }
 95 |   }catch(err){
 96 |     return err.message;
 97 |   }
 98 | 
 99 |   var links = [];
100 |   // map all anchors as simple {title: '', href:''} object.
101 |   options.ele('a').each(function(idx,lnk){
102 |     var ele = helper.analyzeHref({
103 |       ele: options.ele(this),
104 |       domain: options.domain,
105 |       baseURI: baseURI
106 |     });
107 |     ele && links.push(ele);
108 |   });
109 |   return links;
110 | }
111 | 


--------------------------------------------------------------------------------
/lib/cluster/spider.js:
--------------------------------------------------------------------------------
  1 | var async = require('async'),
  2 |   cheerio = require('cheerio'),
  3 |   helper = require('../util/helper'),
  4 |   Crawler = require('../util/crawler'),
  5 |   read = require('read-art'),
  6 |   _ = require('lodash');
  7 | 
  8 | var options = JSON.parse(process.argv.slice(2));
  9 | 
 10 | run();
 11 | 
 12 | function run(){
 13 |   var crawler = Crawler(options);
 14 | 
 15 |   var queue = async.queue(function (job, fn) {
 16 |     crawler.crawl(job, processData.bind(null, job._type, fn));
 17 |   }, options.concurrency);
 18 | 
 19 |   /**
 20 |    * Listen on message.
 21 |    */
 22 |   process.on('message', function(msg){
 23 |     // zip array to an object
 24 |     var req = _.zipObject(['code', 'data', 'type'], msg);
 25 |     if(req.code == 'kill'){
 26 |       // try to quit peaceful.
 27 |       crawler.destroy();
 28 |       queue.kill();
 29 |       process.exit(0);
 30 |     }else if(req.code == 'data'){
 31 |       // wrap data, and begin working.
 32 |       queue.push(_.map(req.data, function(url){
 33 |         if(_.isString(url)){
 34 |           url = {uri: url};
 35 |         }
 36 |         return _.assign(url, {_type: req.type});
 37 |       }));
 38 |     }
 39 |   });
 40 | }
 41 | 
 42 | /**
 43 |  * Process data.
 44 |  * @param {String} type `links` or `article`
 45 |  * @param {Function} callback of queue
 46 |  * @param {Error} err
 47 |  * @param {Object} data
 48 |  * @param {String} body
 49 |  */
 50 | function processData(type, callback, err, data, body){
 51 |   // make sure process is connected.
 52 |   if(!process.connected) {
 53 |     return callback();
 54 |   }
 55 |   // bind type.
 56 |   _.assign(data, {_type: type});
 57 | 
 58 |   var result;
 59 |   if(err){
 60 |     // handle error.
 61 |     result = err.message;
 62 |   }else if (type == helper.SPIDER_TYPE.LINK) {
 63 |     // crawl links from site.
 64 |     result = crawlLinks(data, body);
 65 |   }else if(type == helper.SPIDER_TYPE.ARTICLE){
 66 |     // read article by url.
 67 |     return read(body, function(data, callback, err, art){
 68 |       if(err){
 69 |         process.send(['fin', data, {error: err.message}]);
 70 |       }else{
 71 |         process.send(['fin', data, {title: art.title, content: art.content}]);
 72 |       }
 73 |       art = null;
 74 |       return callback();
 75 |     }.bind(null, data, callback));
 76 |   }else{
 77 |     result = 'only `links` and `articles` can be crawled.';
 78 |   }
 79 |   // send finish signal.
 80 |   process.send(['fin', data, _.isString(result) ? {error: result} : result]);
 81 |   callback();
 82 | }
 83 | 
 84 | /**
 85 |  * Crawl links from a site.
 86 |  * @param {Object} data request data
 87 |  * @param {Object} body html body
 88 |  * @returns {*}
 89 |  */
 90 | function crawlLinks(data, body){
 91 |   // body must be a HTML string.
 92 |   if (body.search(/^\s*</) < 0) {
 93 |     return 'Body is not a type of HTML, can not be loaded by cheerio.';
 94 |   }
 95 |   // try to load by cheerio.
 96 |   try {
 97 |     var $ = cheerio.load(body, {
 98 |       ignoreWhitespace: false,
 99 |       xmlMode: false,
100 |       lowerCaseTags: true,
101 |       decodeEntities: false
102 |     });
103 | 
104 |     // analyze links.
105 |     var result = helper.analyzeLinks({
106 |       ele: $,
107 |       url: data.uri,
108 |       domain: !!options.domain
109 |     });
110 |     return result;
111 |   } catch (err) {
112 |     return err.message;
113 |   }
114 | }
115 | 
116 | 


--------------------------------------------------------------------------------
/lib/cluster/nest.js:
--------------------------------------------------------------------------------
  1 | var Debug = require('../util/debug'),
  2 |   helper = require('../util/helper'),
  3 |   path = require('path'),
  4 |   _ = require('lodash'),
  5 |   chalk = require('chalk'),
  6 |   cluster = require("cluster"),
  7 |   EventEmitter = require('events').EventEmitter,
  8 |   numCPUs = require('os').cpus().length;
  9 | 
 10 | module.exports = Nest;
 11 | 
 12 | function Nest(options) {
 13 |   if (!(this instanceof Nest)) {
 14 |     return new Nest(options);
 15 |   }
 16 |   options = _.defaults(options || {}, {
 17 |     debug: false,
 18 |     workers: numCPUs,
 19 |     concurrency: 1
 20 |   });
 21 |   this.options = options;
 22 | 
 23 |   this._log = Debug({
 24 |     namespace: 'sp-nest',
 25 |     debug: this.options.debug
 26 |   });
 27 | }
 28 | 
 29 | Nest.prototype.__proto__ = EventEmitter.prototype;
 30 | 
 31 | _.assign(Nest.prototype, {
 32 |   forage: function(urls, type){
 33 |     this._setupCluster();
 34 | 
 35 |     if(!_.isArray(urls)){
 36 |       urls = [urls];
 37 |     }
 38 |     this._allocateJobs(urls, type);
 39 |   },
 40 |   ping: function(){
 41 |     var result = [];
 42 |     for(var id in cluster.workers){
 43 |       var w = cluster.workers[id];
 44 |       result.push({
 45 |         id: id,
 46 |         suicide: !!w.suicide,
 47 |         count: w.count || 0
 48 |       });
 49 |     }
 50 |     return result;
 51 |   },
 52 |   destroy: function () {
 53 |     _.values(cluster.workers).forEach(function (worker) {
 54 |       this._log.i('spider', chalk.blue('#' + worker.id), 'was destroyed');
 55 |       worker.count = 0;
 56 |       worker.send(['kill']);
 57 |       worker.disconnect();
 58 |     }.bind(this));
 59 |     this._setup = false;
 60 |   },
 61 |   _setupCluster: function () {
 62 |     if(this._setup){
 63 |       return;
 64 |     }
 65 |     this._setup = true;
 66 | 
 67 |     if (this.options.workers > numCPUs || this.options.workers < 0) {
 68 |       this._log.w('cluster', 'maximize of workers is', numCPUs, 'but currency', this.options.workers);
 69 |       this.options.workers = numCPUs;
 70 |     }
 71 |     this._log.i('cluster', chalk.bold.yellow('setup master...'));
 72 | 
 73 |     cluster.setupMaster({
 74 |       exec: path.resolve(__dirname, 'spider.js'),
 75 |       args: [JSON.stringify(_.omit(this.options, 'workers'))],
 76 |       silent: false
 77 |     });
 78 | 
 79 |     for (var i = 0; i < this.options.workers; i++) {
 80 |       var worker = cluster.fork();
 81 |       worker.count = 0;
 82 |       this._log.i('spider', chalk.blue('#' + worker.id),  'has prepared to work');
 83 |       worker.on('error', function (id, err) {
 84 |           this._log.e('spider', chalk.blue('#' + id), 'has an error:', err.message);
 85 |           this.emit('error', err, {worker: id});
 86 |         }.bind(this, worker.id))
 87 |         .on('exit', function (id, code, sgnal) {
 88 |           this._log.i('spider', chalk.blue('#' + id),  'has exited yet, code:', (!isNaN(code) ? code : 'unknown'));
 89 |           if(this.ping() == 0){
 90 |             this.emit('end');
 91 |           }
 92 |         }.bind(this, worker.id))
 93 |         .on('message', this._processData.bind(this, worker.id));
 94 |     }
 95 |   },
 96 |   _allocateJobs: function(urls, type){
 97 |     // grep id and count, then sort by count.
 98 |     var wks = _(cluster.workers).values().map(function(w){
 99 |       return {id: w.id, count: w.count};
100 |     }).sortBy(function(w){
101 |       return w.count;
102 |     }).clone();
103 | 
104 |     if(wks.length == 0){
105 |       return;
106 |     }
107 | 
108 |     var max = wks[wks.length - 1].count;
109 | 
110 |     function allocate(_urls, _wks, _fill){
111 |       var size = 0, extra = 0;
112 |       if(_fill) {
113 |         size = parseInt(_urls.length / this.options.workers);
114 |         extra = _urls.length % this.options.workers, i = 0;
115 |       }
116 |       for(var i = 0; i < _wks.length; i++){
117 |         var data = _urls.splice(0, size + ((i < extra) ? 1 : 0) + max - _wks[i].count);
118 |         if(data.length == 0) {
119 |           break;
120 |         }
121 | 
122 |         var worker = cluster.workers[_wks[i].id];
123 |         worker.count += data.length;
124 |         worker.send(['data', data, type]);
125 |       }
126 |     }
127 | 
128 |     // Equal allocation
129 |     if(max == 0){
130 |       return allocate.call(this, urls, wks, true);
131 |     }
132 |     // load balance
133 |     var sum = _(wks).map(function(w){ return w.count; }).reduce(function(sum, num){ return sum + num; });
134 |     allocate.call(this, urls, wks, urls.length > max * wks.length - sum)
135 |   },
136 |   _processData: function(id, data){
137 |     cluster.workers[id].count--;
138 | 
139 |     var resp = _.zipObject(['code', 'data', 'body'], data);
140 | 
141 |     if(resp.code == 'fin'){
142 |       if(resp.body && resp.body.error) {
143 |         return this.emit('error', new Error(resp.body.error), resp.data);
144 |       }
145 |       this.emit('data', resp.data, resp.body);
146 |     }
147 |   }
148 | });
149 | 
150 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | spider2 [![NPM version](https://badge.fury.io/js/spider2.svg)](http://badge.fury.io/js/spider2)
  2 | =========
  3 | 
  4 | A 2nd generation spider to crawl any article site, automatic reading title and content.
  5 | # Performance
  6 | In my case, the speed of [spider](https://github.com/Tjatse/spider2) is about **700 thousands documents per day**, **22 million per month**, and the maximize crawling speed is **450 per minute**, **avg 80 per minute**, the memory cost are about **200 megabytes** on each spider kernel, and the accuracy is about 90%, the rest 10% can be fixed by customizing [Score Rules](#score_rule) or [Selectors](selectors). it's better than any other readability modules.
  7 | 
  8 | ![image](screenshots/performance.jpg)
  9 | 
 10 | > Server infos: 
 11 | > * 20M bandwidth of fibre-optical
 12 | > * 8 Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz cpus
 13 | > * 32G memory
 14 | 
 15 | # Features
 16 | ## Multi-core crawling
 17 | It is not a single spider runs in a single thread. To take advantage of multi-core systems we maybe wanna launch
 18 | a cluster of processes to handle the load, this is exactly what `spider2` does - Crawling fast and in order to
 19 | maximum performance.
 20 | 
 21 | ## Concurrency
 22 | **Multi-core crawling** feature is just make spiders work in a fork mode, but concurrency makes them work together in a
 23 | same thread and at a same time!!!
 24 | 
 25 | ## Automatic
 26 | The old school crawls links/articles in a manual mode, e.g.: *request to server and get the response(HTML), then using
 27 | jQuery or something else to analyze links/articles by hard code*, this feels sucks, currently, we just need to make a list
 28 | of websites that you wanna scrape, `spider2` will handle anything else, take a cup of coffee, and wait to harvest the fruit.
 29 | 
 30 | ## Queue
 31 | All the jobs are managed by async queue, so you can keep pushing the urls which to be crawled/read.
 32 | 
 33 | ## More features
 34 | - Automatic User-Agent (actually I am a browser, not a spider).
 35 | - Proxy supports (avoid being blocked by server).
 36 | - Blazing fast with Cheerio & Cluster Fork Mode.
 37 | - Automatic Decode encodings (especially useful with non-english language).
 38 | - ...
 39 | 
 40 | # Installation
 41 | ```
 42 | npm install spider2 --production
 43 | ```
 44 | 
 45 | # Programmatic
 46 | ## Require
 47 | ```javascript
 48 | var Spider = require('spider2');
 49 | ```
 50 | 
 51 | ## Usage
 52 | ```javascript
 53 | var spider = Spider({
 54 |   timeout: 5000,
 55 |   debug: true,
 56 |   domain: true,
 57 |   workers: 7,
 58 |   concurrency: 1
 59 | });
 60 | ```
 61 | ### Options
 62 | The options including:
 63 | - **timeout** Set a timeout (in milliseconds) for the request, `10000` by default.
 64 | - **debug** A value indicating whether show the debug log or not, `false` by default, also it could be set with `process.env.SP_DEBUG`.
 65 | - **domain** A value indicating whether the links being crawled should in a same domain with the base url or not, `true` by default.
 66 | - **workers** Number of multi-core, can not greater than number of CPUs.
 67 | - **concurrency** number of concurrency per worker, `1` by default.
 68 | 
 69 | ### Events
 70 | #### error
 71 | This event is emitted when an error has been caught, the arguments including:
 72 | - `err` Error object
 73 | - `req` Request data, if `req.worker` is defined an it is a number, means error is from the worker, `req.worker` is the id of a worker, otherwise it is a normal error.
 74 | Example:
 75 | ```javascript
 76 | spider.on('error', function (err, req) {
 77 |   if (req.worker) {
 78 |     console.error('worker #', req.worker, 'has an error:', err.message);
 79 |   } else {
 80 |     console.error(req.uri, err.message);
 81 |   }
 82 | });
 83 | ```
 84 | 
 85 | #### data
 86 | Data send by spider are obtained using this event, arguments including:
 87 | - `req` Request data.
 88 | - `res` Response data, if `req._type` equals `Spider.type.LINK`, `res` should be an array, including key-value pairs like `{title: [ANCHOR_TITLE], uri: [ANCHOR_HREF]}`, and if equals `Spider.type.ARTICLE`, `res` should be an object, keys including `title` and `content`.
 89 | Example:
 90 | ```javascript
 91 | spider.on('data', function (req, res) {
 92 |   if (req._type == Spider.type.LINK) {
 93 |     spider.read(_.filter(res, validLink));
 94 |   } else if (req._type == Spider.type.ARTICLE) {
 95 |     console.log(req.uri, res.title);
 96 |   }
 97 | });
 98 | ```
 99 | 
100 | #### end
101 | This event is emitted after all the spiders terminated abnormally, e.g.:
102 | ```javascript
103 | spider.on('end', function () {
104 |   console.log('[END]');
105 | });
106 | ```
107 | 
108 | ### Methods
109 | #### crawl
110 | Crawl links, `OPTION` could be one of below:
111 | - **String** Url
112 | - **Array** Array of urls, both `[String, String, ...]` and `[Object, Object, ...]` will be fine.
113 | - **Object** Must including `uri` property.
114 | 
115 | e.g.:
116 | ```javascript
117 | spider.crawl([OPTION]);
118 | ```
119 | 
120 | #### read
121 | Read title and content of article, `OPTION` is same as above, e.g.:
122 | ```javascript
123 | spider.read([OPTION]);
124 | ```
125 | 
126 | #### destroy
127 | Peaceful quit, e.g.:
128 | ```javascript
129 | spider.destroy();
130 | ```
131 | 
132 | #### ping
133 | Ping the spider and returns workers' status Array, e.g.:
134 | ```javascript
135 | var pong = spider.ping();
136 | console.log(pong);
137 | ```
138 | 
139 | `pong` will be printed like:
140 | ```
141 | [
142 |   {id: 1, count: 12},
143 |   {id: 2, count: 90},
144 |   ...
145 | ]
146 | ```
147 | 
148 | `id` is the id of worker, and `count` is the count of remaining jobs.
149 | 
150 | # Test
151 | ```
152 | npm test
153 | ```
154 | 
155 | # Examples
156 | Head over to `test/` or `/examples` directory.
157 | 
158 | # TODO
159 | - [ ] fix typo bug
160 | - [ ] more tests
161 | 
162 | # License
163 | Copyright 2014 Tjatse
164 | 
165 | Licensed under the Apache License, Version 2.0 (the "License");
166 | you may not use this file except in compliance with the License.
167 | You may obtain a copy of the License at
168 | 
169 |     http://www.apache.org/licenses/LICENSE-2.0
170 | 
171 | Unless required by applicable law or agreed to in writing, software
172 | distributed under the License is distributed on an "AS IS" BASIS,
173 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
174 | See the License for the specific language governing permissions and
175 | limitations under the License.


--------------------------------------------------------------------------------