├── .gitignore
├── .npmignore
├── screenshots
│   ├── debug.jpg
│   └── performance.jpg
├── package.json
├── examples
│   ├── sina.news.js
│   └── basic.js
├── lib
│   ├── spider.js
│   ├── util
│   │   ├── debug.js
│   │   ├── crawler.js
│   │   └── helper.js
│   └── cluster
│       ├── spider.js
│       └── nest.js
└── README.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
node_modules
.idea

--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
node_modules
.idea
.gitignore
screenshots

--------------------------------------------------------------------------------
/screenshots/debug.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjatse/spider2/HEAD/screenshots/debug.jpg

--------------------------------------------------------------------------------
/screenshots/performance.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tjatse/spider2/HEAD/screenshots/performance.jpg

--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "spider2",
  "version": "0.0.7",
  "description": "A 2nd generation spider to crawl any article site, automatically reading title and content.",
  "main": "./lib/spider.js",
  "scripts": {},
  "repository": {
    "type": "git",
    "url": "git://github.com/Tjatse/spider2.git"
  },
  "keywords": [
    "crawl",
    "crawling",
    "spider",
    "spidering",
    "readability",
    "scrape"
  ],
  "author": "Tjatse",
  "license": "Apache-2.0",
  "bugs": {
    "url": "https://github.com/Tjatse/spider2/issues"
  },
  "engines": {
    "node": ">= 0.8.0"
  },
  "dependencies": {
    "cheerio": "~0.19.0",
    "lodash": "~3.10.1",
    "read-art": "^0.4.2",
    "req-fast": "^0.2.9",
    "async": "~1.5.0",
    "chalk": "~1.1.1",
    "urijs": "~1.17.0"
  },
  "devDependencies": {
    "mocha": "1.21.4",
    "chai": "1.9.1"
  },
  "readmeFilename": "README.md"
}

--------------------------------------------------------------------------------
/examples/sina.news.js:
--------------------------------------------------------------------------------
var _ = require('lodash'),
    Spider = require('../');

var spider = Spider({
  debug: true,
  concurrency: 5
});
spider.on('error', function (err, req) {
  if (req.worker) {
    console.error('worker #', req.worker, 'has an error:', err.message);
  } else {
    console.error(req.uri, err.message);
  }
});
spider.on('data', function (req, res) {
  if (req._type == Spider.type.LINK) {
    var links = _.filter(res, validLink);
    console.log('number of article links:', links.length);
    spider.read(links);
  } else if (req._type == Spider.type.ARTICLE) {
    console.log(req.uri, res.title);
  }
});
spider.on('end', function () {
  console.log('[END]');
});

spider.crawl([
  'http://news.sina.com.cn'
]);

function validLink(ele) {
  if (!ele.uri || !ele.title) {
    return false;
  }
  /**
   * A link is kept only if:
   * 1. the uri contains at least 4 digits
   * 2. the uri does not point to an image or PDF
   * 3. the uri does not end with a slash (i.e. it has a real path)
   * 4. the title is at least 5 characters long
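   * Links that fail any rule are dropped before they are handed to spider.read.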
   */
  var qsi, uri = ele.uri;
  if ((qsi = uri.indexOf('?')) > 0) {
    uri = uri.substr(0, qsi);
  }
  // use lastIndexOf so urls that end with a slash (no real path) are rejected.
  return uri.match(/\d{4,}/) && !uri.match(/\.(jpg|png|jpeg|pdf)/i) && uri.lastIndexOf('/') != uri.length - 1 && ele.title.length >= 5;
}

--------------------------------------------------------------------------------
/lib/spider.js:
--------------------------------------------------------------------------------
var Debug = require('./util/debug'),
    _ = require('lodash'),
    Nest = require('./cluster/nest'),
    helper = require('./util/helper'),
    EventEmitter = require('events').EventEmitter;

module.exports = Spider;

function Spider(options) {
  if (!(this instanceof Spider)) {
    return new Spider(options);
  }
  this.options = _.defaults(options || {}, {
    debug: process.env.SP_DEBUG,
    domain: true
    // workers: numCPUs
    // concurrency: 1
  });

  this._log = Debug({
    namespace: 'spider2',
    debug: this.options.debug
  });

  this.nest = Nest(this.options);
  this.nest.on('error', this._handleError.bind(this));
  this.nest.on('data', this._processData.bind(this));
  this.nest.on('end', this.emit.bind(this, 'end'));
}

Spider.prototype.__proto__ = EventEmitter.prototype;

_.assign(Spider.prototype, {
  crawl: function (urls) {
    this.nest.forage(urls, helper.SPIDER_TYPE.LINK);
  },
  read: function (urls) {
    this.nest.forage(urls, helper.SPIDER_TYPE.ARTICLE);
  },
  destroy: function () {
    this._log.w('destroy', 'graceful exit');
    this.nest.destroy();
  },
  ping: function () {
    return this.nest.ping();
  },
  _handleError: function (err, data) {
    this.emit('error', err, data);
  },
  _processData: function (data, body) {
    this.emit('data', data, body);
  }
});

Spider.type = helper.SPIDER_TYPE;

--------------------------------------------------------------------------------
/lib/util/debug.js:
--------------------------------------------------------------------------------
var chalk = require('chalk'),
    _ = require('lodash');

module.exports = Debug;

/**
 * Simple debug tool.
 * @param {Object} options
 * @returns {Debug}
 * @constructor
 */
function Debug(options) {
  if (!(this instanceof Debug)) {
    return new Debug(options);
  }
  if (typeof options == 'string') {
    options = {
      namespace: options
    };
  }
  this.options = _.defaults(options || {}, {
    namespace: 'SP2',
    timestamp: true,
    debug: false
  });
}
Debug.prototype._l = function (level, args) {
  if (!this.options.debug) {
    return;
  }
  args = _.values(args);

  var prints = [chalk.bgBlack.grey(this.options.namespace)];
  var prefix, color;
  switch (level) {
    case 'e':
      prefix = 'ERR!', color = 'red';
      break;
    case 'w':
      prefix = 'warn', color = 'yellow';
      break;
    case 'd':
      if (this.options.timestamp) {
        prints.push(chalk.underline.dim((new Date()).toISOString()));
      }
      break;
    default:
      prefix = args.splice(0, 1), color = 'green';
      break;
  }
  if (prefix && color) {
    prints.splice(2, 0, chalk.bgBlack[color](prefix));
  }
  prints.push(args.join(' '));
  console.log.apply(null, prints);
};

/**
 * Loggers: info, error, debug, log, warn.
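 * The short method names i/e/d/l/w map to info, error, debug, log and warn respectively.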
 */
['i', 'e', 'd', 'l', 'w'].forEach(function (s) {
  Debug.prototype[s] = function () {
    this._l.call(this, s, arguments);
  };
});

--------------------------------------------------------------------------------
/examples/basic.js:
--------------------------------------------------------------------------------
var _ = require('lodash'),
    Spider = require('../');

var spider = Spider({
  debug: true,
  workers: 7,
  concurrency: 1
});
spider.on('error', function (err, req) {
  if (req.worker) {
    console.error('worker #', req.worker, 'has an error:', err.message);
  } else {
    console.error(req.uri, err.message);
  }
});
spider.on('data', function (req, res) {
  if (req._type == Spider.type.LINK) {
    spider.read(_.filter(res, validLink));
  } else if (req._type == Spider.type.ARTICLE) {
    console.log(req.uri, res.title);
  }
});
spider.on('end', function () {
  console.log('[END]');
});

spider.crawl([
  'http://www.sina.com.cn',
  'http://www.163.com',
  'http://www.autohome.com.cn',
  'http://www.sohu.com'
]);

/*
setTimeout(function(){
  spider.crawl([
    'http://getbootstrap.com/components/',
    'https://lodash.com/docs#compact',
    'https://www.npmjs.org/package/read-art'
  ]);
}, 200);
*/

/*
setTimeout(function(){
  spider.destroy();
}, 500);
*/

setTimeout(function () {
  var pong = spider.ping();
  console.log(pong);
}, 20000);

function validLink(ele) {
  if (!ele.uri || !ele.title) {
    return false;
  }
  /**
   * A link is kept only if:
   * 1. the uri contains at least 4 digits
   * 2. the uri does not point to an image or PDF
   * 3. the uri does not end with a slash (i.e. it has a real path)
   * 4. the title is at least 5 characters long
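   * Note: any query string is stripped from the uri before these rules are applied.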
   */
  var qsi, uri = ele.uri;
  if ((qsi = uri.indexOf('?')) > 0) {
    uri = uri.substr(0, qsi);
  }
  // use lastIndexOf so urls that end with a slash (no real path) are rejected.
  return uri.match(/\d{4,}/) && !uri.match(/\.(jpg|png|jpeg|pdf)/i) && uri.lastIndexOf('/') != uri.length - 1 && ele.title.length >= 5;
}

--------------------------------------------------------------------------------
/lib/util/crawler.js:
--------------------------------------------------------------------------------
var _ = require('lodash'),
    chalk = require('chalk'),
    req = require('req-fast'),
    Debug = require('./debug');

module.exports = Crawler;

function Crawler(options) {
  if (!(this instanceof Crawler)) {
    return new Crawler(options);
  }

  this.options = _.defaults(options || {}, {
    method: 'GET',
    timeout: 10000,
    debug: false
  });

  this._log = Debug({
    namespace: 'crawler',
    debug: this.options.debug
  });
}

_.assign(Crawler.prototype, {
  crawl: function (url, fn) {
    if (_.isArray(url)) {
      for (var i = 0; i < url.length; i++) {
        this.crawl(url[i], fn);
      }
      return;
    }
    if (typeof url == 'string') {
      url = {uri: url};
    }
    if (!url || !url.uri) {
      return fn(new Error('`uri` is required'), url);
    }
    _.defaults(url, this.options);
    this._request(url, fn);
  },
  destroy: function () {
    this._destroyed = true;
  },
  _request: function (url, fn) {
    this._log.i('http', chalk.magenta('GET'), chalk.underline.grey(url.uri));
    var pickedKeys = ['uri', 'method', 'timeout', 'dataType', 'data', 'agent', 'charset', 'disableRedirect', 'maxRedirects', 'disableGzip', 'trackCookie', 'headers', 'cookies', 'proxy'],
        options = _.pick(url, pickedKeys);
    options.__data = _.omit(url, pickedKeys);

    // `fn` is bound as the first argument, so `callback` below is the caller's callback.
    req(options, function (callback, error, resp) {
      if (this._destroyed) {
        error = new Error('request was destroyed.');
      }
      // handle error.
      if (!error && !resp) {
        error = new Error('No response from server.');
      } else if (!error && resp && !resp.body) {
        error = new Error('No body has been found.');
      }
      this._processHTML(error, options, resp, callback);
    }.bind(this, fn));
  },
  _processHTML: function (error, data, resp, fn) {
    var uriArg = chalk.underline.grey(data.uri);

    // handle error.
    if (error) {
      this._log.e(error.message, uriArg);
      return fn(error, data);
    }

    this._log.i('http', chalk.magenta(resp.statusCode), uriArg);

    var body = resp.body;

    // handle JSON.
    if (typeof body == 'object') {
      var errMsg = 'Body is a type of JSON, can not be crawled.';
      this._log.w(errMsg, uriArg);
      return fn(new Error(errMsg), data);
    }

    // make sure the response body is a string.
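    // (objects were already rejected above, so this only catches non-object, non-string bodies.)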
    if (typeof body != 'string') {
      this._log.w('Body is not a type of String, try to decode by UTF-8 encoding.', uriArg);
      body = body.toString('utf-8');
    }

    this._log.i('http', chalk.magenta('FIN'), uriArg);
    fn(null, data, body);
  }
});

--------------------------------------------------------------------------------
/lib/util/helper.js:
--------------------------------------------------------------------------------
var URI = require('urijs');

var helper = module.exports = {
  SPIDER_TYPE: {
    LINK: 'link',
    ARTICLE: 'article'
  }
};

/**
 * Analyze the href of a specific anchor.
 * @param {Object} options, including:
 *   {URIjs} baseURI
 *   {Cheerio} ele anchor
 *   {Function} predication verification
 * @returns {*}
 */
helper.analyzeHref = function (options) {
  var href = options.ele.attr('href'), parent;
  // if the href attribute does not exist,
  if (!href && (parent = options.ele.parent()) && parent.length > 0) {
    var onclick;
    // try to get the href from onclick, if onclick looks like `window.open('/link_to.html')`.
    if (onclick = parent.attr('onclick')) {
      var link;
      if (link = onclick.match(/['"]([^'"]+)['"]/)) {
        href = link[1];
      }
    }
  }
  // if href still does not exist, return nothing.
  if (!href) {
    return;
  }

  // trim href.
  href = href.trim();

  // if href only contains a hash, return nothing.
  if (href.indexOf('#') == 0) {
    return;
  }

  var uri;

  // try to parse href as a URIjs object.
  try {
    uri = URI(href);
    if (uri.is('relative')) {
      // make sure it is an absolute url.
      uri = uri.absoluteTo(options.baseURI);
    }
  } catch (err) {
    return;
  }
  // both urls must be in the same domain.
  if (options.domain && uri.domain().toLowerCase() != options.baseURI.domain().toLowerCase()) {
    return;
  }

  // expose the returning value.
  var retVal = {
    uri: uri.href(),
    title: (options.ele.attr('title') || options.ele.text() || '').trim().replace(/[\r\n\t]/g, ' ')
  };

  return retVal;
};

/**
 * Analyze links on the specific element (including child anchors).
 * @param {Object} options
 *   {String} url basic url
 *   {Cheerio} ele body cheerio element
 *   {Function} predication verification
 *   {Function} onProcess process data
 * @returns {*}
 */
helper.analyzeLinks = function (options) {
  var baseURI;
  try {
    // parse the base url as a URIjs object.
    baseURI = URI(options.url).normalize();

    // the `base` tag in `head` takes top priority as the base href.
    var baseEle = options.ele('head>base'), baseHref;
    if (baseEle && baseEle.length > 0 && (baseHref = baseEle.eq(0).attr('href'))) {
      var baseHrefURI = URI(baseHref);
      if (baseHrefURI.is('absolute')) {
        baseURI = baseHrefURI;
      } else {
        baseURI = baseHrefURI.absoluteTo(baseURI);
      }
    }
  } catch (err) {
    return err.message;
  }

  var links = [];
  // map all anchors to simple {uri: '', title: ''} objects.
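  // inside .each(), `this` is the raw DOM element, so it is re-wrapped with options.ele(this).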
  options.ele('a').each(function (idx, lnk) {
    var ele = helper.analyzeHref({
      ele: options.ele(this),
      domain: options.domain,
      baseURI: baseURI
    });
    ele && links.push(ele);
  });
  return links;
};

--------------------------------------------------------------------------------
/lib/cluster/spider.js:
--------------------------------------------------------------------------------
var async = require('async'),
    cheerio = require('cheerio'),
    helper = require('../util/helper'),
    Crawler = require('../util/crawler'),
    read = require('read-art'),
    _ = require('lodash');

var options = JSON.parse(process.argv[2]);

run();

function run() {
  var crawler = Crawler(options);

  var queue = async.queue(function (job, fn) {
    crawler.crawl(job, processData.bind(null, job._type, fn));
  }, options.concurrency);

  /**
   * Listen on messages from the master.
   */
  process.on('message', function (msg) {
    // zip the array into an object.
    var req = _.zipObject(['code', 'data', 'type'], msg);
    if (req.code == 'kill') {
      // try to quit peacefully.
      crawler.destroy();
      queue.kill();
      process.exit(0);
    } else if (req.code == 'data') {
      // wrap the data, and begin working.
      queue.push(_.map(req.data, function (url) {
        if (_.isString(url)) {
          url = {uri: url};
        }
        return _.assign(url, {_type: req.type});
      }));
    }
  });
}

/**
 * Process data.
 * @param {String} type `link` or `article`
 * @param {Function} callback of queue
 * @param {Error} err
 * @param {Object} data
 * @param {String} body
 */
function processData(type, callback, err, data, body) {
  // make sure the process is still connected to the master.
  if (!process.connected) {
    return callback();
  }
  // bind type.
  _.assign(data, {_type: type});

  var result;
  if (err) {
    // handle error.
    result = err.message;
  } else if (type == helper.SPIDER_TYPE.LINK) {
    // crawl links from the site.
    result = crawlLinks(data, body);
  } else if (type == helper.SPIDER_TYPE.ARTICLE) {
    // read the article.
    return read(body, function (data, callback, err, art) {
      if (err) {
        process.send(['fin', data, {error: err.message}]);
      } else {
        process.send(['fin', data, {title: art.title, content: art.content}]);
      }
      art = null;
      return callback();
    }.bind(null, data, callback));
  } else {
    result = 'only `link` and `article` types can be crawled.';
  }
  // send the finish signal.
  process.send(['fin', data, _.isString(result) ? {error: result} : result]);
  callback();
}

/**
 * Crawl links from a site.
 * @param {Object} data request data
 * @param {String} body html body
 * @returns {*}
 */
function crawlLinks(data, body) {
  // body must be an HTML string.
  if (body.search(/^\s*</) < 0) {
    return 'body is not HTML, can not analyze links.';
  }
  return helper.analyzeLinks({
    url: data.uri,
    ele: cheerio.load(body),
    domain: options.domain
  });
}

--------------------------------------------------------------------------------
/lib/cluster/nest.js:
--------------------------------------------------------------------------------
[...]

    if (this.options.workers > numCPUs || this.options.workers < 0) {
      this._log.w('cluster', 'maximum number of workers is', numCPUs, 'but got', this.options.workers);
      this.options.workers = numCPUs;
    }
    this._log.i('cluster', chalk.bold.yellow('setup master...'));

    cluster.setupMaster({
      exec: path.resolve(__dirname, 'spider.js'),
      args: [JSON.stringify(_.omit(this.options, 'workers'))],
      silent: false
    });

    for (var i = 0; i < this.options.workers; i++) {
      var worker = cluster.fork();
      worker.count = 0;
      this._log.i('spider', chalk.blue('#' + worker.id), 'is ready to work');
      worker.on('error', function (id, err) {
        this._log.e('spider', chalk.blue('#' + id), 'has an error:', err.message);
        this.emit('error', err, {worker: id});
      }.bind(this, worker.id))
      .on('exit', function (id, code, signal) {
        this._log.i('spider', chalk.blue('#' + id), 'has exited, code:', (!isNaN(code) ? code : 'unknown'));
        if (this.ping() == 0) {
          this.emit('end');
        }
      }.bind(this, worker.id))
      .on('message', this._processData.bind(this, worker.id));
    }
  },
  _allocateJobs: function (urls, type) {
    // grab the id and count of each worker, then sort by count.
    var wks = _(cluster.workers).values().map(function (w) {
      return {id: w.id, count: w.count};
    }).sortBy(function (w) {
      return w.count;
    }).clone();

    if (wks.length == 0) {
      return;
    }

    var max = wks[wks.length - 1].count;

    function allocate(_urls, _wks, _fill) {
      var size = 0, extra = 0;
      if (_fill) {
        size = Math.floor(_urls.length / this.options.workers);
        extra = _urls.length % this.options.workers;
      }
      for (var i = 0; i < _wks.length; i++) {
        var data = _urls.splice(0, size + ((i < extra) ? 1 : 0) + max - _wks[i].count);
        if (data.length == 0) {
          break;
        }

        var worker = cluster.workers[_wks[i].id];
        worker.count += data.length;
        worker.send(['data', data, type]);
      }
    }

    // equal allocation.
    if (max == 0) {
      return allocate.call(this, urls, wks, true);
    }
    // load balancing.
    var sum = _(wks).map(function (w) { return w.count; }).reduce(function (sum, num) { return sum + num; });
    allocate.call(this, urls, wks, urls.length > max * wks.length - sum);
  },
  _processData: function (id, data) {
    cluster.workers[id].count--;

    var resp = _.zipObject(['code', 'data', 'body'], data);

    if (resp.code == 'fin') {
      if (resp.body && resp.body.error) {
        return this.emit('error', new Error(resp.body.error), resp.data);
      }
      this.emit('data', resp.data, resp.body);
    }
  }
});

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
spider2 [![NPM version](https://badge.fury.io/js/spider2.svg)](http://badge.fury.io/js/spider2)
=========

A 2nd generation spider to crawl any article site, automatically reading title and content.

# Performance
In my case, the speed of [spider2](https://github.com/Tjatse/spider2) is about **700 thousand documents per day** (**22 million per month**); the maximum crawling speed is **450 documents per minute** (**80 per minute** on average), memory cost is about **200 megabytes** on each spider kernel, and accuracy is about 90%. The remaining 10% can be fixed by customizing [Score Rules](#score_rule) or [Selectors](#selectors). It performs better than any other readability module.

![image](screenshots/performance.jpg)

> Server info:
> * 20M fibre-optic bandwidth
> * 8 Intel(R) Xeon(R) E5-2650 v2 @ 2.60GHz CPUs
> * 32G memory

# Features
## Multi-core crawling
This is not a single spider running in a single thread. To take advantage of multi-core systems we may want to launch a cluster of processes to handle the load, and that is exactly what `spider2` does: it crawls fast and squeezes the maximum performance out of every core.

## Concurrency
The **multi-core crawling** feature only makes spiders work in fork mode; concurrency additionally makes each worker process several jobs in the same thread at the same time.

## Automatic
The old school crawls links/articles manually: request the server, get the response (HTML), then analyze the links/articles with jQuery or other hard-coded rules. That feels clumsy. With `spider2` you just make a list of the websites you want to scrape, and the spider handles everything else; take a cup of coffee and wait to harvest the fruit.

## Queue
All jobs are managed by an async queue, so you can keep pushing urls to be crawled/read at any time.
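For example, with a `spider` created as in the [Usage](#usage) section below, later calls just append jobs to the same queue (a minimal sketch; the urls and the delay are arbitrary):

```javascript
spider.crawl(['http://www.sina.com.cn']);

// keep feeding the spider later; new jobs are appended to the running queue.
setTimeout(function () {
  spider.crawl(['http://www.163.com', 'http://www.sohu.com']);
}, 5000);
```

The master (`lib/cluster/nest.js`) spreads each batch across the workers with the fewest pending jobs, so batches pushed at different times may be handled by different processes.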
## More features
- Automatic User-Agent (actually I am a browser, not a spider).
- Proxy support (avoid being blocked by the server).
- Blazing fast with Cheerio & cluster fork mode.
- Automatic encoding detection and decoding (especially useful with non-English languages).
- ...
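Per-request options such as `headers`, `cookies` and `proxy` are picked off each url object in `lib/util/crawler.js` and forwarded to the underlying [req-fast](https://www.npmjs.org/package/req-fast) request. A minimal sketch (the proxy address is a placeholder; the value format is whatever `req-fast` accepts for its `proxy` option):

```javascript
spider.crawl({
  uri: 'http://news.sina.com.cn',
  // forwarded untouched to req-fast; replace with a real proxy of yours.
  proxy: 'http://127.0.0.1:8080'
});
```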
# Installation
```
npm install spider2 --production
```

# Programmatic
## Require
```javascript
var Spider = require('spider2');
```

## Usage
```javascript
var spider = Spider({
  timeout: 5000,
  debug: true,
  domain: true,
  workers: 7,
  concurrency: 1
});
```
### Options
The options include:
- **timeout** Timeout (in milliseconds) for each request, `10000` by default.
- **debug** A value indicating whether to show the debug log, `false` by default; it can also be set with the `SP_DEBUG` environment variable.
- **domain** A value indicating whether the crawled links must be in the same domain as the base url, `true` by default.
- **workers** Number of worker processes, can not be greater than the number of CPUs.
- **concurrency** Number of concurrent jobs per worker, `1` by default.
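For instance, debug logging can be switched on from the environment without touching code (a sketch, assuming a POSIX shell; `my-spider.js` stands for any script that creates a Spider without setting `debug`):

```
SP_DEBUG=true node my-spider.js
```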
### Events
#### error
This event is emitted when an error has been caught, with arguments:
- `err` Error object.
- `req` Request data; if `req.worker` is defined and is a number, the error comes from a worker and `req.worker` is that worker's id, otherwise it is a normal request error.

Example:
```javascript
spider.on('error', function (err, req) {
  if (req.worker) {
    console.error('worker #', req.worker, 'has an error:', err.message);
  } else {
    console.error(req.uri, err.message);
  }
});
```

#### data
Data sent by the spider is obtained through this event, with arguments:
- `req` Request data.
- `res` Response data; if `req._type` equals `Spider.type.LINK`, `res` is an array of key-value pairs like `{title: [ANCHOR_TITLE], uri: [ANCHOR_HREF]}`, and if it equals `Spider.type.ARTICLE`, `res` is an object with `title` and `content` keys.

Example:
```javascript
spider.on('data', function (req, res) {
  if (req._type == Spider.type.LINK) {
    spider.read(_.filter(res, validLink));
  } else if (req._type == Spider.type.ARTICLE) {
    console.log(req.uri, res.title);
  }
});
```

#### end
This event is emitted after all the spider workers have exited, e.g.:
```javascript
spider.on('end', function () {
  console.log('[END]');
});
```

### Methods
#### crawl
Crawl links. `OPTION` could be one of the following:
- **String** A url.
- **Array** An array of urls; both `[String, String, ...]` and `[Object, Object, ...]` are fine.
- **Object** Must include a `uri` property.

e.g.:
```javascript
spider.crawl([OPTION]);
```

#### read
Read the title and content of articles; `OPTION` is the same as above, e.g.:
```javascript
spider.read([OPTION]);
```

#### destroy
Quit peacefully, e.g.:
```javascript
spider.destroy();
```

#### ping
Ping the spider, returns an Array of worker statuses, e.g.:
```javascript
var pong = spider.ping();
console.log(pong);
```

`pong` will be printed like:
```
[
  {id: 1, count: 12},
  {id: 2, count: 90},
  ...
]
```

`id` is the id of the worker, and `count` is the number of remaining jobs.

# Test
```
npm test
```

# Examples
Head over to the `test/` or `examples/` directories.

# TODO
- [ ] fix typo bug
- [ ] more tests

# License
Copyright 2014 Tjatse

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

--------------------------------------------------------------------------------