├── .gitignore ├── .jshintignore ├── .jshintrc ├── CHANGELOG.md ├── LICENSE ├── README.md ├── index.js ├── lib └── document.js ├── package.json └── test ├── document.js ├── mocha.opts └── spider.js /.gitignore: -------------------------------------------------------------------------------- 1 | lib-cov 2 | *.seed 3 | *.log 4 | *.csv 5 | *.dat 6 | *.out 7 | *.pid 8 | *.gz 9 | 10 | pids 11 | logs 12 | results 13 | 14 | npm-debug.log 15 | node_modules 16 | -------------------------------------------------------------------------------- /.jshintignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | test/ -------------------------------------------------------------------------------- /.jshintrc: -------------------------------------------------------------------------------- 1 | { 2 | "shadow": "inner", 3 | "indent": 1, 4 | 5 | "camelcase": false, 6 | "eqeqeq": true, 7 | "eqnull": true, 8 | "freeze": true, 9 | "funcscope": true, 10 | "newcap": true, 11 | "noarg": true, 12 | "noempty": true, 13 | "nonbsp": true, 14 | "unused": "vars", 15 | "undef": true, 16 | "scripturl": true, 17 | "strict": false, 18 | "loopfunc": true, 19 | "quotmark": "single", 20 | 21 | "esnext": true, 22 | "globals": {"define": true, "jade":true}, 23 | "browser": true, 24 | "devel": true, 25 | "mocha": true, 26 | "node": true, 27 | "jquery": true 28 | } 29 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 1.4.1 4 | - Fixed bug when adding referrer 5 | 6 | ## 1.4.0 7 | - Implemented `addReferrer` option 8 | - Implemented `keepAlive` option 9 | - Implemented `xhr` option 10 | 11 | ## 1.3.0 12 | - Fixed document.$ (cheerio instance) being recreated on every call 13 | - Implemented git hooks via ghooks module 14 | - Added jshint script, along with its rc config and ignore files 15 | - Added a pre-commit hook to lint the code 16 | - Added tests powered by mocha+chai 17 | - Added a pre-push hook to test the code 18 | 19 | ## 1.2.2 20 | - Fixed request() errors hanging the flow, thanks @arve0 21 | - Fixed require('./lib/Document'), it's now lower case, thanks @arve0 22 | 23 | ## 1.2.1 24 | - `logs` cannot be `true` anymore, just specify a stream, or don't 25 | - All options are passed to `request`, so `headers` and `encoding` work implicitly now 26 | - Updated the docs 27 | 28 | ## 1.2.0 29 | - Added `catchErrors` option; if true, handlers will be try-catch'd and errors sent to the `error` callback 30 | - `logs` option can be any stream and the logs are written to it (stdout, stderr, file, etc) 31 | 32 | ## 1.1.0 33 | - Added `delay` option to wait after each request 34 | - Added `encoding` option to set the requests' encoding 35 | 36 | ## 1.0.0 37 | - Updated dependency versions 38 | - Duplicated URLs are ignored unless `allowDuplicates` is `true` 39 | - Reversed `error()` callback's arguments 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2015, Ariel Flesler 2 | All rights reserved.
3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of the {organization} nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | node-spider 2 | ======= 3 | 4 | Generic web crawler powered by Node.js 5 | 6 | # Installation 7 | Using npm: 8 | 9 | $ npm install node-spider 10 | 11 | # Usage 12 | ```js 13 | var Spider = require('node-spider'); 14 | 15 | var spider = new Spider({ 16 | // How many requests can be run in parallel 17 | concurrent: 5, 18 | // How long to wait after each request 19 | delay: 0, 20 | // An optional stream that internal logs are written to 21 | logs: process.stderr, 22 | // Whether to re-visit already visited URLs, false by default 23 | allowDuplicates: false, 24 | // If `true` all queued handlers will be try-catch'd, errors go to the `error` callback 25 | catchErrors: true, 26 | // If `true` the spider will set the Referer header automatically on subsequent requests 27 | addReferrer: false, 28 | // If `true` adds the X-Requested-With:XMLHttpRequest header 29 | xhr: false, 30 | // If `true` adds the Connection:keep-alive header and the `forever` option of the request module 31 | keepAlive: false, 32 | // Called when there's an error; if none is provided, the error is thrown 33 | error: function(err, url) { 34 | }, 35 | // Called when there are no more requests 36 | done: function() { 37 | }, 38 | 39 | //- All options are passed to the `request` module, for example: 40 | headers: { 'user-agent': 'node-spider' }, 41 | encoding: 'utf8' 42 | }); 43 | 44 | var handleRequest = function(doc) { 45 | // new page crawled 46 | console.log(doc.res); // response object 47 | console.log(doc.url); // page url 48 | // uses cheerio, check its docs for more info 49 | doc.$('a').each(function(i, elem) { 50 | // do stuff with element 51 | var href = doc.$(elem).attr('href').split('#')[0]; 52 | var url = doc.resolve(href); 53 | // crawl more 54 | spider.queue(url, handleRequest); 55 | }); 56 | }; 57 | 58 | // start crawling 59 | spider.queue('http://google.com/', 
handleRequest); 60 | ``` 61 | # License 62 | 63 | Copyright (c) 2014, Ariel Flesler 64 | All rights reserved. 65 | 66 | Redistribution and use in source and binary forms, with or without modification, 67 | are permitted provided that the following conditions are met: 68 | 69 | * Redistributions of source code must retain the above copyright notice, this 70 | list of conditions and the following disclaimer. 71 | 72 | * Redistributions in binary form must reproduce the above copyright notice, this 73 | list of conditions and the following disclaimer in the documentation and/or 74 | other materials provided with the distribution. 75 | 76 | * Neither the name of the {organization} nor the names of its 77 | contributors may be used to endorse or promote products derived from 78 | this software without specific prior written permission. 79 | 80 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 81 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 82 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 83 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 84 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 85 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 86 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 87 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 88 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 89 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 90 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var request = require('request'), 2 | Doc = require('./lib/document'); 3 | 4 | function Spider(opts) { 5 | opts = this.opts = opts || {}; 6 | opts.concurrent = opts.concurrent || 1; 7 | opts.headers = opts.headers || {}; 8 | 9 | if (opts.xhr) { 10 | opts.headers['X-Requested-With'] = 'XMLHttpRequest'; 11 | } 12 | if (opts.keepAlive) { 13 | opts.headers.Connection = 'keep-alive'; 14 | opts.forever = true; 15 | } 16 | 17 | this.pending = []; 18 | this.active = []; 19 | this.visited = {}; 20 | } 21 | 22 | Spider.prototype = { 23 | constructor: Spider, 24 | 25 | log: function(status, url) { 26 | if (this.opts.logs) { 27 | this.opts.logs.write('Spider: ' + status + ' ' + url + '\n'); 28 | } 29 | }, 30 | 31 | full: function() { 32 | return this.active.length >= this.opts.concurrent; 33 | }, 34 | 35 | queue: function(url, done) { 36 | if (this.visited[url]) return; 37 | 38 | if (!this.opts.allowDuplicates) { 39 | this.visited[url] = true; 40 | } 41 | 42 | if (this.full()) { 43 | this.log('Queueing', url); 44 | this.pending.push([url, done]); 45 | } else { 46 | this.load(url, done); 47 | } 48 | }, 49 | 50 | load: function(url, done, referrer) { 51 | this.log('Loading', url); 52 | this.active.push(url); 53 | 54 | if (this.opts.addReferrer) { 55 | this.opts.headers.Referer = referrer; 56 | } 57 | 58 | this.opts.url = url; 59 | this._request(this.opts, function(err, res, _) { 60 | if (err) { 61 | this.error(err, url); 62 | return this.finished(url); 63 | } 64 | 65 | var doc = new Doc(url, res); 66 | this.log('Success', url); 67 | if (this.opts.catchErrors) { 68 | try { done.call(this, doc); } 69 | catch (err) { this.error(err, url); } 70 | } else { 71 | done.call(this, doc); 72 | } 73 | 
this.finished(url); 74 | }.bind(this)); 75 | }, 76 | 77 | // Wrap it for easier mocking 78 | _request: function(opts, done) { 79 | // All options forwarded to request() 80 | request(opts, done); 81 | }, 82 | 83 | error: function(err, url) { 84 | this.log('Error', url); 85 | if (!this.opts.error) throw err; 86 | this.opts.error(err, url); 87 | }, 88 | 89 | dequeue: function(referrer) { 90 | var args = this.pending.shift(); 91 | if (args) { 92 | this.load.apply(this, args.concat(referrer)); 93 | } else if (this.opts.done && this.active.length === 0) { 94 | this.opts.done.call(this); 95 | } 96 | }, 97 | 98 | finished: function(url) { 99 | var i = this.active.indexOf(url); 100 | if (i === -1) { 101 | return this.log('URL was not active', url); 102 | } 103 | this.active.splice(i, 1); 104 | 105 | if (!this.full()) { 106 | if (this.opts.delay) { 107 | setTimeout(this.dequeue.bind(this, url), this.opts.delay); 108 | } else { 109 | this.dequeue(url); 110 | } 111 | } 112 | } 113 | }; 114 | 115 | module.exports = Spider; 116 | -------------------------------------------------------------------------------- /lib/document.js: -------------------------------------------------------------------------------- 1 | var cheerio = require('cheerio'), 2 | url = require('url'); 3 | 4 | function Document(url, res) { 5 | this.url = url; 6 | this.res = res; 7 | } 8 | 9 | Document.prototype = { 10 | constructor: Document, 11 | 12 | // Lazy parse 13 | get $() { 14 | return this._$ || (this._$ = cheerio.load(this.res.body)); 15 | }, 16 | 17 | resolve: function(uri) { 18 | return url.resolve(this.url, uri); 19 | } 20 | }; 21 | 22 | module.exports = Document; -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-spider", 3 | "author": "Ariel Flesler ", 4 | "version": "1.4.1", 5 | "description": "Generic web crawler powered by Node.js", 6 | "keywords": [ 7 | "spider", 8 | "crawler", 9 | "node", 10 | "nodejs", 11 | "web", 12 | "scrap", 13 | "crawl" 14 | ], 15 | "main": "./index.js", 16 | "license": "BSD-2-Clause", 17 | "bugs": "https://github.com/flesler/node-spider/issues", 18 | "repository": "git://github.com/flesler/node-spider.git", 19 | "dependencies": { 20 | "cheerio": "0.19.x", 21 | "request": "2.61.x" 22 | }, 23 | "devDependencies": { 24 | "chai": "^3.4.0", 25 | "ghooks": "^0.3.2", 26 | "jshint": "^2.8.0", 27 | "mocha": "^2.3.3" 28 | }, 29 | "scripts": { 30 | "test": "./node_modules/.bin/mocha -b test/*.js", 31 | "lint": "./node_modules/.bin/jshint index.js lib/document.js" 32 | }, 33 | "config": { 34 | "ghooks": { 35 | "pre-commit": "npm run lint", 36 | "pre-push": "npm test" 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /test/document.js: -------------------------------------------------------------------------------- 1 | /*jshint -W030 */ 2 | var expect = require('chai').expect, 3 | Doc = require('../lib/document'); 4 | 5 | describe('Document', function() { 6 | 7 | const URL = 'https://www.google.com/somepage/index.html'; 8 | const HTML = 'link'; 9 | 10 | var doc; 11 | beforeEach(function() { 12 | doc = new Doc(URL, {body:HTML}); 13 | }); 14 | 15 | describe('constructor()', function() { 16 | it('should map the first argument to the url attribute', function() { 17 | expect(doc.url).to.equal(URL); 18 | }); 19 | 20 | it('should map the second argument to the res attribute', function() { 21 | 
expect(doc.res.body).to.equal(HTML); 22 | }); 23 | }); 24 | 25 | describe('resolve()', function() { 26 | it('should resolve relative paths', function() { 27 | expect(doc.resolve('otherpage.html')).to.equal('https://www.google.com/somepage/otherpage.html'); 28 | expect(doc.resolve('./otherpage.html')).to.equal('https://www.google.com/somepage/otherpage.html'); 29 | expect(doc.resolve('../otherpage.html')).to.equal('https://www.google.com/otherpage.html'); 30 | }); 31 | 32 | it('should resolve root paths', function() { 33 | expect(doc.resolve('/otherpage.html')).to.equal('https://www.google.com/otherpage.html'); 34 | }); 35 | 36 | it('should resolve querystrings', function() { 37 | expect(doc.resolve('?key=value')).to.equal('https://www.google.com/somepage/index.html?key=value'); 38 | }); 39 | 40 | it('should resolve urls without protocol', function() { 41 | expect(doc.resolve('//yahoo.com/page.html')).to.equal('https://yahoo.com/page.html'); 42 | }); 43 | 44 | it('should resolve absolute urls', function() { 45 | expect(doc.resolve('http://yahoo.com/page.html')).to.equal('http://yahoo.com/page.html'); 46 | }); 47 | }); 48 | 49 | describe('get $()', function() { 50 | it('should not parse the body until $ is accessed', function() { 51 | expect(doc._$).to.be.undefined; 52 | }); 53 | 54 | it('should cache the cheerio instance internally', function() { 55 | doc.$; 56 | expect(doc._$).to.be.a('function'); 57 | }); 58 | 59 | it('should not recreate the cheerio instance on every call', function() { 60 | expect(doc.$).to.equal(doc.$); 61 | }); 62 | 63 | it('should parse the body correctly', function() { 64 | expect(doc.$('body').length).to.equal(1); 65 | }); 66 | }); 67 | }); 68 | -------------------------------------------------------------------------------- /test/mocha.opts: -------------------------------------------------------------------------------- 1 | --reporter spec 2 | --bail 3 | --sort 4 | --recursive 5 | --inline-diffs -------------------------------------------------------------------------------- /test/spider.js: -------------------------------------------------------------------------------- 1 | /*jshint -W030 */ 2 | var expect = require('chai').expect, 3 | Spider = require('../'); 4 | 5 | describe('Spider', function() { 6 | 7 | function create(opts) { 8 | return new Spider(opts); 9 | } 10 | 11 | // Empty _request() by default 12 | Spider.prototype._request = function() {}; 13 | 14 | function mock(spider, fn) { 15 | // Override once 16 | spider._request = function(opts, done) { 17 | spider._request = function(){}; 18 | fn(opts, done); 19 | }; 20 | } 21 | 22 | describe('constructor()', function() { 23 | it('should have the default options', function() { 24 | var opts = create().opts; 25 | expect(opts.concurrent).to.equal(1); 26 | expect(opts.headers).to.deep.equal({}); 27 | }); 28 | 29 | it('should have the correct initial state', function() { 30 | var spider = create(); 31 | expect(spider.pending).to.deep.equal([]); 32 | expect(spider.active).to.deep.equal([]); 33 | expect(spider.visited).to.deep.equal({}); 34 | }); 35 | 36 | describe('options.concurrent', function() { 37 | it('should not run more concurrent requests than allowed', function() { 38 | var spider = create({concurrent:2}); 39 | expect(spider.full()).to.be.false; 40 | spider.queue('a'); 41 | expect(spider.active.length).to.equal(1); 42 | expect(spider.full()).to.be.false; 43 | spider.queue('b'); 44 | expect(spider.active.length).to.equal(2); 45 | expect(spider.full()).to.be.true; 46 | spider.queue('c'); 47 | 
expect(spider.active.length).to.equal(2); 48 | expect(spider.pending.length).to.equal(1); 49 | expect(spider.full()).to.be.true; 50 | }); 51 | }); 52 | 53 | describe('options.allowDuplicates', function() { 54 | it('should not queue visited urls if false', function() { 55 | var spider = create({allowDuplicates:false}); 56 | spider.queue('a'); 57 | spider.queue('a'); 58 | expect(spider.active.length).to.equal(1); 59 | expect(spider.pending.length).to.equal(0); 60 | expect(spider.visited.a).to.be.true; 61 | }); 62 | 63 | it('should queue visited urls if true', function() { 64 | var spider = create({allowDuplicates:true}); 65 | spider.queue('a'); 66 | spider.queue('a'); 67 | expect(spider.active.length).to.equal(1); 68 | expect(spider.pending.length).to.equal(1); 69 | expect(spider.visited.a).to.be.undefined; 70 | }); 71 | }); 72 | }); 73 | 74 | // TO BE CONTINUED... 75 | }); --------------------------------------------------------------------------------
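A possible extension of the README usage above: a sketch (not part of the package) that keeps the crawl on a single host by comparing every resolved link against the start URL's host. It only uses the documented API (`Spider`, `queue()`, `doc.resolve()` and `doc.$`); the start URL, the `a[href]` selector and the host check are illustrative assumptions.

```js
var Spider = require('node-spider');
var url = require('url');

var START = 'http://example.com/'; // hypothetical start page
var host = url.parse(START).host;

var spider = new Spider({
	concurrent: 2,
	delay: 500, // wait 500ms after each request
	logs: process.stderr,
	// log failures instead of letting the spider throw
	error: function(err, failedUrl) {
		console.error('Failed', failedUrl, err.message);
	},
	done: function() {
		console.log('Crawl finished');
	}
});

function handle(doc) {
	// only follow <a> elements that actually have an href
	doc.$('a[href]').each(function(i, elem) {
		var href = doc.$(elem).attr('href').split('#')[0];
		if (!href) return;
		var next = doc.resolve(href);
		// stay on the same host as the start URL
		if (url.parse(next).host === host) {
			spider.queue(next, handle);
		}
	});
}

spider.queue(START, handle);
```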
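test/spider.js above ends with a `// TO BE CONTINUED...` marker. A minimal sketch of how the request flow could be exercised next, reusing the `create()` and `mock()` helpers already defined in that file; the fake response body and the exact assertions are assumptions for illustration, not existing tests.

```js
describe('load()', function() {
	it('should pass a Document for the response to the handler', function() {
		var spider = create();
		mock(spider, function(opts, done) {
			expect(opts.url).to.equal('a');
			// simulate a successful response
			done(null, {body: '<a href="b">b</a>'});
		});
		var called = false;
		spider.queue('a', function(doc) {
			called = true;
			expect(doc.url).to.equal('a');
			expect(doc.$('a').attr('href')).to.equal('b');
		});
		expect(called).to.be.true;
	});

	it('should send request errors to the error callback', function() {
		var failure;
		var spider = create({error: function(err, url) { failure = [err, url]; }});
		mock(spider, function(opts, done) {
			// simulate a failed request
			done(new Error('boom'));
		});
		spider.queue('a');
		expect(failure[0].message).to.equal('boom');
		expect(failure[1]).to.equal('a');
	});
});
```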