├── .gitignore ├── lib ├── cli.js ├── index.js ├── cache.js ├── quickcrawl.js ├── cache-backend-fs.js ├── queue.js ├── cookies.js └── crawler.js ├── .travis.yml ├── .jshintrc ├── .editorconfig ├── example ├── quickcrawl-example.js ├── testcrawler.js └── savetodisk.js ├── test ├── init.js ├── jshint.js ├── lib │ ├── testserver.js │ └── routes.js ├── depth.js ├── testcrawl.js ├── reliability.js ├── discovery.js ├── resourcevalidity.js └── cookies.js ├── package.json └── README.markdown /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules -------------------------------------------------------------------------------- /lib/cli.js: -------------------------------------------------------------------------------- 1 | // CLI module for crawling. 2 | // Not yet built. 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.11" 4 | - "0.10" -------------------------------------------------------------------------------- /.jshintrc: -------------------------------------------------------------------------------- 1 | { 2 | "asi": false, 3 | "node": true, 4 | "require": true, 5 | "process": true, 6 | "module": true, 7 | "setInterval": true, 8 | "setTimeout": true, 9 | "clearTimeout": true, 10 | "Buffer": true 11 | } 12 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: http://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = false 5 | 6 | # Unix-style newlines with a newline ending every file 7 | [*] 8 | end_of_line = lf 9 | insert_final_newline = true 10 | 11 | # Tab indentation 12 | [*.js] 13 | indent_style = tab 14 | indent_size = 4 -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | // SimpleCrawler 2 | // Export interfaces 3 | 4 | module.exports = require("./crawler.js"); 5 | 6 | // Aliasing for compatibility with legacy code. 7 | module.exports.Crawler = module.exports; 8 | 9 | module.exports.queue = require("./queue.js"); 10 | module.exports.cache = require("./cache.js"); 11 | 12 | // Convenience function for small, fast crawls 13 | module.exports.crawl = require("./quickcrawl.js"); 14 | -------------------------------------------------------------------------------- /example/quickcrawl-example.js: -------------------------------------------------------------------------------- 1 | // Example demonstrating the simple (but less flexible) way of initiating 2 | // a crawler. 3 | 4 | var Crawler = require("../lib"); 5 | 6 | Crawler.crawl("http://deewr.gov.au/") 7 | .on("fetchstart",function(queueItem){ 8 | console.log("Starting request for:",queueItem.url); 9 | }) 10 | .on("fetchcomplete",function(queueItem){ 11 | console.log("Completed fetching resource:",queueItem.url); 12 | }); 13 | -------------------------------------------------------------------------------- /example/testcrawler.js: -------------------------------------------------------------------------------- 1 | var Crawler = require("../"), 2 | crawler = new Crawler("127.0.0.1","/",3000); 3 | 4 | crawler.on("crawlstart",function() { 5 | console.log("Crawl starting"); 6 | }); 7 | 8 | crawler.on("fetchstart",function(queueItem) { 9 | console.log("fetchStart",queueItem); 10 | }); 11 | 12 | crawler.on("fetchcomplete",function(queueItem) { 13 | console.log("fetchcomplete",queueItem); 14 | }); 15 | 16 | crawler.on("complete",function() { 17 | console.log("Finished!"); 18 | }); 19 | 20 | crawler.start(); -------------------------------------------------------------------------------- /test/init.js: -------------------------------------------------------------------------------- 1 | // Ensures that the crawler object is requireable, and doesn't die 2 | // horribly right off the bat 3 | 4 | var chai = require("chai"); 5 | chai.should(); 6 | 7 | describe("Crawler object",function() { 8 | 9 | it("should be able to be required",function() { 10 | var Crawler = require("../"); 11 | 12 | Crawler.should.be.a("function"); 13 | Crawler.Crawler.should.be.a("function"); 14 | }); 15 | 16 | it("should import the queue",function() { 17 | var Crawler = require("../"); 18 | 19 | Crawler.queue.should.be.a("function"); 20 | }); 21 | 22 | it("should import the cache system",function() { 23 | var Crawler = require("../"); 24 | 25 | Crawler.cache.should.be.a("function"); 26 | }); 27 | 28 | it("should be able to be initialised",function() { 29 | var Crawler = require("../"), 30 | myCrawler = new Crawler("127.0.0.1","/",3000); 31 | 32 | myCrawler.should.be.an.instanceof(Crawler); 33 | }); 34 | 35 | }) 36 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "simplecrawler", 3 | "description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.", 4 | "version": "0.3.11", 5 | "homepage": "http://github.com/cgiffard/node-simplecrawler", 6 | "author": "Christopher Giffard ", 7 | "keywords": [ 8 | "simple", 9 | "crawler", 10 | "spider", 11 | "cache", 12 | "queue", 13 | "simplecrawler", 14 | "eventemitter" 15 | ], 16 | "scripts": { 17 | "test": "mocha -R spec -t 5000" 18 | }, 19 | "bin": { 20 | "crawl": "./lib/cli.js" 21 | }, 22 | "repository": { 23 | "type": "git", 24 | "url": "http://github.com/cgiffard/node-simplecrawler.git" 25 | }, 26 | "bugs": { 27 | "url": "https://github.com/cgiffard/node-simplecrawler/issues" 28 | }, 29 | "main": "./lib/index.js", 30 | "engines": { 31 | "node": ">=0.8.0" 32 | }, 33 | "devDependencies": { 34 | "mocha": "~1.8.2", 35 | "jshint": "~0.7.x", 36 | "chai": "~1.2.0" 37 | }, 38 | "dependencies": { 39 | "URIjs": "~1.10.2" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /test/jshint.js: -------------------------------------------------------------------------------- 1 | // Tests to ensure crawler code is well formed 2 | 3 | var chai = require("chai"); 4 | chai.should(); 5 | 6 | describe("Core code",function() { 7 | var JSHINT = require("jshint").JSHINT, 8 | fs = require("fs"); 9 | 10 | function readCode(file) { 11 | file = __dirname + "/../lib/" + file + ".js"; 12 | return fs.readFileSync(file).toString("utf8"); 13 | } 14 | 15 | [ "cache-backend-fs", 16 | "cache", 17 | "cli", 18 | "cookies", 19 | "crawler", 20 | "index", 21 | "queue", 22 | "quickcrawl" ].forEach(function(item) { 23 | 24 | var code = readCode(item); 25 | 26 | it("module `" + item + "` should pass JSHint with no errors",function() { 27 | 28 | var slowThresholdMilliseconds = 200; 29 | this.slow(slowThresholdMilliseconds); 30 | 31 | JSHINT(code,{ 32 | "indent": 4, 33 | "undef": true 34 | }, 35 | { 36 | // Don't want no errant logging statements going to production! 37 | // `console` has been deliberately omitted from this whitelist. 38 | 39 | // All the regular node stuff 40 | "require": true, 41 | "module": true, 42 | "process": true, 43 | "setInterval": true, 44 | "clearInterval": true, 45 | "setTimeout": true, 46 | "clearTimeout": true, 47 | "Buffer": true 48 | }); 49 | 50 | if (JSHINT.errors.length) { 51 | throw new Error( 52 | "Line " + 53 | JSHINT.errors[0].line + ": " + 54 | JSHINT.errors[0].reason); 55 | } 56 | }); 57 | 58 | }); 59 | }); 60 | -------------------------------------------------------------------------------- /test/lib/testserver.js: -------------------------------------------------------------------------------- 1 | // Server for testing HTTP crawls! 2 | // Ultra simple - only for running with mocha tests. 3 | 4 | // Include HTTP 5 | var http = require("http"); 6 | 7 | // Create server for crawling 8 | var httpServer = http.createServer(); 9 | 10 | var testRoutes = require("./routes"); 11 | 12 | // Listen to events 13 | httpServer.on("request",function(req,res) { 14 | 15 | function write(status,data,contentType) { 16 | res.writeHead( 17 | status, 18 | http.STATUS_CODES[status], 19 | { 20 | "Content-Type": contentType || "text/html", 21 | "Content-Length": Buffer.byteLength(data), 22 | }); 23 | 24 | res.write(data); 25 | res.end(); 26 | } 27 | 28 | function redir(to) { 29 | var data = "Redirecting you to " + to; 30 | 31 | res.writeHead( 32 | 301, 33 | http.STATUS_CODES[301], 34 | { 35 | "Content-Type": "text/plain", 36 | "Content-Length": Buffer.byteLength(data), 37 | "Location": to 38 | }); 39 | 40 | res.write(data); 41 | res.end(); 42 | } 43 | 44 | if (testRoutes[req.url] && 45 | testRoutes[req.url] instanceof Function) { 46 | 47 | // Pass in a function that takes a status and some data to write back 48 | // out to the client 49 | testRoutes[req.url](write,redir); 50 | 51 | } else { 52 | 53 | // Otherwise, a 404 54 | res.writeHead(404,"Page Not Found"); 55 | res.write("Page not found."); 56 | res.end(); 57 | } 58 | }); 59 | 60 | httpServer.listen(3000); 61 | 62 | module.exports = httpServer; 63 | module.exports.routes = testRoutes; 64 | -------------------------------------------------------------------------------- /lib/cache.js: -------------------------------------------------------------------------------- 1 | // Simplecrawler - cache module 2 | // Christopher Giffard, 2011 3 | // 4 | // http://www.github.com/cgiffard/node-simplecrawler 5 | 6 | var fs = require("fs"); 7 | var EventEmitter = require('events').EventEmitter; 8 | var FilesystemBackend = require("./cache-backend-fs.js"); 9 | // var RedisBackend = require("cache-backend-redis.js"); 10 | // var MongoBackend = require("cache-backend-mongo.js"); 11 | 12 | // Init cache wrapper for backend... 13 | var Cache = function Cache(cacheLoadParameter,cacheBackend) { 14 | 15 | // Ensure parameters are how we want them... 16 | cacheBackend = typeof cacheBackend === "object" ? cacheBackend : FilesystemBackend; 17 | cacheLoadParameter = cacheLoadParameter instanceof Array ? cacheLoadParameter : [cacheLoadParameter]; 18 | 19 | // Now we can just run the factory. 20 | this.datastore = cacheBackend.apply(cacheBackend,cacheLoadParameter); 21 | 22 | // Instruct the backend to load up. 23 | this.datastore.load(); 24 | }; 25 | 26 | Cache.prototype = new EventEmitter(); 27 | 28 | // Set up data import and export functions 29 | Cache.prototype.setCacheData = function(queueObject,data,callback) { 30 | this.datastore.setItem(queueObject,data,callback); 31 | this.emit("setcache",queueObject,data); 32 | }; 33 | 34 | Cache.prototype.getCacheData = function(queueObject,callback) { 35 | this.datastore.getItem(queueObject,callback); 36 | }; 37 | 38 | Cache.prototype.saveCache = function() { 39 | this.datastore.saveCache(); 40 | }; 41 | 42 | module.exports = Cache; 43 | module.exports.Cache = Cache; 44 | module.exports.FilesystemBackend = FilesystemBackend; 45 | -------------------------------------------------------------------------------- /test/depth.js: -------------------------------------------------------------------------------- 1 | // Runs a very simple crawl on an HTTP server with different depth 2 | 3 | var chai = require("chai"); 4 | chai.should(); 5 | 6 | var testserver = require("./lib/testserver.js"); 7 | 8 | var Crawler = require("../"); 9 | 10 | // Test the number of links discovered for the given "depth" and compare it to "linksToDiscover" 11 | var depthTest = function(depth, linksToDiscover) { 12 | depth = parseInt(depth); // Force depth to be a number 13 | 14 | var crawler; 15 | var linksDiscovered; 16 | 17 | describe("depth "+ depth, function() { 18 | before(function() { 19 | // Create a new crawler to crawl our local test server 20 | crawler = new Crawler("127.0.0.1","/depth/1",3000); 21 | 22 | // Speed up tests. No point waiting for every request when we're running 23 | // our own server. 24 | crawler.interval = 1; 25 | 26 | // Define max depth for this crawl 27 | crawler.maxDepth = depth; 28 | 29 | linksDiscovered = 0; 30 | 31 | crawler.on("fetchcomplete",function(queueItem) { 32 | linksDiscovered++; 33 | }); 34 | 35 | crawler.start(); 36 | }); 37 | 38 | after(function() { 39 | // Clean listeners and crawler 40 | crawler.removeAllListeners("discoverycomplete"); 41 | crawler.removeAllListeners("complete"); 42 | crawler = null; 43 | }); 44 | 45 | it("should discover "+ linksToDiscover +" linked resources",function(done) { 46 | crawler.on("complete",function() { 47 | linksDiscovered.should.equal(linksToDiscover); 48 | done(); 49 | }); 50 | }); 51 | }); 52 | }; 53 | 54 | describe("Crawler max depth",function() { 55 | 56 | // depth: linksToDiscover 57 | var linksToDiscover = { 58 | 0: 11, // links for depth 0 59 | 1: 6, // links for depth 1 60 | 2: 7, // links for depth 2 61 | 3: 11 // links for depth 3 62 | }; 63 | 64 | for(var depth in linksToDiscover) { 65 | depthTest(depth, linksToDiscover[depth]); 66 | } 67 | 68 | }); 69 | -------------------------------------------------------------------------------- /example/savetodisk.js: -------------------------------------------------------------------------------- 1 | // Example use of simplecrawler, courtesy of @breck7! Thanks mate. :) 2 | 3 | var fs = require('node-fs'), 4 | url = require('url'), 5 | wrench = require('wrench'), 6 | Crawler = require("simplecrawler").Crawler 7 | 8 | /** 9 | * @param String. Domain to download. 10 | * @Param Function. Callback when crawl is complete. 11 | */ 12 | var downloadSite = function (domain, callback) { 13 | 14 | // Where to save downloaded data 15 | var outputDirectory = __dirname + '/' + domain 16 | var myCrawler = new Crawler(domain) 17 | myCrawler.interval = 250 18 | myCrawler.maxConcurrency = 5 19 | 20 | myCrawler.on("fetchcomplete",function(queueItem, responseBuffer, response) { 21 | 22 | // Parse url 23 | var parsed = url.parse(queueItem.url) 24 | 25 | // Rename / to index.html 26 | if (parsed.pathname === '/') 27 | parsed.pathname = '/index.html' 28 | 29 | // Get directory name in order to create any nested dirs 30 | var dirname = outputDirectory + parsed.pathname.replace(/\/[^\/]+$/, '') 31 | 32 | // Path to save file 33 | var filepath = outputDirectory + parsed.pathname 34 | 35 | // Check if DIR exists 36 | fs.exists(dirname, function (exists) { 37 | 38 | // If DIR exists, write file 39 | if (exists) 40 | fs.writeFile(filepath, responseBuffer, function () {}) 41 | 42 | // Else, recursively create dir using node-fs, then write file 43 | else 44 | fs.mkdir(dirname, 0755, true, function (err) { 45 | fs.writeFile(filepath, responseBuffer, function () {}) 46 | }) 47 | 48 | }) 49 | 50 | console.log("I just received %s (%d bytes)",queueItem.url,responseBuffer.length) 51 | console.log("It was a resource of type %s",response.headers['content-type']) 52 | 53 | }) 54 | 55 | // Fire callback 56 | myCrawler.on('complete', function () { 57 | callback() 58 | }) 59 | 60 | // Start Crawl 61 | myCrawler.start() 62 | 63 | } 64 | 65 | if (process.argv.length < 3) { 66 | console.log('Usage: node downloadSiteExample.js mysite.com') 67 | process.exit(1) 68 | } 69 | downloadSite(process.argv[2], function () { 70 | console.log('Done!') 71 | }) 72 | 73 | 74 | -------------------------------------------------------------------------------- /lib/quickcrawl.js: -------------------------------------------------------------------------------- 1 | var Crawler = require("./crawler.js"), 2 | URI = require("URIjs"); 3 | 4 | 5 | /* 6 | Public: Convenience function for really quick, simple crawls. It generates 7 | a new crawler, parses the URL provided, and sets up the new crawler with 8 | the host and path information extracted from the URL. It returns the crawler 9 | object, so you can set up event handlers, and waits until `process.nextTick` 10 | before kicking off the crawl. 11 | 12 | url - URL to begin crawl from. 13 | successCallback - Optional function called once an item is completely 14 | downloaded. Functionally identical to a fetchcomplete 15 | event listener. 16 | failCallback - Optional function to be called if an item fails to 17 | download. Functionally identical to a fetcherror 18 | event listener. 19 | 20 | Examples 21 | 22 | Crawler.crawl( 23 | "http://example.com:3000/start", 24 | function(queueItem,data) { 25 | console.log("I got a new item!"); 26 | } 27 | ); 28 | 29 | Crawler 30 | .crawl("http://www.example.com/") 31 | .on("fetchstart",function(queueItem) { 32 | console.log("Beginning fetch for",queueItem.url); 33 | }); 34 | 35 | Returns the crawler object which has now been constructed. 36 | 37 | */ 38 | module.exports = function crawl(url,successCallback,failCallback) { 39 | 40 | // Parse the URL first 41 | url = URI(url); 42 | 43 | // If either the protocol, path, or hostname are unset, we can't really 44 | // do much. Die with error. 45 | if (!url.protocol()) 46 | throw new Error("Can't crawl with unspecified protocol."); 47 | 48 | if (!url.hostname()) 49 | throw new Error("Can't crawl with unspecified hostname."); 50 | 51 | if (!url.path()) 52 | throw new Error("Can't crawl with unspecified path."); 53 | 54 | var tmpCrawler = 55 | new Crawler( 56 | url.hostname(), 57 | url.path(), 58 | url.port() || 80); 59 | 60 | // Attach callbacks if they were provided 61 | if (successCallback) tmpCrawler.on("fetchcomplete",successCallback); 62 | if (failCallback) tmpCrawler.on("fetcherror",failCallback); 63 | 64 | // Start the crawler on the next runloop 65 | // This enables initial configuration options and event handlers to take 66 | // effect before the first resource is queued. 67 | process.nextTick(function() { 68 | tmpCrawler.start(); 69 | }); 70 | 71 | // Return crawler 72 | return tmpCrawler; 73 | }; 74 | -------------------------------------------------------------------------------- /test/lib/routes.js: -------------------------------------------------------------------------------- 1 | // Routes for testing server 2 | 3 | 4 | module.exports = { 5 | "/": function(write) { 6 | write(200,"Home. stage2"); 7 | }, 8 | 9 | "/stage2": function(write) { 10 | write(200,"Stage2. http://127.0.0.1:3000/stage/3"); 11 | }, 12 | 13 | "/stage/3": function(write) { 14 | write(200,"Stage3. stage4"); 15 | }, 16 | 17 | "/stage/4": function(write) { 18 | write(200,"Stage4. stage5"); 19 | }, 20 | 21 | "/stage5": function(write,redir) { 22 | redir("/stage6"); 23 | }, 24 | 25 | "/stage6": function(write) { 26 | write(200,"Crawl complete!"); 27 | }, 28 | 29 | "/async-stage1": function(write) { 30 | write(200,"http://127.0.0.1:3000/async-stage2"); 31 | }, 32 | 33 | "/async-stage2": function(write) { 34 | write(200,"http://127.0.0.1:3000/async-stage3"); 35 | }, 36 | 37 | "/async-stage3": function(write) { 38 | write(200,"Complete!"); 39 | }, 40 | 41 | "/timeout": function(write) { 42 | // We want to trigger a timeout. Never respond. 43 | }, 44 | 45 | // Routes for depth tests 46 | "/depth/1": function(write) { 47 | write(200," Home. depth2"); 48 | }, 49 | 50 | "/depth/2": function(write) { 51 | write(200,"Depth 2. http://127.0.0.1:3000/depth/3"); 52 | }, 53 | 54 | "/depth/3": function(write) { 55 | write(200,"Depth 3. "); 56 | }, 57 | 58 | "/css": function(write) { 59 | write(200,"/* CSS 1 */ @import url('/css/2'); @font-face { url(/font/1) format('woff'); }", "text/css"); 60 | }, 61 | 62 | "/css/2": function(write) { 63 | write(200,"/* CSS 2 */ @import url('/css/3'); .img1 { background-image:url('/img/1'); }", "text/css"); 64 | }, 65 | 66 | "/css/3": function(write) { 67 | write(200,"/* CSS 3 */", "text/css"); 68 | }, 69 | 70 | "/css/4": function(write) { 71 | write(200,"/* CSS 4 */ .img1 { background-image:url('/img/2'); } @font-face { url(/font/2) format('woff'); }", "text/css"); 72 | }, 73 | 74 | "/img/1": function(write) { 75 | write(200,"", "image/png"); 76 | }, 77 | 78 | "/img/2": function(write) { 79 | write(200,"", "image/png"); 80 | }, 81 | 82 | "/font/1": function(write) { 83 | write(200,"", "font/woff"); 84 | }, 85 | 86 | "/font/2": function(write) { 87 | write(200,"", "application/font-woff"); 88 | } 89 | }; 90 | -------------------------------------------------------------------------------- /test/testcrawl.js: -------------------------------------------------------------------------------- 1 | // Runs a very simple crawl on an HTTP server 2 | // This is more of an integration test than a unit test. 3 | 4 | var chai = require("chai"); 5 | chai.should(); 6 | 7 | var testserver = require("./lib/testserver.js"); 8 | 9 | describe("Test Crawl",function() { 10 | 11 | var Crawler = require("../"); 12 | 13 | // Create a new crawler to crawl this server 14 | var localCrawler = new Crawler("127.0.0.1","/",3000), 15 | asyncCrawler = new Crawler("127.0.0.1","/",3000); 16 | 17 | // Speed up tests. No point waiting for every request when we're running 18 | // our own server. 19 | localCrawler.interval = asyncCrawler.interval = 1; 20 | 21 | var linksDiscovered = 0; 22 | 23 | it("should be able to be started",function(done) { 24 | 25 | localCrawler.on("crawlstart",function() { done() }); 26 | localCrawler.on("discoverycomplete",function() { 27 | linksDiscovered ++; 28 | }); 29 | 30 | localCrawler.start(); 31 | localCrawler.running.should.be.truthy; 32 | }); 33 | 34 | it("should have a queue with at least the initial crawl path",function() { 35 | 36 | localCrawler.queue.length.should.be.greaterThan(0); 37 | }); 38 | 39 | it("should discover all linked resources in the queue",function(done) { 40 | 41 | localCrawler.on("complete",function() { 42 | linksDiscovered.should.equal(5); 43 | done(); 44 | }); 45 | }); 46 | 47 | it("should support async event listeners for manual discovery",function(done) { 48 | 49 | this.slow('1s') 50 | 51 | // Use a different crawler this time 52 | asyncCrawler.discoverResources = false; 53 | asyncCrawler.queueURL("http://127.0.0.1:3000/async-stage1"); 54 | asyncCrawler.start(); 55 | 56 | asyncCrawler.on("fetchcomplete",function(queueItem,data,res) { 57 | var evtDone = this.wait(); 58 | 59 | setTimeout(function(){ 60 | linksDiscovered ++; 61 | 62 | if (String(data).match(/complete/i)) 63 | return evtDone(); 64 | 65 | // Taking advantage of the fact that for these, the sum total 66 | // of the body data is a URL. 67 | asyncCrawler.queueURL(String(data)).should.be.true; 68 | 69 | evtDone(); 70 | },100); 71 | }); 72 | 73 | asyncCrawler.on("complete",function() { 74 | linksDiscovered.should.equal(8); 75 | done(); 76 | }); 77 | }); 78 | 79 | // TODO 80 | 81 | // Test how simple error conditions, content types, and responses are handled. 82 | 83 | // Test encodings. 84 | 85 | // Test URL detection 86 | 87 | // Test handling binary data 88 | 89 | // Test bad content length 90 | 91 | }); 92 | -------------------------------------------------------------------------------- /test/reliability.js: -------------------------------------------------------------------------------- 1 | // Runs a very simple crawl on an HTTP server 2 | 3 | var chai = require("chai"); 4 | chai.should(); 5 | 6 | // Require the same server as in our previous tests... 7 | var testserver = require("./lib/testserver.js"); 8 | 9 | describe("Crawler reliability",function() { 10 | 11 | var Crawler = require("../"); 12 | 13 | it("should be able to handle a timeout",function(done) { 14 | 15 | this.slow('1s') 16 | 17 | var localCrawler = Crawler.crawl("http://127.0.0.1:3000/timeout"); 18 | localCrawler.timeout = 200; 19 | 20 | localCrawler.on("fetchtimeout",function(queueItem) { 21 | queueItem.should.be.an("object"); 22 | queueItem.url.should.equal("http://127.0.0.1:3000/timeout"); 23 | done(); 24 | }); 25 | }); 26 | 27 | it("should be able to freeze and defrost the queue", function(done) { 28 | 29 | var localCrawler = new Crawler("127.0.0.1", "/", 3000), 30 | newCrawler = new Crawler("127.0.0.1", "/", 3000), 31 | tmp = (process.env.TMPDIR || __dirname) + "/queue.json"; 32 | localCrawler.start(); 33 | 34 | var test = function() { 35 | this.stop(); 36 | 37 | // Lets the queue be populated 38 | process.nextTick(function() { 39 | localCrawler.queue.length.should.equal(3); 40 | localCrawler.queue.oldestUnfetchedIndex.should.equal(1); 41 | localCrawler.queue.scanIndex["http://127.0.0.1:3000/"] 42 | .should.equal(true); 43 | localCrawler.queue.scanIndex["http://127.0.0.1:3000/stage2"] 44 | .should.equal(true); 45 | localCrawler.queue.scanIndex["http://127.0.0.1:3000/stage/3"] 46 | .should.equal(true); 47 | 48 | localCrawler.queue[0].status.should.equal("downloaded"); 49 | localCrawler.queue[1].status.should.equal("downloaded"); 50 | localCrawler.queue[2].status.should.equal("queued"); 51 | 52 | localCrawler.queue.freeze(tmp, defrost); 53 | }); 54 | }; 55 | 56 | var defrost = function() { 57 | newCrawler.queue.defrost(tmp, checkDefrost); 58 | }; 59 | 60 | var checkDefrost = function() { 61 | newCrawler.queue.length.should.equal(3); 62 | newCrawler.queue.oldestUnfetchedIndex.should.equal(2); 63 | newCrawler.queue.scanIndex["http://127.0.0.1:3000/"] 64 | .should.equal(true); 65 | newCrawler.queue.scanIndex["http://127.0.0.1:3000/stage2"] 66 | .should.equal(true); 67 | newCrawler.queue.scanIndex["http://127.0.0.1:3000/stage/3"] 68 | .should.equal(true); 69 | 70 | newCrawler.queue[0].status.should.equal("downloaded"); 71 | newCrawler.queue[1].status.should.equal("downloaded"); 72 | newCrawler.queue[2].status.should.equal("queued"); 73 | 74 | newCrawler.queue.oldestUnfetchedItem(function(err, queueItem) { 75 | queueItem.url.should.equal("http://127.0.0.1:3000/stage/3"); 76 | done(); 77 | }); 78 | }; 79 | 80 | localCrawler.once("fetchcomplete", 81 | localCrawler.once.bind(localCrawler, "fetchcomplete", test)); 82 | 83 | }); 84 | }); 85 | -------------------------------------------------------------------------------- /test/discovery.js: -------------------------------------------------------------------------------- 1 | // Runs a very simple crawl on an HTTP server 2 | 3 | var chai = require("chai"); 4 | chai.should(); 5 | 6 | describe("Crawler link discovery",function() { 7 | 8 | var Crawler = null, 9 | crawler = null, 10 | discover = null; 11 | 12 | beforeEach(function() { 13 | Crawler = require("../"); 14 | crawler = new Crawler(); 15 | discover = crawler.discoverResources.bind(crawler); 16 | }); 17 | 18 | it("should discover http/s prefixed URLs in the document",function() { 19 | 20 | var links = 21 | discover(" blah blah http://google.com/ \ 22 | blah blah https://fish.com/resource blah \ 23 | //example.com"); 24 | 25 | links.should.be.an("array"); 26 | links.length.should.equal(2); 27 | links[0].should.equal("http://google.com/"); 28 | links[1].should.equal("https://fish.com/resource"); 29 | }); 30 | 31 | it("should discover URLS in quoted attributes in the document",function() { 32 | 33 | var links = 34 | discover(" \ 35 | \ 36 | url('thingo.com/test.html')"); 37 | 38 | links.should.be.an("array"); 39 | links.length.should.equal(4); 40 | links[0].should.equal("google.com"); 41 | links[1].should.equal("http://example.com/resource%20with%20spaces.txt"); 42 | links[2].should.equal("thingo.com/test.html"); 43 | }); 44 | 45 | it("should discover URLS in unquoted attributes in the document",function() { 46 | 47 | var links = 48 | discover(" \ 49 | \ 50 | url(thingo.com/test.html)"); 51 | 52 | links.should.be.an("array"); 53 | links.length.should.equal(3); 54 | links[0].should.equal("google.com"); 55 | links[1].should.equal("http://example.com/resource"); 56 | links[2].should.equal("thingo.com/test.html"); 57 | }); 58 | 59 | it("should replace all '&'s with ampersands",function() { 60 | 61 | var links = 62 | discover(""); 63 | 64 | links.should.be.an("array"); 65 | links.length.should.equal(2); 66 | links[0].should.equal("http://example.com/resource?with&query=params&and=entities"); 67 | links[1].should.equal("http://example.com/resource"); 68 | }); 69 | 70 | it("should ignore HTML comments with parseHTMLComments = false",function() { 71 | 72 | crawler.parseHTMLComments = false; 73 | 74 | var links = 75 | discover(" \ 76 | \ 77 | "); 81 | 82 | links.should.be.an("array"); 83 | links.length.should.equal(1); 84 | links[0].should.equal("google.com"); 85 | }); 86 | 87 | it("should ignore script tags with parseScriptTags = false",function() { 88 | 89 | crawler.parseScriptTags = false; 90 | 91 | var links = 92 | discover(" \ 93 | \ 94 | "); 98 | 99 | links.should.be.an("array"); 100 | links.length.should.equal(1); 101 | links[0].should.equal("google.com"); 102 | }); 103 | }); 104 | -------------------------------------------------------------------------------- /test/resourcevalidity.js: -------------------------------------------------------------------------------- 1 | // Tests whether a given resource is considered 'valid' for crawling under 2 | // a number of different conditions. 3 | 4 | var chai = require("chai"); 5 | chai.should(); 6 | 7 | describe("Resource validity checker",function() { 8 | 9 | it("should be able to determine whether a domain is in crawl scope", 10 | function() { 11 | 12 | var crawler = new (require("../"))("example.com",3000); 13 | 14 | // The domain itself should be allowed. 15 | crawler.domainValid("example.com").should.equal(true); 16 | 17 | // Whereas other domains should not be allowed. 18 | crawler.domainValid("somethingelse").should.equal(false); 19 | crawler.domainValid("microsoft.com").should.equal(false); 20 | crawler.domainValid("a.really.complex.fqdn.").should.equal(false); 21 | 22 | }); 23 | 24 | it("should be able to determine whether a domain is a subdomain of another", 25 | function() { 26 | 27 | var crawler = new (require("../"))("example.com",3000); 28 | 29 | // Enable scanning subdomains, important for this test 30 | crawler.scanSubdomains = true; 31 | 32 | // The domain itself isn't a subdomain per-se, but should be allowed 33 | crawler.domainValid("example.com").should.equal(true); 34 | 35 | // WWW is a subdomain 36 | crawler.domainValid("www.example.com").should.equal(true); 37 | 38 | // More complex examples 39 | crawler.domainValid("testing.example.com").should.equal(true); 40 | 41 | // Multiple levels 42 | crawler.domainValid("system.cache.example.com").should.equal(true); 43 | 44 | // These aren't valid... 45 | crawler.domainValid("com.example").should.equal(false); 46 | crawler.domainValid("example.com.au").should.equal(false); 47 | crawler.domainValid("example.us").should.equal(false); 48 | 49 | }); 50 | 51 | 52 | it("should consider WWW domains and non-WWW domains alike by default", 53 | function() { 54 | 55 | var crawler = new (require("../"))("example.com",3000); 56 | 57 | // Explicitly disallow crawling subdomains, important for this test 58 | crawler.scanSubdomains = false; 59 | 60 | // The domain itself isn't a subdomain per-se, but should be allowed 61 | crawler.domainValid("example.com").should.equal(true); 62 | 63 | // Its WWW domain should be allowed by default 64 | crawler.domainValid("www.example.com").should.equal(true); 65 | 66 | }); 67 | 68 | it("should consider WWW domains and non-WWW domains as separate if requested", 69 | function() { 70 | 71 | var crawler = new (require("../"))("example.com",3000); 72 | 73 | // Explicitly disallow crawling subdomains, important for this test 74 | crawler.scanSubdomains = false; 75 | 76 | // Explicitly consider www a separate subdomain (ordinarily, true) 77 | crawler.ignoreWWWDomain = false; 78 | 79 | // The domain itself isn't a subdomain per-se, but should be allowed 80 | crawler.domainValid("example.com").should.equal(true); 81 | 82 | // Its WWW domain should be allowed by default 83 | crawler.domainValid("www.example.com").should.equal(false); 84 | 85 | }); 86 | 87 | it("should permit a specified set of domains based on the internal whitelist", 88 | function() { 89 | 90 | var crawler = new (require("../"))("example.com",3000); 91 | 92 | // Add a few specific subdomains 93 | crawler.domainWhitelist.push("foo.com"); 94 | crawler.domainWhitelist.push("bar.com"); 95 | crawler.domainWhitelist.push("abcdefg.net.nz"); 96 | 97 | // The domain itself isn't a subdomain per-se, but should be allowed 98 | crawler.domainValid("example.com").should.equal(true); 99 | 100 | // The explicitly set domains should be permitted 101 | crawler.domainValid("foo.com").should.equal(true); 102 | crawler.domainValid("bar.com").should.equal(true); 103 | crawler.domainValid("abcdefg.net.nz").should.equal(true); 104 | 105 | // These domains were never whitelisted, and should be denied 106 | crawler.domainValid("wumpus.com").should.equal(false); 107 | crawler.domainValid("fish.net").should.equal(false); 108 | 109 | }); 110 | 111 | it("should permit fetching of specified protocols based on internal whitelist", 112 | function() { 113 | 114 | var crawler = new (require("../"))("example.com",3000); 115 | 116 | // Protocols supported by default 117 | crawler.protocolSupported("http://google.com").should.equal(true); 118 | crawler.protocolSupported("https://google.com").should.equal(true); 119 | crawler.protocolSupported("rss://google.com").should.equal(true); 120 | crawler.protocolSupported("feed://google.com").should.equal(true); 121 | crawler.protocolSupported("atom://google.com").should.equal(true); 122 | 123 | // Protocols not supported 124 | crawler.protocolSupported("gopher://google.com").should.equal(false); 125 | crawler.protocolSupported("ws://google.com").should.equal(false); 126 | crawler.protocolSupported("wss://google.com").should.equal(false); 127 | }); 128 | 129 | it("should permit parsing of specified resources based on mimetype checks", 130 | function() { 131 | 132 | this.supportedMimeTypes = [ 133 | /^text\//i, 134 | /^application\/(rss)?[\+\/\-]?xml/i, 135 | /^application\/javascript/i, 136 | /^xml/i 137 | ]; 138 | 139 | var crawler = new (require("../"))("example.com",3000); 140 | 141 | // Protocols supported by default 142 | crawler.mimeTypeSupported("text/plain").should.equal(true); 143 | 144 | // Crawler should be able to process all plain-text formats 145 | crawler.mimeTypeSupported("text/SomeFormat").should.equal(true); 146 | crawler.mimeTypeSupported("text/html").should.equal(true); 147 | 148 | // XML based formats 149 | crawler.mimeTypeSupported("application/rss+xml").should.equal(true); 150 | crawler.mimeTypeSupported("application/html+xml").should.equal(true); 151 | crawler.mimeTypeSupported("application/xhtml+xml").should.equal(true); 152 | 153 | // Some weird JS mimetypes 154 | crawler.mimeTypeSupported("application/javascript").should.equal(true); 155 | 156 | // Anything with XML... 157 | crawler.mimeTypeSupported("xml/manifest").should.equal(true); 158 | 159 | // And these should fail 160 | crawler.mimeTypeSupported("application/octet-stream").should.equal(false); 161 | crawler.mimeTypeSupported("img/png").should.equal(false); 162 | crawler.mimeTypeSupported("video/webm").should.equal(false); 163 | crawler.mimeTypeSupported("blah/blah").should.equal(false); 164 | 165 | }); 166 | 167 | 168 | describe("Link parser",function() { 169 | 170 | var crawler = new (require("../"))("127.0.0.1",3000); 171 | 172 | it("should throw out junky or invalid URLs without dying",function() { 173 | 174 | var urlContext = { 175 | "url": "http://www.example.com" 176 | }; 177 | 178 | crawler.processURL("",urlContext).should.equal(false); 179 | crawler.processURL("\n\n",urlContext).should.equal(false); 180 | crawler.processURL("ur34nfie4985:s////dsf/",urlContext).should.equal(false); 181 | 182 | }); 183 | 184 | }); 185 | }); 186 | 187 | -------------------------------------------------------------------------------- /lib/cache-backend-fs.js: -------------------------------------------------------------------------------- 1 | // Simplecrawler - FS cache backend 2 | // Tries to ensure a local 'cache' of a website is as close as possible to a mirror of the website itself. 3 | // The idea is that it is then possible to re-serve the website just using the cache. 4 | 5 | var fs = require("fs"); 6 | var crypto = require("crypto"); 7 | 8 | // Factory for FSBackend 9 | var backend = function backend(loadParameter) { 10 | return new FSBackend(loadParameter); 11 | }; 12 | 13 | module.exports = backend; 14 | 15 | // Constructor for filesystem cache backend 16 | var FSBackend = function FSBackend(loadParameter) { 17 | this.loaded = false; 18 | this.index = []; 19 | this.location = typeof(loadParameter) === "string" && loadParameter.length > 0 ? loadParameter : process.cwd() + "/cache/"; 20 | this.location = this.location.substr(this.location.length-1) === "/" ? this.location : this.location + "/"; 21 | }; 22 | 23 | // Function for sanitising paths 24 | // We try to get the most understandable, file-system friendly paths we can. 25 | // An extension is added if not present or inappropriate - if a better one can be determined. 26 | // Querystrings are hashed to truncate without (hopefully) collision. 27 | 28 | function sanitisePath(path,queueObject) { 29 | // Remove first slash (as we set one later.) 30 | path = path.replace(/^\//,""); 31 | 32 | var pathStack = []; 33 | 34 | // Trim whitespace. If no path is present - assume index.html. 35 | var sanitisedPath = path.length ? path.replace(/\s*$/ig,"") : "index.html"; 36 | var headers = queueObject.stateData.headers, sanitisedPathParts; 37 | 38 | if (sanitisedPath.match(/\?/)) { 39 | sanitisedPathParts = sanitisedPath.split(/\?/g); 40 | var resource = sanitisedPathParts.shift(); 41 | var hashedQS = crypto.createHash("sha1").update(sanitisedPathParts.join("?")).digest("hex"); 42 | sanitisedPath = resource + "?" + hashedQS; 43 | } 44 | 45 | pathStack = sanitisedPath.split(/\//g); 46 | pathStack = pathStack.map(function(pathChunk,count) { 47 | if (pathChunk.length >= 250) { 48 | return crypto.createHash("sha1").update(pathChunk).digest("hex"); 49 | } 50 | 51 | return pathChunk; 52 | }); 53 | 54 | sanitisedPath = pathStack.join("/"); 55 | 56 | // Try to get a file extension for the file - for ease of identification 57 | // We run through this if we either: 58 | // 1) haven't got a file extension at all, or: 59 | // 2) have an HTML file without an HTML file extension (might be .php, .aspx, .do, or some other server-processed type) 60 | 61 | if (!sanitisedPath.match(/\.[a-z0-9]{1,6}$/i) || (headers["content-type"] && headers["content-type"].match(/text\/html/i) && !sanitisedPath.match(/\.htm[l]?$/i))) { 62 | var subMimeType = ""; 63 | var mimeParts = []; 64 | 65 | if (headers["content-type"] && headers["content-type"].match(/text\/html/i)) { 66 | if (sanitisedPath.match(/\/$/)) { 67 | sanitisedPath += "index.html"; 68 | } else { 69 | sanitisedPath += ".html"; 70 | } 71 | 72 | } else if (headers["content-type"] && (mimeParts = headers["content-type"].match(/(image|video|audio|application)\/([a-z0-9]+)/i))) { 73 | subMimeType = mimeParts[2]; 74 | sanitisedPath += "." + subMimeType; 75 | } 76 | } 77 | 78 | return sanitisedPath; 79 | } 80 | 81 | FSBackend.prototype.fileExists = function(location) { 82 | try { 83 | fs.statSync(location); 84 | return true; 85 | } catch (er) { 86 | return false; 87 | } 88 | }; 89 | 90 | FSBackend.prototype.isDirectory = function(location) { 91 | try { 92 | if (fs.statSync(location).isDirectory()) { 93 | return true; 94 | } 95 | 96 | return false; 97 | } catch (er) { 98 | return false; 99 | } 100 | }; 101 | 102 | FSBackend.prototype.load = function() { 103 | var backend = this; 104 | 105 | if (!this.fileExists(this.location) && this.isDirectory(this.location)) { 106 | throw new Error("Unable to verify cache location exists."); 107 | } 108 | 109 | try { 110 | var fileData; 111 | if ((fileData = fs.readFileSync(this.location + "cacheindex.json")) && fileData.length) { 112 | this.index = JSON.parse(fileData.toString("utf8")); 113 | this.loaded = true; 114 | } 115 | } catch(error) { 116 | if (error.code === "ENOENT") { 117 | // Cache index doesn't exist. Assume this is a new cache. 118 | // Just leave the memory index empty for now. 119 | this.loaded = true; 120 | } else { 121 | throw error; 122 | } 123 | } 124 | 125 | // Flush store to disk when closing. 126 | process.on("exit",function() { 127 | backend.saveCache.apply(backend); 128 | }); 129 | }; 130 | 131 | FSBackend.prototype.saveCache = function(callback) { 132 | fs.writeFile(this.location + "cacheindex.json", JSON.stringify(this.index), callback); 133 | }; 134 | 135 | FSBackend.prototype.setItem = function(queueObject,data,callback) { 136 | callback = callback instanceof Function ? callback : function(){}; 137 | 138 | var backend = this; 139 | var pathStack = [queueObject.protocol, queueObject.domain, queueObject.port]; 140 | pathStack = pathStack.concat(sanitisePath(queueObject.path,queueObject).split(/\/+/g)); 141 | 142 | var cacheItemExists = false; 143 | var firstInstanceIndex = NaN; 144 | if (this.index.reduce(function(prev,current,index,array) { 145 | firstInstanceIndex = !isNaN(firstInstanceIndex) ? firstInstanceIndex : index; 146 | return prev || current.url === queueObject.url; 147 | },false)) { 148 | cacheItemExists = true; 149 | } 150 | 151 | var writeFileData = function(currentPath,data) { 152 | fs.writeFile(currentPath,data,function(error) { 153 | if (error) throw error; 154 | fs.writeFile(currentPath + ".cacheData.json",JSON.stringify(queueObject),function(error) { 155 | if (error) throw error; 156 | 157 | var cacheObject = { 158 | url: queueObject.url, 159 | etag: queueObject.stateData.headers.etag, 160 | lastModified: queueObject.stateData.headers['last-modified'], 161 | dataFile: currentPath, 162 | metaFile: currentPath + ".cacheData.json" 163 | }; 164 | 165 | if (cacheItemExists) { 166 | backend.index[firstInstanceIndex] = cacheObject; 167 | } else { 168 | backend.index.push(cacheObject); 169 | } 170 | 171 | callback(cacheObject); 172 | }); 173 | }); 174 | }; 175 | 176 | pathStack.forEach(function(pathChunk,count) { 177 | var currentPath = backend.location + pathStack.slice(0,count+1).join("/"); 178 | if (backend.fileExists(backend.location + pathStack.slice(0,count+1).join("/"))) { 179 | if (!backend.isDirectory(currentPath)) { 180 | if (count === pathStack.length -1) { 181 | // Just overwrite the file... 182 | writeFileData(currentPath,data); 183 | } else { 184 | throw new Error("Cache storage of resource (%s) blocked by file: %s",queueObject.url,currentPath); 185 | } 186 | } 187 | } else { 188 | if (count === pathStack.length -1) { 189 | // Write the file data in 190 | writeFileData(currentPath,data); 191 | } else { 192 | fs.mkdirSync(currentPath); 193 | } 194 | } 195 | }); 196 | }; 197 | 198 | FSBackend.prototype.getItem = function(queueObject,callback) { 199 | var cacheItemResult = this.index.filter(function(item) { 200 | return item.url === queueObject.url; 201 | }); 202 | 203 | if (cacheItemResult.length) { 204 | var cacheItem = cacheItemResult.shift(); 205 | 206 | callback({ 207 | "url": cacheItem.url, 208 | "etag": cacheItem.etag, 209 | "lastModified": cacheItem.lastModified, 210 | "getData": function(callback) { 211 | fs.readFile(cacheItem.dataFile,function(error,data) { 212 | if (error) { 213 | callback(error); 214 | return false; 215 | } 216 | 217 | callback(null,data); 218 | }); 219 | }, 220 | "getMetadata": function(callback) { 221 | fs.readFile(cacheItem.metaFile,function(error,data) { 222 | if (error) { 223 | callback(error); 224 | return false; 225 | } 226 | 227 | callback(null,JSON.parse(data.toString("utf8"))); 228 | }); 229 | } 230 | }); 231 | 232 | } else { 233 | callback(null); 234 | } 235 | 236 | return false; 237 | }; 238 | 239 | -------------------------------------------------------------------------------- /test/cookies.js: -------------------------------------------------------------------------------- 1 | // Ensures that cookie support is functional and reliable across 2 | // a variety of different cookie formats. The more cookies I can add to this 3 | // cookies array, the better the tests! 4 | 5 | var chai = require("chai"); 6 | chai.should(); 7 | 8 | var cookies = [ 9 | "Set-Cookie: RMID=007f010019155170d6ca005f; Expires=Sat, 19 Apr 2020 05:31:54 GMT; Path=/; Domain=.nytimes.com;", 10 | "Set-cookie: adxcs=-; path=/; domain=.nytimes.com", 11 | "Set-Cookie: PREF=ID=8c63f2522e22574d:FF=0:TM=1366349569:LM=1366349569:S=p1Urbmfwfs-R573P; expires=Sun, 19-Apr-2015 05:32:49 GMT; path=/; domain=.google.com", 12 | "Set-Cookie: NID=67=DhLO04YPAMlhETrTIe2oFPqWZfypQXLZfCIPItOvf70zhtUEMEItYfdVh6aROEzRHqtd9jHT6HJ7Oo93eqP3cjYNp8GgShfa6r0WVbsmQQRUvutbjBOPwzo7ybwYcWdB; expires=Sat, 19-Oct-2015 05:32:49 GMT; path=/; domain=.google.com; HttpOnly", 13 | "Set-Cookie: fpc=d=Yq1z8hbA9WextmPFlb7suMTfMRgtSc2FyzAB7now1ExfUZ.eW7s4QSwSKlB6ZB0juN8OLZxWf_XXEIcspYaQmVVD0mD0xJ.xpXBCSw5Dl_Ql6n.RLoM.7CnTbNSsiSr2fkNiCN47tRUB4j8iWevNwQdFDn1hB8z8t1xwWt76n.sLIRY9p2_jTBhukfSD4SBpBkJhI1o-&v=2; expires=Sat, 19-Apr-2020 05:48:42 GMT; path=/; domain=www.yahoo.com", 14 | "Set-Cookie: test=test; path=/test; domain=test.com" 15 | ]; 16 | 17 | describe("Cookies",function() { 18 | 19 | var CookieJar = require("../lib/cookies.js"), 20 | Cookie = CookieJar.Cookie; 21 | 22 | it("should be able parse from string properly",function() { 23 | 24 | Cookie.should.be.a("function"); 25 | Cookie.fromString.should.be.a("function"); 26 | Cookie.fromString(cookies[0]).should.be.an("object"); 27 | Cookie.fromString(cookies[0]).should.be.an.instanceof(Cookie); 28 | 29 | var tmpCookie = Cookie.fromString(cookies[0]); 30 | 31 | tmpCookie.name.should.equal("RMID"); 32 | tmpCookie.value.should.equal("007f010019155170d6ca005f"); 33 | tmpCookie.expires.should.equal(1587274314000); 34 | tmpCookie.path.should.equal("/"); 35 | tmpCookie.domain.should.equal(".nytimes.com"); 36 | 37 | // Test the next cookie... 38 | tmpCookie = Cookie.fromString(cookies[1]); 39 | 40 | tmpCookie.name.should.equal("adxcs"); 41 | tmpCookie.value.should.equal("-"); 42 | tmpCookie.expires.should.equal(-1); 43 | tmpCookie.path.should.equal("/"); 44 | tmpCookie.domain.should.equal(".nytimes.com"); 45 | 46 | }); 47 | 48 | it("should be able to test for expiry",function() { 49 | 50 | // Create a new cookie that should already have expired... 51 | var tmpCookie = new Cookie("test","test",Date.now()-1000); 52 | 53 | tmpCookie.isExpired().should.equal(true); 54 | 55 | // Create a new cookie with an expiry 20 seconds in the future 56 | tmpCookie = new Cookie("test","test",Date.now()+20000); 57 | 58 | tmpCookie.isExpired().should.equal(false); 59 | }); 60 | 61 | it("should be able to output the cookie object as a string",function() { 62 | 63 | cookies.forEach(function(cookie) { 64 | var tmpCookie = Cookie.fromString(cookie), 65 | outputString = tmpCookie.toString(true), 66 | reParsedCookie = Cookie.fromString(outputString); 67 | 68 | tmpCookie.name.should.equal(reParsedCookie.name); 69 | tmpCookie.value.should.equal(reParsedCookie.value); 70 | tmpCookie.expires.should.equal(reParsedCookie.expires); 71 | tmpCookie.path.should.equal(reParsedCookie.path); 72 | tmpCookie.domain.should.equal(reParsedCookie.domain); 73 | tmpCookie.httponly.should.equal(reParsedCookie.httponly); 74 | }) 75 | }); 76 | 77 | describe("Cookie Jar",function() { 78 | 79 | it("should be able to be instantiated",function() { 80 | var cookieJar = new CookieJar(); 81 | }); 82 | 83 | it("should be able to add cookies",function() { 84 | var cookieJar = new CookieJar(); 85 | 86 | cookies.forEach(function(cookie) { 87 | var parsedCookie = Cookie.fromString(cookie); 88 | 89 | cookieJar.add( 90 | parsedCookie.name, 91 | parsedCookie.value, 92 | parsedCookie.expires, 93 | parsedCookie.path, 94 | parsedCookie.domain, 95 | parsedCookie.httponly); 96 | 97 | var cookiesAdded = cookieJar.get(parsedCookie.name), 98 | parsedCookie2 = cookiesAdded.pop(); 99 | 100 | parsedCookie2.name.should.equal(parsedCookie.name); 101 | parsedCookie2.value.should.equal(parsedCookie.value); 102 | parsedCookie2.expires.should.equal(parsedCookie.expires); 103 | parsedCookie2.path.should.equal(parsedCookie.path); 104 | parsedCookie2.domain.should.equal(parsedCookie.domain); 105 | parsedCookie2.httponly.should.equal(parsedCookie.httponly); 106 | }); 107 | 108 | cookieJar.cookies.length.should.equal(cookies.length); 109 | }); 110 | 111 | it("should be able to remove cookies by name",function() { 112 | var cookieJar = new CookieJar(); 113 | 114 | cookies.forEach(function(cookie) { 115 | var parsedCookie = Cookie.fromString(cookie); 116 | 117 | cookieJar.add( 118 | parsedCookie.name, 119 | parsedCookie.value, 120 | parsedCookie.expires, 121 | parsedCookie.path, 122 | parsedCookie.domain, 123 | parsedCookie.httponly); 124 | }); 125 | 126 | cookieJar.cookies.length.should.equal(cookies.length); 127 | 128 | cookies.forEach(function(cookie,index) { 129 | var parsedCookie = Cookie.fromString(cookie); 130 | 131 | cookieJar.remove(parsedCookie.name); 132 | 133 | cookieJar.cookies.length.should.equal( 134 | cookies.length - (index+1)); 135 | }); 136 | }); 137 | 138 | it("should be able to retrieve cookies by name",function() { 139 | var cookieJar = new CookieJar(); 140 | 141 | cookies.forEach(function(cookie) { 142 | var parsedCookie = Cookie.fromString(cookie); 143 | 144 | cookieJar.add( 145 | parsedCookie.name, 146 | parsedCookie.value, 147 | parsedCookie.expires, 148 | parsedCookie.path, 149 | parsedCookie.domain, 150 | parsedCookie.httponly); 151 | 152 | var returnedCookies = cookieJar.get(parsedCookie.name), 153 | parsedCookie2 = returnedCookies.pop(); 154 | 155 | parsedCookie2.name.should.equal(parsedCookie.name); 156 | parsedCookie2.value.should.equal(parsedCookie.value); 157 | parsedCookie2.expires.should.equal(parsedCookie.expires); 158 | parsedCookie2.path.should.equal(parsedCookie.path); 159 | parsedCookie2.domain.should.equal(parsedCookie.domain); 160 | parsedCookie2.httponly.should.equal(parsedCookie.httponly); 161 | }); 162 | }); 163 | 164 | it("should be able to accept cookies from a header/s",function() { 165 | var cookieJar = new CookieJar(); 166 | cookieJar.addFromHeaders(cookies); 167 | 168 | cookies.forEach(function(cookie) { 169 | var parsedCookie = Cookie.fromString(cookie); 170 | var returnedCookies = cookieJar.get(parsedCookie.name), 171 | parsedCookie2 = returnedCookies.slice(0,1).pop(); 172 | 173 | returnedCookies.length.should.equal(1); 174 | parsedCookie2.name.should.equal(parsedCookie.name); 175 | parsedCookie2.value.should.equal(parsedCookie.value); 176 | parsedCookie2.expires.should.equal(parsedCookie.expires); 177 | parsedCookie2.path.should.equal(parsedCookie.path); 178 | parsedCookie2.domain.should.equal(parsedCookie.domain); 179 | parsedCookie2.httponly.should.equal(parsedCookie.httponly); 180 | }); 181 | }); 182 | 183 | it("should be able to generate a header from internal storage",function() { 184 | var cookieJar = new CookieJar(); 185 | cookieJar.addFromHeaders(cookies); 186 | var comparisonHeaderList = cookieJar.getAsHeader(); 187 | 188 | comparisonHeaderList.should.be.an("array"); 189 | comparisonHeaderList.length.should.equal(cookies.length); 190 | 191 | comparisonHeaderList.forEach(function(header,index) { 192 | var parsedCookie = Cookie.fromString(cookies[index]); 193 | var parsedCookie2 = Cookie.fromString(header); 194 | 195 | parsedCookie2.name.should.equal(parsedCookie.name); 196 | parsedCookie2.value.should.equal(parsedCookie.value); 197 | parsedCookie2.expires.should.equal(parsedCookie.expires); 198 | parsedCookie2.path.should.equal(parsedCookie.path); 199 | parsedCookie2.domain.should.equal(parsedCookie.domain); 200 | parsedCookie2.httponly.should.equal(parsedCookie.httponly); 201 | }); 202 | }); 203 | 204 | it("should be able to filter generated headers by domain and path",function() { 205 | var cookieJar = new CookieJar(); 206 | cookieJar.addFromHeaders(cookies); 207 | var comparisonHeaderList = cookieJar.getAsHeader("nytimes.com"); 208 | 209 | comparisonHeaderList.length.should.equal(2); 210 | 211 | comparisonHeaderList = cookieJar.getAsHeader(null,"/"); 212 | 213 | // Even though there's 6 cookies. 214 | comparisonHeaderList.length.should.equal(5); 215 | }); 216 | 217 | it("should be able to filter generated headers by expiry",function() { 218 | var cookieJar = new CookieJar(); 219 | cookieJar.addFromHeaders(cookies); 220 | 221 | // set the expiry on one of the headers to some point far in the past 222 | cookieJar.cookies[0].expires /= 2; 223 | 224 | // Get the headers... 225 | var comparisonHeaderList = cookieJar.getAsHeader(); 226 | 227 | comparisonHeaderList.length.should.equal(cookies.length-1); 228 | }); 229 | }); 230 | }); 231 | -------------------------------------------------------------------------------- /lib/queue.js: -------------------------------------------------------------------------------- 1 | // Simplecrawler - queue module 2 | // Christopher Giffard, 2011 3 | // 4 | // http://www.github.com/cgiffard/node-simplecrawler 5 | 6 | 7 | var fs = require("fs"); 8 | 9 | var allowedStatistics = [ 10 | "requestTime", 11 | "requestLatency", 12 | "downloadTime", 13 | "contentLength", 14 | "actualDataSize" 15 | ]; 16 | 17 | var FetchQueue = function(){ 18 | this.oldestUnfetchedIndex = 0; 19 | this.completeCache = 0; 20 | this.scanIndex = {}; 21 | }; 22 | 23 | module.exports = FetchQueue; 24 | 25 | FetchQueue.prototype = []; 26 | FetchQueue.prototype.add = function(protocol, domain, port, path, depth, callback) { 27 | 28 | // For legacy reasons 29 | if (depth instanceof Function) { 30 | callback = depth; 31 | depth = 1; 32 | } 33 | 34 | depth = depth || 1; 35 | callback = callback && callback instanceof Function ? callback : function(){}; 36 | var self = this; 37 | 38 | // Ensure all variables conform to reasonable defaults 39 | protocol = protocol === "https" ? "https" : "http"; 40 | 41 | if (isNaN(port) || !port) { 42 | return callback(new Error("Port must be numeric!")); 43 | } 44 | 45 | var url = protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path; 46 | 47 | this.exists(protocol,domain,port,path, 48 | function(err,exists) { 49 | if (err) return callback(err); 50 | 51 | if (!exists) { 52 | var queueItem = { 53 | "url": url, 54 | "protocol": protocol, 55 | "host": domain, 56 | "port": port, 57 | "path": path, 58 | "depth": depth, 59 | "fetched": false, 60 | "status": "queued", 61 | "stateData": {} 62 | }; 63 | 64 | self.push(queueItem); 65 | callback(null, queueItem); 66 | } else { 67 | var error = new Error("Resource already exists in queue!"); 68 | error.code = "DUP"; 69 | 70 | callback(error); 71 | } 72 | }); 73 | }; 74 | 75 | // Check if an item already exists in the queue... 76 | FetchQueue.prototype.exists = function(protocol, domain, port, path, callback) { 77 | callback = callback && callback instanceof Function ? callback : function(){}; 78 | 79 | port = (port !== 80 ? ":" + port : ""); 80 | 81 | var url = 82 | (protocol + "://" + domain + port + path) 83 | .toLowerCase(); 84 | 85 | if (!!this.scanIndex[url]) { 86 | callback(null, 1); 87 | return 1; 88 | } else { 89 | this.scanIndex[url] = true; 90 | callback(null, 0); 91 | return 0; 92 | } 93 | }; 94 | 95 | // Get last item in queue... 96 | FetchQueue.prototype.last = function(callback) { 97 | callback = callback && callback instanceof Function ? callback : function(){}; 98 | var item, self = this; 99 | item = self[self.length-1]; 100 | callback(null, item); 101 | return item; 102 | }; 103 | 104 | // Get item from queue 105 | FetchQueue.prototype.get = function(id, callback) { 106 | callback = callback && callback instanceof Function ? callback : function(){}; 107 | var item, self = this; 108 | 109 | if (!isNaN(id) && self.length > id) { 110 | item = self[id]; 111 | callback(null, item); 112 | return item; 113 | } 114 | }; 115 | 116 | // Get first unfetched item in the queue (and return its index) 117 | FetchQueue.prototype.oldestUnfetchedItem = function(callback) { 118 | callback = callback && callback instanceof Function ? callback : function(){}; 119 | var item, self = this; 120 | 121 | for (var itemIndex = self.oldestUnfetchedIndex; itemIndex < self.length; itemIndex ++) { 122 | if (self[itemIndex].status === "queued") { 123 | self.oldestUnfetchedIndex = itemIndex; 124 | item = self[itemIndex]; 125 | callback(null, item); 126 | return item; 127 | } 128 | } 129 | 130 | callback(new Error("No unfetched items remain.")); 131 | }; 132 | 133 | // Gets the maximum total request time, request latency, or download time 134 | FetchQueue.prototype.max = function(statisticName, callback) { 135 | callback = callback && callback instanceof Function ? callback : function(){}; 136 | var maxStatisticValue = 0, self = this; 137 | 138 | if (allowedStatistics.join().indexOf(statisticName) === -1) { 139 | // Not a recognised statistic! 140 | return callback(new Error("Invalid statistic.")); 141 | } 142 | 143 | self.forEach(function(item) { 144 | if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] > maxStatisticValue) { 145 | maxStatisticValue = item.stateData[statisticName]; 146 | } 147 | }); 148 | 149 | callback(null, maxStatisticValue); 150 | return maxStatisticValue; 151 | }; 152 | 153 | // Gets the minimum total request time, request latency, or download time 154 | FetchQueue.prototype.min = function(statisticName, callback) { 155 | callback = callback && callback instanceof Function ? callback : function(){}; 156 | var minimum, minStatisticValue = Infinity, self = this; 157 | 158 | if (allowedStatistics.join().indexOf(statisticName) === -1) { 159 | // Not a recognised statistic! 160 | return callback(new Error("Invalid statistic.")); 161 | } 162 | 163 | self.forEach(function(item) { 164 | if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] < minStatisticValue) { 165 | minStatisticValue = item.stateData[statisticName]; 166 | } 167 | }); 168 | 169 | minimum = minStatisticValue === Infinity? 0 : minStatisticValue; 170 | callback(null, minimum); 171 | return minimum; 172 | }; 173 | 174 | // Gets the minimum total request time, request latency, or download time 175 | FetchQueue.prototype.avg = function(statisticName, callback) { 176 | callback = callback && callback instanceof Function ? callback : function(){}; 177 | var average, NumberSum = 0, NumberCount = 0, self = this; 178 | 179 | if (allowedStatistics.join().indexOf(statisticName) === -1) { 180 | // Not a recognised statistic! 181 | return callback(new Error("Invalid statistic.")); 182 | } 183 | 184 | self.forEach(function(item) { 185 | if (item.fetched && item.stateData[statisticName] !== null && !isNaN(item.stateData[statisticName])) { 186 | NumberSum += item.stateData[statisticName]; 187 | NumberCount ++; 188 | } 189 | }); 190 | average = NumberSum / NumberCount; 191 | callback(null, average); 192 | return average; 193 | }; 194 | 195 | // Gets the number of requests which have been completed. 196 | FetchQueue.prototype.complete = function(callback) { 197 | callback = callback && callback instanceof Function ? callback : function(){}; 198 | var NumberComplete = 0, self = this; 199 | 200 | self.forEach(function(item) { 201 | if (item.fetched) { 202 | NumberComplete ++; 203 | } 204 | }); 205 | 206 | callback(null, NumberComplete); 207 | return NumberComplete; 208 | }; 209 | 210 | // Gets the number of queue items with the given status 211 | FetchQueue.prototype.countWithStatus = function(status, callback) { 212 | callback = callback && callback instanceof Function ? callback : function(){}; 213 | var queueItemsMatched = 0, self = this; 214 | 215 | self.forEach(function(item) { 216 | if (item.status === status) { 217 | queueItemsMatched ++; 218 | } 219 | }); 220 | 221 | callback(null,queueItemsMatched); 222 | return queueItemsMatched; 223 | }; 224 | 225 | // Gets the number of queue items with the given status 226 | FetchQueue.prototype.getWithStatus = function(status, callback) { 227 | callback = callback && callback instanceof Function ? callback : function(){}; 228 | var subqueue = [], self = this; 229 | 230 | self.forEach(function(item,index) { 231 | if (item.status === status) { 232 | subqueue.push(item); 233 | subqueue[subqueue.length-1].queueIndex = index; 234 | } 235 | }); 236 | 237 | callback(null,subqueue); 238 | return subqueue; 239 | }; 240 | 241 | // Gets the number of requests which have failed for some reason 242 | FetchQueue.prototype.errors = function(callback) { 243 | callback = callback && callback instanceof Function ? callback : function(){}; 244 | var total, failedCount, notFoundCount, self = this; 245 | 246 | failedCount = self.countWithStatus("failed"); 247 | notFoundCount = self.countWithStatus("notfound"); 248 | total = failedCount + notFoundCount; 249 | callback(null, total); 250 | return total; 251 | }; 252 | 253 | // Gets the number of items in the queue 254 | FetchQueue.prototype.getLength = function(callback) { 255 | return callback(null, this.length); 256 | }; 257 | 258 | // Writes the queue to disk 259 | FetchQueue.prototype.freeze = function(filename,callback) { 260 | callback = callback && callback instanceof Function ? callback : function(){}; 261 | var self = this; 262 | 263 | // Re-queue in-progress items before freezing... 264 | self.forEach(function(item) { 265 | if (item.fetched !== true) { 266 | item.status = "queued"; 267 | } 268 | }); 269 | 270 | fs.writeFile(filename,JSON.stringify(self),function(err) { 271 | callback(err, self); 272 | }); 273 | }; 274 | 275 | // Reads the queue from disk 276 | FetchQueue.prototype.defrost = function(filename, callback) { 277 | callback = callback && callback instanceof Function ? callback : function(){}; 278 | var fileData, self = this, defrostedQueue = []; 279 | 280 | fs.readFile(filename,function(err,fileData) { 281 | if (err) return callback(err); 282 | 283 | if (!fileData.toString("utf8").length) { 284 | return callback(new Error("Failed to defrost queue from zero-length JSON.")); 285 | } 286 | 287 | try { 288 | defrostedQueue = JSON.parse(fileData.toString("utf8")); 289 | } catch(error) { 290 | return callback(error); 291 | } 292 | 293 | self.oldestUnfetchedIndex = Infinity; 294 | self.scanIndex = {}; 295 | 296 | for (var index in defrostedQueue) { 297 | if (defrostedQueue.hasOwnProperty(index) && !isNaN(index)) { 298 | var queueItem = defrostedQueue[index]; 299 | self.push(queueItem); 300 | 301 | if (queueItem.status !== "downloaded") 302 | self.oldestUnfetchedIndex = Math.min( 303 | self.oldestUnfetchedIndex, index); 304 | 305 | self.scanIndex[queueItem.url] = true; 306 | } 307 | } 308 | 309 | if (self.oldestUnfetchedIndex === Infinity) 310 | self.oldestUnfetchedIndex = 0; 311 | 312 | callback(null,self); 313 | }); 314 | }; 315 | -------------------------------------------------------------------------------- /lib/cookies.js: -------------------------------------------------------------------------------- 1 | // Cookie Jar Functionality 2 | var EventEmitter = require("events").EventEmitter, 3 | util = require("util"); 4 | 5 | /* 6 | Public: Constructor for the cookie jar. 7 | 8 | Examples 9 | 10 | var cookieJar = new CookieJar(); 11 | 12 | Returns the cookie jar object which has now been constructed. 13 | 14 | */ 15 | function CookieJar() { 16 | var cookies = []; 17 | this.__defineGetter__("cookies",function() { 18 | return cookies; 19 | }); 20 | 21 | // Run the EventEmitter constructor 22 | EventEmitter.call(this); 23 | } 24 | 25 | util.inherits(CookieJar,EventEmitter); 26 | 27 | /* 28 | Public: Adds a new cookie to the jar, either by creating a new Cookie() object 29 | from specific details such as name, value, etc., accepting a string from a 30 | Set-Cookie header, or by passing in an existing Cookie() object. 31 | 32 | name - The name of the cookie to add. Alternately, set-cookie 33 | header as string, or an existing cookie object. 34 | value - The value of the cookie. 35 | expiry - Expiry timestamp in milliseconds. 36 | path - Limit cookie to path (defaults to "/") 37 | domain - Limit cookie to domain 38 | httponly - Boolean value specifying httponly 39 | cb - Optional callback. 40 | 41 | Emits 42 | 43 | addcookie - Emitted with new cookie object as an argument. 44 | 45 | Examples 46 | 47 | cookieJar.add("mycookie","myValue",Date.now(),"/","test.com",false); 48 | 49 | Returns the cookie jar object for chaining. 50 | 51 | */ 52 | CookieJar.prototype.add = function(name,value,expiry,path,domain,httponly,cb) { 53 | 54 | var existingIndex = -1, newCookie; 55 | 56 | if (arguments.length > 1) { 57 | newCookie = new Cookie(name,value,expiry,path,domain,httponly); 58 | } else if (name instanceof Cookie) { 59 | newCookie = name; 60 | } else { 61 | newCookie = Cookie.fromString(name); 62 | } 63 | 64 | // Are we updating an existing cookie or adding a new one? 65 | this.cookies.forEach(function(cookie,index) { 66 | if (cookie.name === newCookie.name && 67 | cookie.matchDomain(newCookie.domain)) { 68 | 69 | existingIndex = index; 70 | } 71 | }); 72 | 73 | if (existingIndex < 0) { 74 | this.cookies.push(newCookie); 75 | } else { 76 | this.cookies[existingIndex] = newCookie; 77 | } 78 | 79 | this.emit("addcookie",newCookie); 80 | 81 | if (cb && cb instanceof Function) 82 | cb(null,newCookie); 83 | 84 | return this; 85 | }; 86 | 87 | /* 88 | Public: Removes cookies from the cookie jar. If no domain and name are 89 | specified, all cookies in the jar are removed. 90 | 91 | name - The name of the cookie(s) to remove 92 | domain - The domain from which to remove cookies. 93 | cb - Optional callback. 94 | 95 | Emits 96 | 97 | removecookie - Emitted with array of removed cookies. 98 | 99 | Examples 100 | 101 | cookieJar.remove(null,"nytimes.com"); 102 | 103 | Returns an array of removed cookies. 104 | 105 | */ 106 | CookieJar.prototype.remove = function(name,domain,cb) { 107 | var cookiesRemoved = [], jar = this; 108 | 109 | this.cookies.forEach(function(cookie,index) { 110 | 111 | // If the names don't match, we're not removing this cookie 112 | if (!!name && cookie.name !== name) 113 | return false; 114 | 115 | // If the domains don't match, we're not removing this cookie 116 | if (!!domain && !cookie.matchDomain(domain)) 117 | return false; 118 | 119 | // Matched. Remove! 120 | cookiesRemoved.push(jar.cookies.splice(index,1)); 121 | }); 122 | 123 | jar.emit("removecookie",cookiesRemoved); 124 | 125 | if (cb && cb instanceof Function) 126 | cb(null,cookiesRemoved); 127 | 128 | return cookiesRemoved; 129 | }; 130 | 131 | /* 132 | Public: Gets an array of cookies based on name and domain. 133 | 134 | name - The name of the cookie(s) to retrieve 135 | domain - The domain from which to retrieve cookies. 136 | cb - Optional callback. 137 | 138 | Examples 139 | 140 | cookieJar.get(null,"nytimes.com"); 141 | 142 | Returns an array of cookies. 143 | 144 | */ 145 | CookieJar.prototype.get = function(name,domain,cb) { 146 | 147 | var cookies = 148 | this.cookies.filter(function(cookie,index) { 149 | 150 | // If the names don't match, we're not returning this cookie 151 | if (!!name && cookie.name !== name) 152 | return false; 153 | 154 | // If the domains don't match, we're not returning this cookie 155 | if (!!domain && !cookie.matchDomain(domain)) 156 | return false; 157 | 158 | return true; 159 | }); 160 | 161 | if (cb && cb instanceof Function) 162 | cb(null,cookies); 163 | 164 | return cookies; 165 | }; 166 | 167 | /* 168 | Public: Generates an array of headers based on the value of the cookie jar. 169 | 170 | domain - The domain from which to generate cookies. 171 | path - Filter headers to cookies applicable to this path. 172 | cb - Optional callback. 173 | 174 | Examples 175 | 176 | cookieJar.getAsHeader("nytimes.com","/myaccount"); 177 | 178 | Returns an array of cookie headers. 179 | 180 | */ 181 | CookieJar.prototype.getAsHeader = function(domain,path,cb) { 182 | 183 | var headers = 184 | this.cookies 185 | .filter(function(cookie) { 186 | if (cookie.isExpired()) return false; 187 | if (!domain && !path) return true; 188 | if (domain) return cookie.matchDomain(domain); 189 | if (path) return cookie.matchPath(path); 190 | }) 191 | .map(function(cookie) { 192 | return cookie.toString(); 193 | }); 194 | 195 | if (cb && cb instanceof Function) 196 | cb(null,headers); 197 | 198 | return headers; 199 | }; 200 | 201 | /* 202 | Public: Adds cookies to the cookie jar based on an array of 'set-cookie' 203 | headers provided by a webserver. Duplicate cookies are overwritten. 204 | 205 | headers - An array of 'set-cookie' headers 206 | cb - Optional callback. 207 | 208 | Examples 209 | 210 | cookieJar.addFromHeaders(res.headers["set-cookie"]); 211 | 212 | Returns the cookie jar for chaining. 213 | 214 | */ 215 | CookieJar.prototype.addFromHeaders = function(headers,cb) { 216 | var jar = this; 217 | 218 | if (!(headers instanceof Array)) 219 | headers = [headers]; 220 | 221 | headers.forEach(function(header) { 222 | jar.add(header); 223 | }); 224 | 225 | if (cb && cb instanceof Function) 226 | cb(jar); 227 | 228 | return jar; 229 | }; 230 | 231 | /* 232 | Public: Outputs a linefeed-separated list of set-cookie headers representing 233 | the entire contents of the cookie jar. 234 | 235 | Examples 236 | 237 | cookieJar.toString(); 238 | 239 | Returns a list of headers in string form. 240 | 241 | */ 242 | CookieJar.prototype.toString = function() { 243 | return this.getAsHeader().join("\n"); 244 | }; 245 | 246 | 247 | /* 248 | Public: Constructor for the Cookie() object: create a new cookie. 249 | 250 | name - The name of the cookie to add. 251 | value - The value of the cookie. 252 | expires - Expiry timestamp in milliseconds. 253 | path - Limit cookie to path (defaults to "/") 254 | domain - Limit cookie to domain 255 | httponly - Boolean value specifying httponly 256 | 257 | Examples 258 | 259 | var myCookie = new Cookie("mycookie","myValue",Date.now(),"/","test.com",false); 260 | 261 | Returns the newly created Cookie object. 262 | 263 | */ 264 | function Cookie(name,value,expires,path,domain,httponly) { 265 | 266 | if (!name) throw new Error("A name is required to create a cookie."); 267 | 268 | // Parse date to timestamp - consider it never expiring if timestamp is not 269 | // passed to the function 270 | if (expires) { 271 | 272 | if (typeof expires !== "number") 273 | expires = (new Date(expires)).getTime(); 274 | 275 | } else { 276 | expires = -1; 277 | } 278 | 279 | this.name = name; 280 | this.value = value || ""; 281 | this.expires = expires; 282 | this.path = path || "/"; 283 | this.domain = domain || "*"; 284 | this.httponly = !!httponly; 285 | } 286 | 287 | /* 288 | Public, Static: Returns a new Cookie() object based on a header string. 289 | 290 | string - A set-cookie header string 291 | 292 | Examples 293 | 294 | var myCookie = Cookie.fromString(response.headers["set-cookie"][0]); 295 | 296 | Returns the newly created Cookie object. 297 | 298 | */ 299 | Cookie.fromString = function(string) { 300 | 301 | if (!string || typeof string !== "string") 302 | throw new Error("String must be supplied to generate a cookie."); 303 | 304 | function parseKeyVal(input) { 305 | var key = input.split(/\=/).shift(), 306 | val = input.split(/\=/).slice(1).join("="); 307 | 308 | return [key,val]; 309 | } 310 | 311 | string = string.replace(/^\s*set\-cookie\s*\:\s*/i,""); 312 | 313 | var parts = string.split(/\s*\;\s*/i), 314 | name = parseKeyVal(parts.shift()), 315 | keyValParts = {}; 316 | 317 | keyValParts.name = name[0]; 318 | keyValParts.value = name[1]; 319 | 320 | parts 321 | .filter(function(input) { 322 | return !!input.replace(/\s+/ig,"").length; 323 | }) 324 | .map(parseKeyVal) 325 | .forEach(function(keyval) { 326 | var key = String(keyval[0]).toLowerCase().replace(/[^a-z0-9]/ig,""); 327 | keyValParts[key] = keyval[1]; 328 | }); 329 | 330 | return new Cookie( 331 | keyValParts.name, 332 | keyValParts.value, 333 | keyValParts.expires || keyValParts.expiry, 334 | keyValParts.path, 335 | keyValParts.domain, 336 | keyValParts.hasOwnProperty("httponly") 337 | ); 338 | }; 339 | 340 | /* 341 | Public: Outputs the cookie as a string, in the form of a set-cookie header. 342 | 343 | includeHeader - Boolean value specifying whether to include the 344 | 'Set-Cookie: ' header name at the beginning of the 345 | string. 346 | 347 | Examples 348 | 349 | var header = myCookie.toString(true); 350 | 351 | Returns the header string. 352 | 353 | */ 354 | Cookie.prototype.toString = function(includeHeader) { 355 | var string = ""; 356 | 357 | if (includeHeader) string = "Set-Cookie: "; 358 | 359 | string += this.name + "=" + this.value + "; "; 360 | 361 | if (this.expires > 0) 362 | string += "Expires=" + (new Date(this.expires)).toGMTString() + "; "; 363 | 364 | if (!!this.path) 365 | string += "Path=" + this.path + "; "; 366 | 367 | if (!!this.domain) 368 | string += "Domain=" + this.domain + "; "; 369 | 370 | if (!!this.httponly) 371 | string += "Httponly; "; 372 | 373 | return string; 374 | }; 375 | 376 | /* 377 | Public: Determines whether a cookie has expired or not. 378 | 379 | Examples 380 | 381 | if (myCookie.isExpired()) { ... } 382 | 383 | Returns a boolean value specifying whether the cookie has expired (true) or 384 | whether it is still valid (false.) 385 | 386 | */ 387 | Cookie.prototype.isExpired = function() { 388 | if (this.expires < 0) return false; 389 | return (this.expires < Date.now()); 390 | }; 391 | 392 | /* 393 | Public: Determines whether a cookie matches a given domain. 394 | 395 | Examples 396 | 397 | if (myCookie.matchDomain("example.com")) { ... } 398 | 399 | Returns a boolean value specifying whether the cookie matches (true) or 400 | doesn't match (false.) 401 | 402 | */ 403 | Cookie.prototype.matchDomain = function(domain) { 404 | var reverseDomain = this.domain.split("").reverse().join(""), 405 | reverseDomainComp = domain.split("").reverse().join(""); 406 | 407 | return reverseDomain.indexOf(reverseDomainComp) === 0; 408 | }; 409 | 410 | /* 411 | Public: Determines whether a cookie matches a given path. 412 | 413 | Examples 414 | 415 | if (myCookie.matchPath("/test/account")) { ... } 416 | 417 | Returns a boolean value specifying whether the cookie matches (true) or 418 | doesn't match (false.) 419 | 420 | */ 421 | Cookie.prototype.matchPath = function(path) { 422 | if (!this.path) return true; 423 | 424 | return path.indexOf(this.path) === 0; 425 | }; 426 | 427 | module.exports = CookieJar; 428 | module.exports.Cookie = Cookie; 429 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # Simple web-crawler for node.js [![Build Status](https://travis-ci.org/cgiffard/node-simplecrawler.png?branch=master)](https://travis-ci.org/cgiffard/node-simplecrawler) 2 | 3 | Simplecrawler is designed to provide the most basic possible API for crawling 4 | websites, while being as flexible and robust as possible. I wrote simplecrawler 5 | to archive, analyse, and search some very large websites. It has happily chewed 6 | through 50,000 pages and written tens of gigabytes to disk without issue. 7 | 8 | #### Example (simple mode) 9 | 10 | ```javascript 11 | var Crawler = require("simplecrawler"); 12 | 13 | Crawler.crawl("http://example.com/") 14 | .on("fetchcomplete",function(queueItem){ 15 | console.log("Completed fetching resource:",queueItem.url); 16 | }); 17 | ``` 18 | 19 | ### What does simplecrawler do? 20 | 21 | * Provides a very simple event driven API using `EventEmitter` 22 | * Extremely configurable base for writing your own crawler 23 | * Provides some simple logic for autodetecting linked resources - which you can 24 | replace or augment 25 | * Has a flexible queue system which can be frozen to disk and defrosted 26 | * Provides basic statistics on network performance 27 | * Uses buffers for fetching and managing data, preserving binary data (except 28 | when discovering links) 29 | 30 | ### Installation 31 | 32 | ``` 33 | npm install simplecrawler 34 | ``` 35 | 36 | ### Getting Started 37 | 38 | There are two ways of instantiating a new crawler - a simple but less flexible 39 | method inspired by [anemone](http://anemone.rubyforge.org), and the traditional 40 | method which provides a little more room to configure crawl parameters. 41 | 42 | Regardless of wether you use the simple or traditional methods of instantiation, 43 | you'll need to require simplecrawler: 44 | 45 | ```javascript 46 | var Crawler = require("simplecrawler"); 47 | ``` 48 | 49 | #### Simple Mode 50 | 51 | Simple mode generates a new crawler for you, preconfigures it based on a URL you 52 | provide, and returns the crawler to you for further configuration and so you can 53 | attach event handlers. 54 | 55 | Simply call `Crawler.crawl`, with a URL first parameter, and two optional 56 | functions that will be added as event listeners for `fetchcomplete` and 57 | `fetcherror` respectively. 58 | 59 | ```javascript 60 | Crawler.crawl("http://example.com/", function(queueItem){ 61 | console.log("Completed fetching resource:",queueItem.url); 62 | }); 63 | ``` 64 | 65 | Alternately, if you decide to omit these functions, you can use the returned 66 | crawler object to add the event listeners yourself, and tweak configuration 67 | options: 68 | 69 | ```javascript 70 | var crawler = Crawler.crawl("http://example.com/"); 71 | 72 | crawler.interval = 500; 73 | 74 | crawler.on("fetchcomplete",function(queueItem){ 75 | console.log("Completed fetching resource:",queueItem.url); 76 | }); 77 | ``` 78 | 79 | #### Advanced Mode 80 | 81 | The alternative method of creating a crawler is to call the `simplecrawler` 82 | constructor yourself, and to initiate the crawl manually. 83 | 84 | ```javascript 85 | var myCrawler = new Crawler("www.example.com"); 86 | ``` 87 | 88 | Nonstandard port? HTTPS? Want to start archiving a specific path? No problem: 89 | 90 | ```javascript 91 | myCrawler.initialPath = "/archive"; 92 | myCrawler.initialPort = 8080; 93 | myCrawler.initialProtocol = "https"; 94 | 95 | // Or: 96 | var myCrawler = new Crawler("www.example.com","/archive",8080); 97 | 98 | ``` 99 | 100 | And of course, you're probably wanting to ensure you don't take down your web 101 | server. Decrease the concurrency from five simultaneous requests - and increase 102 | the request interval from the default 250ms like this: 103 | 104 | ```javascript 105 | myCrawler.interval = 10000; // Ten seconds 106 | myCrawler.maxConcurrency = 1; 107 | ``` 108 | 109 | You can also define a max depth for links to fetch : 110 | ```javascript 111 | myCrawler.maxDepth = 1; // Only first page is fetched (with linked CSS & images) 112 | // Or: 113 | myCrawler.maxDepth = 2; // First page and discovered links from it are fetched 114 | // Or: 115 | myCrawler.maxDepth = 3; // Etc. 116 | ``` 117 | 118 | For brevity, you may also specify the initial path and request interval when 119 | creating the crawler: 120 | 121 | ```javascript 122 | var myCrawler = new Crawler("www.example.com","/",8080,300); 123 | ``` 124 | 125 | ### Running the crawler 126 | 127 | First, you'll need to set up an event listener to get the fetched data: 128 | 129 | ```javascript 130 | myCrawler.on("fetchcomplete",function(queueItem, responseBuffer, response) { 131 | console.log("I just received %s (%d bytes)",queueItem.url,responseBuffer.length); 132 | console.log("It was a resource of type %s",response.headers['content-type']); 133 | 134 | // Do something with the data in responseBuffer 135 | }); 136 | ``` 137 | 138 | Then, when you're satisfied you're ready to go, start the crawler! It'll run 139 | through its queue finding linked resources on the domain to download, until it 140 | can't find any more. 141 | 142 | ```javascript 143 | myCrawler.start(); 144 | ``` 145 | 146 | Of course, once you've got that down pat, there's a fair bit more you can listen for... 147 | 148 | ### Events 149 | 150 | * `crawlstart` 151 | Fired when the crawl begins or is restarted. 152 | * `queueadd` ( queueItem ) 153 | Fired when a new item is automatically added to the queue (not when you manually 154 | queue an item yourself.) 155 | * `queueduplicate` ( URLData ) 156 | Fired when an item cannot be added to the queue because it is already present in 157 | the queue. Frequent firing of this event is normal and expected. 158 | * `queueerror` ( errorData , URLData ) 159 | Fired when an item cannot be added to the queue due to error. 160 | * `fetchstart` ( queueItem , requestOptions ) 161 | Fired when an item is spooled for fetching. If your event handler is synchronous, 162 | you can modify the crawler request options (including headers and request method.) 163 | * `fetchheaders` ( queueItem , responseObject ) 164 | Fired when the headers for a resource are received from the server. The node http 165 | response object is returned for your perusal. 166 | * `fetchcomplete` ( queueItem , responseBuffer , response ) 167 | Fired when the resource is completely downloaded. The entire file data is provided 168 | as a buffer, as well as the response object. 169 | * `fetchdataerror` ( queueItem, response ) 170 | Fired when a resource can't be downloaded, because it exceeds the maximum size 171 | we're prepared to receive (16MB by default.) 172 | * `fetchredirect` ( queueItem, parsedURL, response ) 173 | Fired when a redirect header is encountered. The new URL is validated and returned 174 | as a complete canonical link to the new resource. 175 | * `fetch404` ( queueItem, response ) 176 | Fired when a 404 HTTP status code is returned for a request. 177 | * `fetcherror` ( queueItem, response ) 178 | Fired when an alternate 400 or 500 series HTTP status code is returned for a 179 | request. 180 | * `fetchtimeout` ( queueItem, crawlerTimeoutValue ) 181 | Fired when a request time exceeds the internal crawler threshold. 182 | * `fetchclienterror` ( queueItem, errorData ) 183 | Fired when a request dies locally for some reason. The error data is returned as 184 | the second parameter. 185 | * `discoverycomplete` ( queueItem, resources ) 186 | Fired when linked resources have been discovered. Passes an array of resources 187 | (as URLs) as the second parameter. 188 | * `complete` 189 | Fired when the crawler completes processing all the items in its queue, and does 190 | not find any more to add. This event returns no arguments. 191 | 192 | #### A note about HTTP error conditions 193 | By default, simplecrawler does not download the response body when it encounters 194 | an HTTP error status in the response. If you need this information, you can listen 195 | to simplecrawler's error events, and through node's native `data` event 196 | (`response.on("data",function(chunk) {...})`) you can save the information yourself. 197 | 198 | If this is annoying, and you'd really like to retain error pages by default, let 199 | me know. I didn't include it because I didn't need it - but if it's important to 200 | people I might put it back in. :) 201 | 202 | #### Waiting for Asynchronous Event Listeners 203 | 204 | Sometimes, you might want to wait for simplecrawler to wait for you while you 205 | perform sone asynchronous tasks in an event listener, instead of having it 206 | racing off and firing the `complete` event, halting your crawl. For example, 207 | if you're doing your own link discovery using an asynchronous library method. 208 | 209 | Simplecrawler provides a `wait` method you can call at any time. It is available 210 | via `this` from inside listeners, and on the crawler object itself. It returns 211 | a callback function. 212 | 213 | Once you've called this method, simplecrawler will not fire the `complete` event 214 | until either you execute the callback it returns, or a timeout is reached 215 | (configured in `crawler.listenerTTL`, by default 10000 msec.) 216 | 217 | ##### Example Asynchronous Event Listener 218 | 219 | ```javascript 220 | crawler.on("fetchcomplete",function(queueItem,data,res) { 221 | var continue = this.wait(); 222 | doSomeDiscovery(data,function(foundURLs){ 223 | foundURLs.forEach(crawler.queueURL.bind(crawler)); 224 | continue(); 225 | }); 226 | }); 227 | ``` 228 | 229 | ### Configuring the crawler 230 | 231 | Here's a complete list of what you can stuff with at this stage: 232 | 233 | * `crawler.host` - 234 | The domain to scan. By default, simplecrawler will restrict all requests to 235 | this domain. 236 | * `crawler.initialPath` - 237 | The initial path with which the crawler will formulate its first request. 238 | Does not restrict subsequent requests. 239 | * `crawler.initialPort` - 240 | The initial port with which the crawler will formulate its first request. 241 | Does not restrict subsequent requests. 242 | * `crawler.initialProtocol` - 243 | The initial protocol with which the crawler will formulate its first request. 244 | Does not restrict subsequent requests. 245 | * `crawler.interval` - 246 | The interval with which the crawler will spool up new requests (one per 247 | tick.) Defaults to 250ms. 248 | * `crawler.maxConcurrency` - 249 | The maximum number of requests the crawler will run simultaneously. Defaults 250 | to 5 - the default number of http agents node will run. 251 | * `crawler.timeout` - 252 | The maximum time in milliseconds the crawler will wait for headers before 253 | aborting the request. 254 | * `crawler.listenerTTL` - 255 | The maximum time in milliseconds the crawler will wait for async listeners. 256 | * `crawler.userAgent` - 257 | The user agent the crawler will report. Defaults to 258 | `Node/SimpleCrawler (http://www.github.com/cgiffard/node-simplecrawler)`. 259 | * `crawler.queue` - 260 | The queue in use by the crawler (Must implement the `FetchQueue` interface) 261 | * `crawler.filterByDomain` - 262 | Specifies whether the crawler will restrict queued requests to a given 263 | domain/domains. 264 | * `crawler.scanSubdomains` - 265 | Enables scanning subdomains (other than www) as well as the specified domain. 266 | Defaults to false. 267 | * `crawler.ignoreWWWDomain` - 268 | Treats the `www` domain the same as the originally specified domain. 269 | Defaults to true. 270 | * `crawler.stripWWWDomain` - 271 | Or go even further and strip WWW subdomain from requests altogether! 272 | * `crawler.stripQuerystring` - 273 | Specify to strip querystring parameters from URLs. Defaults to false. 274 | * `crawler.discoverResources` - 275 | Use simplecrawler's internal resource discovery function. Defaults to true. 276 | (switch it off if you'd prefer to discover and queue resources yourself!) 277 | * `crawler.discoverRegex` - 278 | Array of regex objects that simplecrawler uses to discover resources. 279 | * `crawler.cache` - 280 | Specify a cache architecture to use when crawling. Must implement 281 | `SimpleCache` interface. 282 | * `crawler.useProxy` - 283 | The crawler should use an HTTP proxy to make its requests. 284 | * `crawler.proxyHostname` - 285 | The hostname of the proxy to use for requests. 286 | * `crawler.proxyPort` - 287 | The port of the proxy to use for requests. 288 | * `crawler.proxyUser` - 289 | The username for HTTP/Basic proxy authentication (leave unset for unauthenticated proxies.) 290 | * `crawler.proxyPass` - 291 | The password for HTTP/Basic proxy authentication (leave unset for unauthenticated proxies.) 292 | * `crawler.domainWhitelist` - 293 | An array of domains the crawler is permitted to crawl from. If other settings 294 | are more permissive, they will override this setting. 295 | * `crawler.supportedMimeTypes` - 296 | An array of RegEx objects used to determine supported MIME types (types of 297 | data simplecrawler will scan for links.) If you're not using simplecrawler's 298 | resource discovery function, this won't have any effect. 299 | * `crawler.allowedProtocols` - 300 | An array of RegEx objects used to determine whether a URL protocol is supported. 301 | This is to deal with nonstandard protocol handlers that regular HTTP is 302 | sometimes given, like `feed:`. It does not provide support for non-http 303 | protocols (and why would it!?) 304 | * `crawler.maxResourceSize` - 305 | The maximum resource size, in bytes, which will be downloaded. Defaults to 16MB. 306 | * `crawler.downloadUnsupported` - 307 | Simplecrawler will download files it can't parse. Defaults to true, but if 308 | you'd rather save the RAM and GC lag, switch it off. 309 | * `crawler.needsAuth` - 310 | Flag to specify if the domain you are hitting requires basic authentication 311 | * `crawler.authUser` - 312 | Username provided for needsAuth flag 313 | * `crawler.authPass` - 314 | Password provided for needsAuth flag 315 | * `crawler.customHeaders` - 316 | An object specifying a number of custom headers simplecrawler will add to 317 | every request. These override the default headers simplecrawler sets, so 318 | be careful with them. If you want to tamper with headers on a per-request basis, 319 | see the `fetchqueue` event. 320 | * `crawler.acceptCookies` - 321 | Flag to indicate if the crawler should hold on to cookies 322 | * `crawler.urlEncoding` - 323 | Set this to `iso8859` to trigger URIjs' re-encoding of iso8859 URLs to unicode. 324 | Defaults to `unicode`. 325 | * `crawler.parseHTMLComments` - 326 | Whether to scan for URLs inside HTML comments. 327 | Defaults to `true`. 328 | * `crawler.parseScriptTags` - 329 | Whether to scan for URLs inside script tags. 330 | Defaults to `true`. 331 | * `myCrawler.maxDepth` - 332 | Defines a maximum distance from the original request at which resources will 333 | be downloaded. Asset files are excluded from this distance condition. 334 | Defaults to `0` — no max depth. 335 | 336 | #### Excluding certain resources from downloading 337 | 338 | Simplecrawler has a mechanism you can use to prevent certain resources from being 339 | fetched, based on the URL, called *Fetch Conditions**. A fetch condition is just 340 | a function, which, when given a parsed URL object, will return a true or a false 341 | value, indicating whether a given resource should be downloaded. 342 | 343 | You may add as many fetch conditions as you like, and remove them at runtime. 344 | Simplecrawler will evaluate every single condition against every queued URL, and 345 | should just one of them return a falsy value (this includes null and undefined, 346 | so remember to always return a value!) then the resource in question will not be 347 | fetched. 348 | 349 | ##### Adding a fetch condition 350 | 351 | This example fetch condition prevents URLs ending in `.pdf` from downloading. 352 | Adding a fetch condition assigns it an ID, which the `addFetchCondition` function 353 | returns. You can use this ID to remove the condition later. 354 | 355 | ```javascript 356 | var conditionID = myCrawler.addFetchCondition(function(parsedURL) { 357 | return !parsedURL.path.match(/\.pdf$/i); 358 | }); 359 | ``` 360 | 361 | NOTE: simplecrawler uses slightly different terminology to URIjs. `parsedURL.path` 362 | includes the query string too. If you want the path without the query string, 363 | use `parsedURL.uriPath`. 364 | 365 | ##### Removing a fetch condition 366 | 367 | If you stored the ID of the fetch condition you added earlier, you can remove it 368 | from the crawler: 369 | 370 | ```javascript 371 | myCrawler.removeFetchCondition(conditionID); 372 | ``` 373 | 374 | ### The Simplecrawler Queue 375 | 376 | Simplecrawler has a queue like any other web crawler. It can be directly accessed 377 | at `crawler.queue` (assuming you called your Crawler() object `crawler`.) It 378 | provides array access, so you can get to queue items just with array notation 379 | and an index. 380 | 381 | ```javascript 382 | crawler.queue[5]; 383 | ``` 384 | 385 | For compatibility with different backing stores, it now provides an alternate 386 | interface which the crawler core makes use of: 387 | 388 | ```javascript 389 | crawler.queue.get(5); 390 | ``` 391 | 392 | It's not just an array though. 393 | 394 | #### Adding to the queue 395 | 396 | The simplest way to add to the queue is to use the crawler's own method, 397 | `crawler.queueURL`. This method takes a complete URL, validates and deconstructs 398 | it, and adds it to the queue. 399 | 400 | If you instead want to add a resource by its components, you may call the 401 | `queue.add` method directly: 402 | 403 | ```javascript 404 | crawler.queue.add(protocol,hostname,port,path); 405 | ``` 406 | 407 | That's it! It's basically just a URL, but comma separated (that's how you can 408 | remember the order.) 409 | 410 | #### Queue items 411 | 412 | Because when working with simplecrawler, you'll constantly be handed queue items, 413 | it helps to know what's inside them. These are the properties every queue item 414 | is expected to have: 415 | 416 | * `url` - The complete, canonical URL of the resource. 417 | * `protocol` - The protocol of the resource (http, https) 418 | * `host` - The full domain/hostname of the resource 419 | * `port` - The port of the resource 420 | * `path` - The bit of the URL after the domain - includes the querystring. 421 | * `fetched` - Has the request for this item been completed? You can monitor this as requests are processed. 422 | * `status` - The internal status of the item, always a string. This can be one of: 423 | * `queued` - The resource is in the queue to be fetched, but nothing's happened to it yet. 424 | * `spooled` - A request has been made to the remote server, but we're still waiting for a response. 425 | * `headers` - The headers for the resource have been received. 426 | * `downloaded` - The item has been entirely downloaded. 427 | * `redirected` - The resource request returned a 300 series response, with a Location header and a new URL. 428 | * `notfound` - The resource could not be found. (404) 429 | * `failed` - An error occurred when attempting to fetch the resource. 430 | * `stateData` - An object containing state data and other information about the request: 431 | * `requestLatency` - The time taken for headers to be received after the request was made. 432 | * `requestTime` - The total time taken for the request (including download time.) 433 | * `downloadTime` - The total time taken for the resource to be downloaded. 434 | * `contentLength` - The length (in bytes) of the returned content. Calculated based on the `content-length` header. 435 | * `contentType` - The MIME type of the content. 436 | * `code` - The HTTP status code returned for the request. 437 | * `headers` - An object containing the header information returned by the server. This is the object node returns as part of the `response` object. 438 | * `actualDataSize` - The length (in bytes) of the returned content. Calculated based on what is actually received, not the `content-length` header. 439 | * `sentIncorrectSize` - True if the data length returned by the server did not match what we were told to expect by the `content-length` header. 440 | 441 | You can address these properties like you would any other object: 442 | 443 | ```javascript 444 | crawler.queue[52].url; 445 | queueItem.stateData.contentLength; 446 | queueItem.status === "queued"; 447 | ``` 448 | 449 | As you can see, you can get a lot of meta-information out about each request. The 450 | upside is, the queue actually has some convenient functions for getting simple 451 | aggregate data about the queue... 452 | 453 | #### Queue Statistics and Reporting 454 | 455 | First of all, the queue can provide some basic statistics about the network 456 | performance of your crawl (so far.) This is done live, so don't check it thirty 457 | times a second. You can test the following properties: 458 | 459 | * `requestTime` 460 | * `requestLatency` 461 | * `downloadTime` 462 | * `contentLength` 463 | * `actualDataSize` 464 | 465 | And you can get the maximum, minimum, and average values for each with the 466 | `crawler.queue.max`, `crawler.queue.min`, and `crawler.queue.avg` functions 467 | respectively. Like so: 468 | 469 | ```javascript 470 | console.log("The maximum request latency was %dms.",crawler.queue.max("requestLatency")); 471 | console.log("The minimum download time was %dms.",crawler.queue.min("downloadTime")); 472 | console.log("The average resource size received is %d bytes.",crawler.queue.avg("actualDataSize")); 473 | ``` 474 | 475 | You'll probably often need to determine how many items in the queue have a given 476 | status at any one time, and/or retreive them. That's easy with 477 | `crawler.queue.countWithStatus` and `crawler.queue.getWithStatus`. 478 | 479 | `crawler.queue.countWithStatus` returns the number of queued items with a given 480 | status, while `crawler.queue.getWithStatus` returns an array of the queue items 481 | themselves. 482 | 483 | ```javascript 484 | var redirectCount = crawler.queue.countWithStatus("redirected"); 485 | 486 | crawler.queue.getWithStatus("failed").forEach(function(queueItem) { 487 | console.log("Whoah, the request for %s failed!",queueItem.url); 488 | 489 | // do something... 490 | }); 491 | ``` 492 | 493 | Then there's some even simpler convenience functions: 494 | 495 | * `crawler.queue.complete` - returns the number of queue items which have been 496 | completed (marked as fetched) 497 | * `crawler.queue.errors` - returns the number of requests which have failed 498 | (404s and other 400/500 errors, as well as client errors) 499 | 500 | #### Saving and reloading the queue (freeze/defrost) 501 | 502 | You'll probably want to be able to save your progress and reload it later, if 503 | your application fails or you need to abort the crawl for some reason. (Perhaps 504 | you just want to finish off for the night and pick it up tomorrow!) The 505 | `crawler.queue.freeze` and `crawler.queue.defrost` functions perform this task. 506 | 507 | **A word of warning though** - they are not CPU friendly as they rely on 508 | JSON.parse and JSON.stringify. Use them only when you need to save the queue - 509 | don't call them every request or your application's performance will be incredibly 510 | poor - they block like *crazy*. That said, using them when your crawler commences 511 | and stops is perfectly reasonable. 512 | 513 | Note that the methods themselves are asynchronous, so if you are going to exit the 514 | process after you do the freezing, make sure you wait for callback - otherwise 515 | you'll get an empty file. 516 | 517 | ```javascript 518 | // Freeze queue 519 | crawler.queue.freeze("mysavedqueue.json", function() { 520 | process.exit(); 521 | }); 522 | 523 | // Defrost queue 524 | crawler.queue.defrost("mysavedqueue.json"); 525 | ``` 526 | 527 | ## Cookies 528 | 529 | Simplecrawler now has an internal cookie jar, which collects and resends cookies 530 | automatically, and by default. 531 | 532 | If you want to turn this off, set the `crawler.acceptCookies` option to `false`. 533 | 534 | The cookie jar is accessible via `crawler.cookies`, and is an event emitter itself: 535 | 536 | ### Cookie Events 537 | 538 | * `addcookie` ( cookie ) 539 | Fired when a new cookie is added to the jar. 540 | * `removecookie` ( cookie array ) 541 | Fired when one or more cookies are removed from the jar. 542 | 543 | ## Building and Testing 544 | 545 | #### Build Status: 546 | 547 | * Master: [![Build Status](https://travis-ci.org/cgiffard/node-simplecrawler.png?branch=master)](https://travis-ci.org/cgiffard/node-simplecrawler) 548 | * Development: [![Build Status](https://travis-ci.org/cgiffard/node-simplecrawler.png?branch=development)](https://travis-ci.org/cgiffard/node-simplecrawler) 549 | 550 | ## Contributors 551 | 552 | I'd like to extend sincere thanks to: 553 | 554 | * [Nick Crohn](https://github.com/ncrohn) for the HTTP Basic auth support, and 555 | initial cookie support. 556 | * [Mike Moulton](https://github.com/mmoulton) for 557 | [fixing a bug in the URL discovery mechanism] 558 | (https://github.com/cgiffard/node-simplecrawler/pull/3), as well as 559 | [adding the `discoverycomplete` event] 560 | (https://github.com/cgiffard/node-simplecrawler/pull/10), 561 | * [Mike Iannacone](https://github.com/mikeiannacone) for correcting a keyword 562 | naming collision with node 0.8's EventEmitter. 563 | * [Greg Molnar](https://github.com/gregmolnar) for 564 | [adding a querystring-free path parameter to parsed URL objects.] 565 | (https://github.com/cgiffard/node-simplecrawler/pull/31) 566 | * [Breck Yunits](https://github.com/breck7) for contributing a useful code 567 | sample demonstrating using simplecrawler for caching a website to disk! 568 | * [Luke Plaster](https://github.com/notatestuser) for enabling protocol-agnostic 569 | link discovery 570 | * [Zeus](https://github.com/distracteddev) for fixing a bug where [default port 571 | info was wrongly specified in requests] 572 | (https://github.com/cgiffard/node-simplecrawler/pull/40) 573 | and for fixing the missing request timeout handling! 574 | * [Graham Hutchinson](https://github.com/ghhutch) for adding 575 | querystring-stripping option 576 | * [Jellyfrog](https://github.com/jellyfrog) for assisting in diagnosing some 577 | nasty EventEmitter issues. 578 | * [Brian Moeskau](https://github.com/bmoeskau) for helping to fix the confusing 579 | 'async' events API, and providing invaluable feedback. 580 | 581 | And everybody else who has helped out in some way! :) 582 | 583 | ## Licence 584 | 585 | Copyright (c) 2013, Christopher Giffard. 586 | 587 | All rights reserved. 588 | 589 | Redistribution and use in source and binary forms, with or without modification, 590 | are permitted provided that the following conditions are met: 591 | 592 | * Redistributions of source code must retain the above copyright notice, this 593 | list of conditions and the following disclaimer. 594 | * Redistributions in binary form must reproduce the above copyright notice, this 595 | list of conditions and the following disclaimer in the documentation and/or 596 | other materials provided with the distribution. 597 | 598 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 599 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 600 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 601 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 602 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 603 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 604 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 605 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 606 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 607 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 608 | -------------------------------------------------------------------------------- /lib/crawler.js: -------------------------------------------------------------------------------- 1 | // Simplecrawler 2 | // Christopher Giffard, 2011 - 2013+ 3 | // 4 | // http://www.github.com/cgiffard/node-simplecrawler 5 | 6 | // Queue Dependency 7 | var FetchQueue = require("./queue.js"), 8 | Cache = require("./cache.js"), 9 | CookieJar = require("./cookies.js"), 10 | MetaInfo = require("../package.json"); 11 | 12 | var http = require("http"), 13 | https = require("https"), 14 | EventEmitter = require('events').EventEmitter, 15 | URI = require("URIjs"), 16 | zlib = require("zlib"), 17 | util = require("util"); 18 | 19 | var QUEUE_ITEM_INITIAL_DEPTH = 1; 20 | 21 | /* 22 | Public: Constructor for the crawler. 23 | 24 | host - Initial hostname/domain to begin crawling from. By 25 | default, the crawl will be locked to this hostname. 26 | initialPath - Initial path to begin crawling from. 27 | initialPort - Port to begin crawling from. 28 | interval - Request interval for the crawler. Defaults to 250ms. 29 | 30 | Examples 31 | 32 | var crawler = new Crawler("example.com","/",80,500); 33 | 34 | var crawler = new Crawler("example.com"); 35 | 36 | Returns the crawler object which has now been constructed. 37 | 38 | */ 39 | var Crawler = function(host,initialPath,initialPort,interval) { 40 | var crawler = this; 41 | 42 | // Data integrity checks 43 | if (initialPort && isNaN(initialPort)) 44 | throw new Error("Port must be a number!"); 45 | 46 | // SETTINGS TO STUFF WITH 47 | // (not here! Do it when you create a `new Crawler()`) 48 | 49 | // Domain to crawl 50 | crawler.host = host || ""; 51 | 52 | // Gotta start crawling *somewhere* 53 | crawler.initialPath = initialPath || "/"; 54 | crawler.initialPort = initialPort || 80; 55 | crawler.initialProtocol = "http"; 56 | 57 | // Internal 'tick' interval for spawning new requests 58 | // (as long as concurrency is under cap) 59 | // One request will be spooled per tick, up to the concurrency threshold. 60 | crawler.interval = interval || 250; 61 | 62 | // Maximum request concurrency. Be sensible. Five ties in with node's 63 | // default maxSockets value. 64 | crawler.maxConcurrency = 5; 65 | 66 | // Maximum time we'll wait for headers 67 | crawler.timeout = 5 * 60 * 1000; 68 | 69 | // Maximum time we'll wait for async listeners. 70 | crawler.listenerTTL = 10 * 1000; 71 | 72 | // User Agent 73 | crawler.userAgent = 74 | "Node/" + MetaInfo.name + " " + MetaInfo.version + 75 | " (" + MetaInfo.repository.url + ")"; 76 | 77 | // Queue for requests - FetchQueue gives us stats and other sugar 78 | // (but it's basically just an array) 79 | crawler.queue = new FetchQueue(); 80 | 81 | // Do we filter by domain? 82 | // Unless you want to be crawling the entire internet, I would 83 | // recommend leaving this on! 84 | crawler.filterByDomain = true; 85 | 86 | // Do we scan subdomains? 87 | crawler.scanSubdomains = false; 88 | 89 | // Treat WWW subdomain the same as the main domain (and don't count 90 | // it as a separate subdomain) 91 | crawler.ignoreWWWDomain = true; 92 | 93 | // Or go even further and strip WWW subdomain from domains altogether! 94 | crawler.stripWWWDomain = false; 95 | 96 | // Internal cachestore 97 | crawler.cache = null; 98 | 99 | // Use an HTTP Proxy? 100 | crawler.useProxy = false; 101 | crawler.proxyHostname = "127.0.0.1"; 102 | crawler.proxyPort = 8123; 103 | crawler.proxyUser = null; 104 | crawler.proxyPass = null; 105 | 106 | // Support for HTTP basic auth 107 | crawler.needsAuth = false; 108 | crawler.authUser = ""; 109 | crawler.authPass = ""; 110 | 111 | // Support for retaining cookies for parse duration 112 | crawler.acceptCookies = true; 113 | crawler.cookies = new CookieJar(); 114 | 115 | // Support for custom headers... 116 | crawler.customHeaders = {}; 117 | 118 | // Domain Whitelist 119 | // We allow domains to be whitelisted, so cross-domain requests can be made. 120 | crawler.domainWhitelist = []; 121 | 122 | // Supported Protocols 123 | crawler.allowedProtocols = [ 124 | /^http(s)?$/i, // HTTP & HTTPS 125 | /^(rss|atom|feed)(\+xml)?$/i // RSS / XML 126 | ]; 127 | 128 | // Max file size to download/store 129 | crawler.maxResourceSize = 1024 * 1024 * 16; // 16mb 130 | 131 | // Supported MIME-types 132 | // Matching MIME-types will be scanned for links 133 | crawler.supportedMimeTypes = [ 134 | /^text\//i, 135 | /^application\/(rss|html|xhtml)?[\+\/\-]?xml/i, 136 | /^application\/javascript/i, 137 | /^xml/i 138 | ]; 139 | 140 | // Download linked, but unsupported files (binary - images, documents, etc) 141 | crawler.downloadUnsupported = true; 142 | 143 | // URL Encoding setting... 144 | crawler.urlEncoding = "unicode"; 145 | 146 | // Strip Querystring Parameters from URL 147 | crawler.stripQuerystring = false; 148 | 149 | // Regular expressions for finding URL items in HTML and text 150 | crawler.discoverRegex = [ 151 | /(\shref\s?=\s?|\ssrc\s?=\s?|url\()([^\"\'\s>\)]+)/ig, 152 | /(\shref\s?=\s?|\ssrc\s?=\s?|url\()['"]([^"']+)/ig, 153 | /http(s)?\:\/\/[^?\s><\'\"]+/ig, 154 | /url\([^\)]+/ig, 155 | 156 | // This might be a bit of a gamble... but get hard-coded 157 | // strings out of javacript: URLs. They're often popup-image 158 | // or preview windows, which would otherwise be unavailable to us. 159 | // Worst case scenario is we make some junky requests. 160 | /^javascript\:[a-z0-9\$\_\.]+\(['"][^'"\s]+/ig 161 | ]; 162 | 163 | // Whether to parse inside HTML comments 164 | crawler.parseHTMLComments = true; 165 | 166 | // Whether to parse inside script tags 167 | crawler.parseScriptTags = true; 168 | 169 | // Max depth parameter 170 | crawler.maxDepth = 0; 171 | 172 | // STATE (AND OTHER) VARIABLES NOT TO STUFF WITH 173 | var hiddenProps = { 174 | "_openRequests": 0, 175 | "_fetchConditions": [], 176 | "_openListeners": 0 177 | }; 178 | 179 | // Run the EventEmitter constructor 180 | EventEmitter.call(crawler); 181 | 182 | // Apply all the hidden props 183 | Object.keys(hiddenProps).forEach(function(key) { 184 | Object.defineProperty(crawler, key, { 185 | "writable": true, 186 | "enumerable": false, 187 | "value": hiddenProps[key] 188 | }); 189 | }); 190 | }; 191 | 192 | util.inherits(Crawler,EventEmitter); 193 | 194 | /* 195 | Public: Starts or resumes the crawl. If the queue is empty, it adds a new 196 | queue item from which to begin crawling based on the initial configuration 197 | of the crawler itself. The crawler waits for process.nextTick to begin, so 198 | handlers and other properties can be altered or addressed before the crawl 199 | commences. 200 | 201 | Examples 202 | 203 | crawler.start(); 204 | 205 | Returns the crawler object, to enable chaining. 206 | 207 | */ 208 | Crawler.prototype.start = function() { 209 | var crawler = this; 210 | 211 | // only if we haven't already got stuff in our queue... 212 | crawler.queue.getLength(function(err, length) { 213 | if (err) throw err; 214 | 215 | if (!length) { 216 | 217 | // Initialise our queue by pushing the initial request data into it... 218 | crawler.queue.add( 219 | crawler.initialProtocol, 220 | crawler.host, 221 | crawler.initialPort, 222 | crawler.initialPath, 223 | QUEUE_ITEM_INITIAL_DEPTH, 224 | function(error) { 225 | if (error) throw error; 226 | }); 227 | } 228 | 229 | crawler.crawlIntervalID = 230 | setInterval( 231 | function() { 232 | crawler.crawl.call(crawler); 233 | }, 234 | crawler.interval); 235 | 236 | crawler.emit("crawlstart"); 237 | crawler.running = true; 238 | 239 | // Now kick off the initial crawl 240 | process.nextTick(function() { 241 | crawler.crawl(); 242 | }); 243 | }); 244 | 245 | return crawler; 246 | }; 247 | 248 | /* 249 | Public: Determines whether the protocol is supported, given a URL. 250 | 251 | URL - URL with a protocol, for testing. 252 | 253 | Examples 254 | 255 | crawler.protocolSupported("http://google.com/") // true, by default 256 | crawler.protocolSupported("wss://google.com/") // false, by default 257 | 258 | Returns a boolean, true if the protocol is supported - false if not. 259 | 260 | */ 261 | Crawler.prototype.protocolSupported = function(URL) { 262 | var protocol, crawler = this; 263 | 264 | try { 265 | protocol = URI(URL).protocol(); 266 | 267 | // Unspecified protocol. Assume http 268 | if (!protocol) 269 | protocol = "http"; 270 | 271 | } catch(e) { 272 | // If URIjs died, we definitely /do not/ support the protocol. 273 | return false; 274 | } 275 | 276 | return crawler.allowedProtocols.reduce(function(prev,protocolCheck) { 277 | return prev || !!protocolCheck.exec(protocol); 278 | },false); 279 | }; 280 | 281 | /* 282 | Public: Determines whether the mimetype is supported, given a mimetype 283 | 284 | MIMEType - String containing MIME type to test 285 | 286 | Examples 287 | 288 | crawler.mimeTypeSupported("text/html") // true, by default 289 | crawler.mimeTypeSupported("application/octet-stream") // false, by default 290 | 291 | Returns a boolean, true if the MIME type is supported - false if not. 292 | 293 | */ 294 | Crawler.prototype.mimeTypeSupported = function(MIMEType) { 295 | var crawler = this; 296 | 297 | return ( 298 | crawler.supportedMimeTypes.reduce(function(prev,mimeCheck) { 299 | return prev || !!mimeCheck.exec(MIMEType); 300 | },false) 301 | ); 302 | }; 303 | 304 | /* 305 | Public: Determines whether the queueItem can be fetched from its depth 306 | 307 | In fact, the queueItem need to be fetched before calling this (because we need its MIMEType). 308 | This will just determine if we need to send an event for this item & if we need to fetch linked 309 | resources. 310 | 311 | If the queue item is a CSS or JS file, it will always be fetched (we need all images in CSS files, 312 | even if max depth is already reached). If it's an HTML page, we will check if max depth is reached 313 | or not. 314 | 315 | queueItem - Queue item object to check 316 | 317 | Returns a boolean, true if the queue item can be fetched - false if not. 318 | 319 | */ 320 | Crawler.prototype.depthAllowed = function(queueItem) { 321 | var crawler = this; 322 | 323 | // Items matching this pattern will always be fetched, even if max depth is reached 324 | var mimeTypesWhitelist = [ 325 | /^text\/(css|javascript|ecmascript)/i, 326 | /^application\/javascript/i, 327 | /^application\/x-font/i, 328 | /^application\/font/i, 329 | /^image\//i, 330 | /^font\//i 331 | ]; 332 | 333 | return ( 334 | crawler.maxDepth === 0 || 335 | queueItem.depth <= crawler.maxDepth || 336 | mimeTypesWhitelist.reduce(function(prev,mimeCheck) { 337 | return prev || !!mimeCheck.exec(queueItem.stateData.contentType); 338 | }, false) 339 | ); 340 | }; 341 | 342 | /* 343 | Public: Extracts protocol, host, port and resource (path) given a URL string. 344 | 345 | URL - String containing URL to process 346 | 347 | Examples 348 | 349 | var URLInfo = crawler.processURL("http://www.google.com/fish"); 350 | 351 | Returns an object containing keys and values for "protocol", "host", "port", 352 | and "path". 353 | 354 | */ 355 | Crawler.prototype.processURL = function(URL,context) { 356 | var newURL, crawler = this; 357 | 358 | if (!context || typeof(context) !== "object") 359 | context = { 360 | url: ( 361 | crawler.initialProtocol + "://" + 362 | crawler.host + ":" + 363 | crawler.initialPort + "/" 364 | ), 365 | depth: QUEUE_ITEM_INITIAL_DEPTH 366 | }; 367 | 368 | // If the URL didn't contain anything, don't fetch it. 369 | if (!URL.replace(/\s+/ig,"").length) return false; 370 | 371 | // Check if querystring should be ignored 372 | if (crawler.stripQuerystring === true) 373 | URL = crawler.removeQuerystring(URL); 374 | 375 | try { 376 | newURL = 377 | URI(URL) 378 | .absoluteTo(context.url) 379 | .normalize(); 380 | 381 | if (crawler.urlEncoding === "iso8859") { 382 | newURL = newURL.iso8859(); 383 | } 384 | 385 | } catch(e) { 386 | // Couldn't process the URL, since URIjs choked on it. 387 | return false; 388 | } 389 | 390 | // simplecrawler uses slightly different terminology to URIjs. Sorry! 391 | return { 392 | "protocol": newURL.protocol() || "http", 393 | "host": newURL.hostname(), 394 | "port": newURL.port() || 80, 395 | "path": newURL.resource(), 396 | "uriPath": newURL.path(), 397 | "depth": context.depth + 1 398 | }; 399 | }; 400 | 401 | /* 402 | Public: Discovers linked resources in an HTML, XML or text document. 403 | 404 | resourceData - String containing document with linked resources. 405 | queueItem - Queue item corresponding to document being searched. 406 | 407 | Examples 408 | 409 | crawler.discoverResources("http://www.google.com") 410 | crawler.discoverResources("test") 411 | 412 | Returns an array of the (string) resource URLs found in the document. If none 413 | were found, the array will be empty. 414 | 415 | */ 416 | Crawler.prototype.discoverResources = function(resourceData,queueItem) { 417 | // Convert to UTF-8 418 | // TODO: account for text-encoding. 419 | var resources = [], 420 | resourceText = resourceData.toString("utf8"), 421 | crawler = this; 422 | 423 | if (!queueItem) 424 | queueItem = {}; 425 | 426 | if (!queueItem.protocol) 427 | queueItem.protocol = "http"; 428 | 429 | if (!crawler.parseHTMLComments) { 430 | resourceText = resourceText.replace(//g, ""); 431 | } 432 | 433 | if (!crawler.parseScriptTags) { 434 | resourceText = resourceText.replace(/([\s\S]+?)<\/script>/gi, ""); 435 | } 436 | 437 | function cleanURL(URL) { 438 | return URL 439 | .replace(/^(\s?href|\s?src)=['"]?/i,"") 440 | .replace(/^\s*/,"") 441 | .replace(/^url\(['"]*/i,"") 442 | .replace(/^javascript\:[a-z0-9]+\(['"]/i,"") 443 | .replace(/["'\)]$/i,"") 444 | .replace(/^\/\//, queueItem.protocol + "://") 445 | .replace(/\&/gi,"&") 446 | .split("#") 447 | .shift(); 448 | } 449 | 450 | // Clean links 451 | function cleanAndQueue(urlMatch) { 452 | if (!urlMatch) return []; 453 | 454 | return urlMatch 455 | .map(cleanURL) 456 | .reduce(function(list,URL) { 457 | 458 | // Ensure URL is whole and complete 459 | try { 460 | URL = URI(URL) 461 | .absoluteTo(queueItem.url) 462 | .normalize() 463 | .toString(); 464 | } catch(e) { 465 | 466 | // But if URI.js couldn't parse it - nobody can! 467 | return list; 468 | } 469 | 470 | // If we hit an empty item, don't add return it 471 | if (!URL.length) return list; 472 | 473 | // If we don't support the protocol in question 474 | if (!crawler.protocolSupported(URL)) return list; 475 | 476 | // Does the item already exist in the list? 477 | if (resources.reduce(function(prev,current) { 478 | return prev || current === URL; 479 | },false)) 480 | return list; 481 | 482 | return list.concat(URL); 483 | },[]); 484 | } 485 | 486 | // Rough scan for URLs 487 | return crawler.discoverRegex 488 | .reduce(function(list,regex) { 489 | return list.concat( 490 | cleanAndQueue( 491 | resourceText.match(regex))); 492 | },[]) 493 | .reduce(function(list,check) { 494 | if (list.indexOf(check) < 0) 495 | return list.concat([check]); 496 | 497 | return list; 498 | },[]); 499 | }; 500 | 501 | /* 502 | Public: Determines based on crawler state whether a domain is valid for 503 | crawling. 504 | 505 | host - String containing the hostname of the resource to be fetched. 506 | 507 | Examples 508 | 509 | crawler.domainValid("127.0.0.1"); 510 | crawler.domainValid("google.com"); 511 | crawler.domainValid("test.example.com"); 512 | 513 | Returns an true if the domain is valid for crawling, false if not. 514 | 515 | */ 516 | Crawler.prototype.domainValid = function(host) { 517 | var crawler = this, 518 | crawlerHost = crawler.host; 519 | 520 | // If we're ignoring the WWW domain, remove the WWW for comparisons... 521 | if (crawler.ignoreWWWDomain) 522 | host = host.replace(/^www\./i,""); 523 | 524 | function domainInWhitelist(host) { 525 | 526 | // If there's no whitelist, or the whitelist is of zero length, 527 | // just return false. 528 | if (!crawler.domainWhitelist || 529 | !crawler.domainWhitelist.length) return false; 530 | 531 | // Otherwise, scan through it. 532 | return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) { 533 | 534 | // If we already located the relevant domain in the whitelist... 535 | if (prev) return prev; 536 | 537 | // If the domain is just equal, return true. 538 | if (host === cur) return true; 539 | 540 | // If we're ignoring WWW subdomains, and both domains, 541 | // less www. are the same, return true. 542 | if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i,"")) 543 | return true; 544 | 545 | // Otherwise, sorry. No dice. 546 | return false; 547 | },false); 548 | } 549 | 550 | // Checks if the first domain is a subdomain of the second 551 | function isSubdomainOf(subdomain,host) { 552 | 553 | // Comparisons must be case-insensitive 554 | subdomain = subdomain.toLowerCase(); 555 | host = host.toLowerCase(); 556 | 557 | // If we're ignoring www, remove it from both 558 | // (if www is the first domain component...) 559 | if (crawler.ignoreWWWDomain) { 560 | subdomain.replace(/^www./ig,""); 561 | host.replace(/^www./ig,""); 562 | } 563 | 564 | // They should be the same flipped around! 565 | return ( 566 | subdomain.split("").reverse().join("").substr(0,host.length) === 567 | host.split("").reverse().join("")); 568 | } 569 | 570 | // If we're not filtering by domain, just return true. 571 | return (!crawler.filterByDomain || 572 | // Or if the domain is just the right one, return true. 573 | (host === crawler.host) || 574 | // Or if we're ignoring WWW subdomains, and both domains, 575 | // less www. are the same, return true. 576 | ( 577 | crawler.ignoreWWWDomain && 578 | crawler.host.replace(/^www\./i,"") === 579 | host.replace(/^www\./i,"") 580 | ) || 581 | // Or if the domain in question exists in the domain whitelist, 582 | // return true. 583 | domainInWhitelist(host) || 584 | // Or if we're scanning subdomains, and this domain is a subdomain 585 | // of the crawler's set domain, return true. 586 | (crawler.scanSubdomains && isSubdomainOf(host,crawler.host))); 587 | }; 588 | 589 | /* 590 | Public: Given a text or HTML document, initiates discovery of linked 591 | resources in the text, and queues the resources if applicable. Emits 592 | "discoverycomplete". Not to be confused with `crawler.discoverResources`, 593 | despite the `discoverResources` function being the main component of this 594 | one, since this function queues the resources in addition to 595 | discovering them. 596 | 597 | resourceData - Text document containing linked resource URLs. 598 | queueItem - Queue item from which the resource document was derived. 599 | decompressed - Content is already decompressed (default: false) 600 | 601 | Examples 602 | 603 | crawler.queueLinkedItems("test",queueItem); 604 | 605 | Returns the crawler object for chaining. 606 | 607 | */ 608 | Crawler.prototype.queueLinkedItems = function(resourceData,queueItem,decompressed) { 609 | var crawler = this, 610 | resources = []; 611 | 612 | if (!decompressed && 613 | queueItem.stateData && 614 | queueItem.stateData.headers['content-encoding'] && ( 615 | queueItem.stateData.headers['content-encoding'].match(/gzip/) || 616 | queueItem.stateData.headers['content-encoding'].match(/deflate/))) { 617 | 618 | return zlib.unzip(resourceData,function(err,newData) { 619 | if (err) { 620 | return crawler.emit("fetcherror",queueItem); 621 | } 622 | 623 | crawler.queueLinkedItems(newData,queueItem,true); 624 | }); 625 | } 626 | 627 | resources = crawler.discoverResources(resourceData,queueItem); 628 | 629 | // Emit discovered resources. ie: might be useful in building a graph of 630 | // page relationships. 631 | crawler.emit("discoverycomplete",queueItem,resources); 632 | 633 | resources.forEach(function(url){ crawler.queueURL(url,queueItem); }); 634 | 635 | return crawler; 636 | }; 637 | 638 | /* 639 | Public: Given a single URL, this function cleans, validates, parses it and 640 | adds it to the queue. This is the best and simplest way to add an item to 641 | the queue. 642 | 643 | url - URL to be queued. 644 | queueItem - Queue item from which the resource was linked. 645 | 646 | Emits 647 | 648 | queueduplicate 649 | queueerror 650 | queueadd 651 | 652 | Examples 653 | 654 | crawler.queueURL("http://www.google.com/",queueItem); 655 | 656 | Returns a boolean value indicating whether the URL was successfully queued 657 | or not. 658 | 659 | */ 660 | Crawler.prototype.queueURL = function(url,queueItem) { 661 | var crawler = this; 662 | var parsedURL = 663 | typeof(url) === "object" ? url : crawler.processURL(url,queueItem); 664 | 665 | // URL Parser decided this URL was junky. Next please! 666 | if (!parsedURL) { 667 | return false; 668 | } 669 | 670 | // Pass this URL past fetch conditions to ensure the user thinks it's valid 671 | var fetchDenied = false; 672 | fetchDenied = crawler._fetchConditions.reduce(function(prev,callback) { 673 | return prev || !callback(parsedURL); 674 | },false); 675 | 676 | if (fetchDenied) { 677 | // Fetch Conditions conspired to block URL 678 | return false; 679 | } 680 | 681 | // Check the domain is valid before adding it to the queue 682 | if (crawler.domainValid(parsedURL.host)) { 683 | crawler.queue.add( 684 | parsedURL.protocol, 685 | parsedURL.host, 686 | parsedURL.port, 687 | parsedURL.path, 688 | parsedURL.depth, 689 | function queueAddCallback(error,newQueueItem) { 690 | if (error) { 691 | // We received an error condition when adding the callback 692 | if (error.code && error.code === "DUP") 693 | return crawler.emit("queueduplicate",parsedURL); 694 | 695 | return crawler.emit("queueerror",error,parsedURL); 696 | } 697 | 698 | crawler.emit("queueadd",newQueueItem,parsedURL); 699 | newQueueItem.referrer = queueItem ? queueItem.url : null; 700 | } 701 | ); 702 | } 703 | 704 | return true; 705 | }; 706 | 707 | /* 708 | Public: The guts of the crawler: takes a queue item and spools a request for 709 | it, downloads, caches, and fires events based on the result of the request. 710 | It kicks off resource discovery and queues any new resources found. 711 | 712 | queueItem - Queue item to be fetched. 713 | 714 | Emits 715 | fetchstart 716 | fetchheaders 717 | fetchcomplete 718 | fetchdataerror 719 | notmodified 720 | fetchredirect 721 | fetch404 722 | fetcherror 723 | fetchclienterror 724 | 725 | Examples 726 | 727 | crawler.fetchQueueItem(queueItem); 728 | 729 | Returns the crawler object for chaining. 730 | 731 | */ 732 | Crawler.prototype.fetchQueueItem = function(queueItem) { 733 | var crawler = this; 734 | crawler._openRequests ++; 735 | 736 | // Variable declarations 737 | var fetchData = false, 738 | requestOptions, 739 | clientRequest, 740 | timeCommenced; 741 | 742 | // Mark as spooled 743 | queueItem.status = "spooled"; 744 | var client = (queueItem.protocol === "https" ? https : http); 745 | 746 | // Up the socket limit if required. 747 | if (client.globalAgent.maxSockets < crawler.maxConcurrency) { 748 | client.globalAgent.maxSockets = crawler.maxConcurrency; 749 | } 750 | 751 | // Extract request options from queue; 752 | var requestHost = queueItem.host, 753 | requestPort = queueItem.port, 754 | requestPath = queueItem.path; 755 | 756 | // Are we passing through an HTTP proxy? 757 | if (crawler.useProxy) { 758 | requestHost = crawler.proxyHostname; 759 | requestPort = crawler.proxyPort; 760 | requestPath = queueItem.url; 761 | } 762 | 763 | // Load in request options 764 | requestOptions = { 765 | method: "GET", 766 | host: requestHost, 767 | port: requestPort, 768 | path: requestPath, 769 | headers: { 770 | "User-Agent": crawler.userAgent, 771 | "Host": queueItem.host + ( 772 | queueItem.port !== 80 ? 773 | ":" + queueItem.port : 774 | "" 775 | ), 776 | "Referer": queueItem.referrer 777 | } 778 | }; 779 | 780 | // If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts 781 | if (requestOptions.port === 80 || requestOptions.port === 443) { 782 | delete requestOptions.port; 783 | } 784 | 785 | // Add cookie header from cookie jar if we're configured to 786 | // send/accept cookies 787 | if (crawler.acceptCookies && crawler.cookies.getAsHeader()) { 788 | requestOptions.headers.cookie = 789 | crawler.cookies.getAsHeader(queueItem.host,queueItem.path); 790 | } 791 | 792 | // Add auth headers if we need them 793 | if (crawler.needsAuth) { 794 | var auth = crawler.authUser + ":" + crawler.authPass; 795 | 796 | // Generate auth header 797 | auth = 'Basic ' + (new Buffer(auth).toString('base64')); 798 | requestOptions.headers.Authorization = auth; 799 | } 800 | 801 | // Add proxy auth if we need it 802 | if (crawler.proxyUser !== null && crawler.proxyPass !== null) { 803 | var proxyAuth = crawler.proxyUser + ":" + crawler.proxyPass; 804 | 805 | // Generate auth header 806 | proxyAuth = 'Basic ' + (new Buffer(proxyAuth).toString('base64')); 807 | requestOptions.headers["Proxy-Authorization"] = proxyAuth; 808 | } 809 | 810 | // And if we've got any custom headers available 811 | if (crawler.customHeaders) { 812 | for (var header in crawler.customHeaders) { 813 | if (!crawler.customHeaders.hasOwnProperty(header)) continue; 814 | 815 | requestOptions.headers[header] = crawler.customHeaders[header]; 816 | } 817 | } 818 | 819 | // Emit fetchstart event - gives the user time to mangle the request options 820 | // if required. 821 | crawler.emit("fetchstart", queueItem, requestOptions); 822 | 823 | process.nextTick(function() { 824 | // Record what time we started this request 825 | timeCommenced = Date.now(); 826 | 827 | // Get the resource! 828 | clientRequest = 829 | client.request(requestOptions,function(response) { 830 | crawler.handleResponse(queueItem,response,timeCommenced); 831 | }); 832 | 833 | clientRequest.end(); 834 | 835 | clientRequest.setTimeout(crawler.timeout, function() { 836 | clientRequest.abort(); 837 | crawler.emit("fetchtimeout",queueItem,crawler.timeout); 838 | }); 839 | 840 | clientRequest.on("error",function(errorData) { 841 | crawler._openRequests --; 842 | 843 | // Emit 5xx / 4xx event 844 | crawler.emit("fetchclienterror",queueItem,errorData); 845 | queueItem.fetched = true; 846 | queueItem.stateData.code = 599; 847 | queueItem.status = "failed"; 848 | }); 849 | 850 | return crawler; 851 | }); 852 | }; 853 | 854 | 855 | /* 856 | Public: Given a queueItem and a matching response object, the crawler will 857 | handle downloading the resource, queueing of linked items, etc. 858 | 859 | Examples 860 | 861 | // Passing in a response from `request` 862 | request(queueItem.url,function(err,res,body) { 863 | crawler.handleResponse(queueItem,res); 864 | }); 865 | 866 | Returns the crawler object for chaining. 867 | 868 | */ 869 | Crawler.prototype.handleResponse = function(queueItem,response,timeCommenced) { 870 | var crawler = this, 871 | dataReceived = false, 872 | timeHeadersReceived, 873 | timeDataReceived, 874 | parsedURL, 875 | responseBuffer, 876 | responseLength, 877 | responseLengthReceived = 0, 878 | contentType, 879 | stateData = queueItem.stateData; 880 | 881 | // Record what time we first received the header information 882 | timeHeadersReceived = Date.now(); 883 | 884 | // If we weren't passed a time of commencement, assume Now() 885 | timeCommenced = timeCommenced || Date.now(); 886 | 887 | responseLength = parseInt(response.headers["content-length"],10); 888 | responseLength = !isNaN(responseLength) ? responseLength : 0; 889 | 890 | // Save timing and content some header information into queue 891 | stateData.requestLatency = (timeHeadersReceived - timeCommenced); 892 | stateData.requestTime = (timeHeadersReceived - timeCommenced); 893 | stateData.contentLength = responseLength; 894 | stateData.contentType = contentType = response.headers["content-type"]; 895 | stateData.code = response.statusCode; 896 | stateData.headers = response.headers; 897 | 898 | // Do we need to save cookies? Were we sent any? 899 | if (crawler.acceptCookies && 900 | response.headers.hasOwnProperty('set-cookie')) 901 | crawler.cookies.addFromHeaders(response.headers["set-cookie"]); 902 | 903 | // Emit header receive event 904 | crawler.emit("fetchheaders",queueItem,response); 905 | 906 | // Ensure response length is reasonable... 907 | responseLength = 908 | responseLength > 0 ? responseLength : crawler.maxResourceSize; 909 | 910 | queueItem.stateData.contentLength = responseLength; 911 | 912 | // Function for dealing with 200 responses 913 | function processReceivedData() { 914 | if (queueItem.fetched) return; 915 | 916 | timeDataReceived = (new Date().getTime()); 917 | 918 | queueItem.fetched = true; 919 | queueItem.status = "downloaded"; 920 | 921 | // Save state information 922 | stateData.downloadTime = (timeDataReceived - timeHeadersReceived); 923 | stateData.requestTime = (timeDataReceived - timeCommenced); 924 | stateData.actualDataSize = responseBuffer.length; 925 | stateData.sentIncorrectSize = responseBuffer.length !== responseLength; 926 | 927 | // First, save item to cache (if we're using a cache!) 928 | if (crawler.cache !== null && 929 | crawler.cache.setCacheData instanceof Function) { 930 | 931 | crawler.cache.setCacheData(queueItem,responseBuffer); 932 | } 933 | 934 | // Is the item allowed by depth conditions ? 935 | if(crawler.depthAllowed(queueItem)) { 936 | crawler.emit("fetchcomplete",queueItem,responseBuffer,response); 937 | 938 | // We only process the item if it's of a valid mimetype 939 | // and only if the crawler is set to discover its own resources 940 | if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) { 941 | crawler.queueLinkedItems(responseBuffer,queueItem); 942 | } 943 | } 944 | 945 | crawler._openRequests --; 946 | } 947 | 948 | function receiveData(chunk) { 949 | if (chunk && chunk.length && !dataReceived) { 950 | if (responseLengthReceived + chunk.length > responseBuffer.length) { 951 | // Oh dear. We've been sent more data than we were initially told. 952 | // This could be a mis-calculation, or a streaming resource. 953 | // Let's increase the size of our buffer to match, as long as it isn't 954 | // larger than our maximum resource size. 955 | 956 | if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) { 957 | 958 | // Start by creating a new buffer, which will be our main 959 | // buffer from now on... 960 | 961 | var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length); 962 | 963 | // Copy all our old data into it... 964 | responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length); 965 | 966 | // And now the new chunk 967 | chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length); 968 | 969 | // And now make the response buffer our new buffer, 970 | // leaving the original for GC 971 | responseBuffer = tmpNewBuffer; 972 | 973 | } else { 974 | // Oh dear oh dear! The response is not only more data 975 | // than we were initially told, but it also exceeds the 976 | // maximum amount of data we're prepared to download per 977 | // resource. 978 | // 979 | // Throw error event and ignore. 980 | // 981 | // We'll then deal with the data that we have. 982 | 983 | crawler.emit("fetchdataerror",queueItem,response); 984 | } 985 | } else { 986 | // Copy the chunk data into our main buffer 987 | chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length); 988 | } 989 | 990 | // Increment our data received counter 991 | responseLengthReceived += chunk.length; 992 | } 993 | 994 | 995 | if ((responseLengthReceived >= responseLength || response.complete) && 996 | !dataReceived) { 997 | 998 | // Slice the buffer to chop off any unused space 999 | responseBuffer = responseBuffer.slice(0,responseLengthReceived); 1000 | 1001 | dataReceived = true; 1002 | processReceivedData(); 1003 | } 1004 | } 1005 | 1006 | // If we should just go ahead and get the data 1007 | if (response.statusCode >= 200 && response.statusCode < 300 && 1008 | responseLength <= crawler.maxResourceSize) { 1009 | 1010 | queueItem.status = "headers"; 1011 | 1012 | // Create a buffer with our response length 1013 | responseBuffer = new Buffer(responseLength); 1014 | 1015 | response.on("data",receiveData); 1016 | response.on("end",receiveData); 1017 | 1018 | // We've got a not-modified response back 1019 | } else if (response.statusCode === 304) { 1020 | 1021 | if (crawler.cache !== null && crawler.cache.getCacheData) { 1022 | // We've got access to a cache 1023 | crawler.cache.getCacheData(queueItem,function(cacheObject) { 1024 | crawler.emit("notmodified",queueItem,response,cacheObject); 1025 | }); 1026 | } else { 1027 | // Emit notmodified event. We don't have a cache available, so 1028 | // we don't send any data. 1029 | crawler.emit("notmodified",queueItem,response); 1030 | } 1031 | 1032 | // If we should queue a redirect 1033 | } else if (response.statusCode >= 300 && response.statusCode < 400 && 1034 | response.headers.location) { 1035 | 1036 | queueItem.fetched = true; 1037 | queueItem.status = "redirected"; 1038 | 1039 | // Parse the redirect URL ready for adding to the queue... 1040 | parsedURL = crawler.processURL(response.headers.location,queueItem); 1041 | 1042 | // Emit redirect event 1043 | crawler.emit("fetchredirect",queueItem,parsedURL,response); 1044 | 1045 | // Clean URL, add to queue... 1046 | crawler.queueURL(parsedURL,queueItem); 1047 | 1048 | crawler._openRequests --; 1049 | 1050 | // Ignore this request, but record that we had a 404 1051 | } else if (response.statusCode === 404) { 1052 | queueItem.fetched = true; 1053 | queueItem.status = "notfound"; 1054 | 1055 | // Emit 404 event 1056 | crawler.emit("fetch404",queueItem,response); 1057 | 1058 | crawler._openRequests --; 1059 | 1060 | // And oh dear. Handle this one as well. (other 400s, 500s, etc) 1061 | } else { 1062 | queueItem.fetched = true; 1063 | queueItem.status = "failed"; 1064 | 1065 | // Emit 5xx / 4xx event 1066 | crawler.emit("fetcherror",queueItem,response); 1067 | 1068 | crawler._openRequests --; 1069 | } 1070 | 1071 | return crawler; 1072 | }; 1073 | 1074 | /* 1075 | Public: The main crawler runloop. Fires at the interval specified in the 1076 | crawler configuration, when the crawl is running. May be manually fired. 1077 | This function initiates fetching of a queue item if there are enough workers 1078 | to do so and there are unfetched items in the queue. 1079 | 1080 | Examples 1081 | 1082 | crawler.crawl(); 1083 | 1084 | Returns the crawler object for chaining. 1085 | 1086 | */ 1087 | Crawler.prototype.crawl = function() { 1088 | var crawler = this; 1089 | 1090 | if (crawler._openRequests > crawler.maxConcurrency) return; 1091 | 1092 | crawler.queue.oldestUnfetchedItem(function(err, queueItem) { 1093 | 1094 | if (queueItem) { 1095 | crawler.fetchQueueItem(queueItem); 1096 | 1097 | } else if ( !crawler._openRequests && 1098 | !crawler._openListeners) { 1099 | 1100 | crawler.queue.complete(function(err, completeCount) { 1101 | if (err) throw err; 1102 | 1103 | crawler.queue.getLength(function(err, length) { 1104 | if (err) throw err; 1105 | 1106 | if (completeCount === length) { 1107 | crawler.emit("complete"); 1108 | crawler.stop(); 1109 | } 1110 | }); 1111 | }); 1112 | } 1113 | }); 1114 | 1115 | return crawler; 1116 | }; 1117 | 1118 | /* 1119 | Public: Stops the crawler, terminating the crawl runloop. 1120 | 1121 | Examples 1122 | 1123 | crawler.stop(); 1124 | 1125 | Returns the crawler object for chaining. 1126 | 1127 | */ 1128 | Crawler.prototype.stop = function() { 1129 | var crawler = this; 1130 | clearInterval(crawler.crawlIntervalID); 1131 | crawler.running = false; 1132 | return crawler; 1133 | }; 1134 | 1135 | /* 1136 | Public: Holds the crawler in a 'running' state, preventing the `complete` 1137 | event from firing until the callback this function returns has been executed, 1138 | or a predetermined timeout (as specified by `crawler.listenerTTL`) has 1139 | elapsed. 1140 | 1141 | Examples 1142 | 1143 | crawler.on("fetchcomplete",function(queueItem,data) { 1144 | continue = this.wait(); 1145 | doSomethingThatTakesAlongTime(function callback() { 1146 | continue(); 1147 | }); 1148 | }); 1149 | 1150 | Returns callback which will allow the crawler to continue. 1151 | 1152 | */ 1153 | Crawler.prototype.wait = function() { 1154 | var crawler = this, 1155 | cleared = false, 1156 | timeout = 1157 | setTimeout(function() { 1158 | if (cleared) return; 1159 | cleared = true; 1160 | crawler._openListeners --; 1161 | }, crawler.listenerTTL); 1162 | 1163 | crawler._openListeners ++; 1164 | 1165 | return function() { 1166 | if (cleared) return; 1167 | cleared = true; 1168 | crawler._openListeners --; 1169 | clearTimeout(timeout); 1170 | }; 1171 | }; 1172 | 1173 | /* 1174 | Public: Given a function, this method adds it to an internal list maintained 1175 | by the crawler to be executed against each URL to determine whether it should 1176 | be fetched or not. 1177 | 1178 | callback - Function to be called when evaluating a URL. This function is 1179 | passed an object containing the protocol, hostname, port, and path 1180 | of a resource to be fetched. It can determine whether it should 1181 | be requested or not by returning a boolean - false for no, true 1182 | for yes. 1183 | 1184 | Examples 1185 | 1186 | crawler.addFetchCondition(function(parsedURL) { 1187 | return (parsedURL.host !== "evildomain.com"); 1188 | }); 1189 | 1190 | Returns the ID of the fetch condition - used for removing it from the crawler 1191 | later. 1192 | 1193 | */ 1194 | Crawler.prototype.addFetchCondition = function(callback) { 1195 | var crawler = this; 1196 | if (callback instanceof Function) { 1197 | crawler._fetchConditions.push(callback); 1198 | return crawler._fetchConditions.length - 1; 1199 | } else { 1200 | throw new Error("Fetch Condition must be a function."); 1201 | } 1202 | }; 1203 | 1204 | /* 1205 | Public: Given the ID of an existing fetch condition, this function removes 1206 | it from the crawler's internal list of conditions. 1207 | 1208 | index - ID of fetch condition to be removed. 1209 | 1210 | Examples 1211 | 1212 | crawler.removeFetchCondition(3); 1213 | 1214 | Returns true if the fetch condition was removed, and throws an error if it 1215 | could not be found. 1216 | 1217 | */ 1218 | Crawler.prototype.removeFetchCondition = function(index) { 1219 | var crawler = this; 1220 | if (crawler._fetchConditions[index] && 1221 | crawler._fetchConditions[index] instanceof Function) { 1222 | 1223 | return !!crawler._fetchConditions.splice(index,1); 1224 | } else { 1225 | throw new Error("Unable to find indexed Fetch Condition."); 1226 | } 1227 | }; 1228 | 1229 | /* 1230 | Public: Given a URL it will remove the querstring if it exists. 1231 | 1232 | url - URL from which to remove the querystring 1233 | 1234 | Examples 1235 | 1236 | crawler.removeQuerystring(url); 1237 | 1238 | Returns URL without querystring if it exists 1239 | 1240 | */ 1241 | Crawler.prototype.removeQuerystring = function(url) { 1242 | if (url.indexOf("?") > -1) { 1243 | return url.substr(0,url.indexOf("?")); 1244 | } else { 1245 | return url; 1246 | } 1247 | }; 1248 | 1249 | module.exports = Crawler; 1250 | --------------------------------------------------------------------------------