├── .gitignore
├── lib
    ├── cli.js
    ├── index.js
    ├── cache.js
    ├── quickcrawl.js
    ├── cache-backend-fs.js
    ├── queue.js
    ├── cookies.js
    └── crawler.js
├── .travis.yml
├── .jshintrc
├── .editorconfig
├── example
    ├── quickcrawl-example.js
    ├── testcrawler.js
    └── savetodisk.js
├── test
    ├── init.js
    ├── jshint.js
    ├── lib
    │   ├── testserver.js
    │   └── routes.js
    ├── depth.js
    ├── testcrawl.js
    ├── reliability.js
    ├── discovery.js
    ├── resourcevalidity.js
    └── cookies.js
├── package.json
└── README.markdown


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules


--------------------------------------------------------------------------------
/lib/cli.js:
--------------------------------------------------------------------------------
1 | // CLI module for crawling.
2 | // Not yet built.
3 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | node_js:
3 |   - "0.11"
4 |   - "0.10"


--------------------------------------------------------------------------------
/.jshintrc:
--------------------------------------------------------------------------------
 1 | {
 2 |   "asi": false,
 3 |   "node": true,
 4 |   "require": true,
 5 |   "process": true,
 6 |   "module": true,
 7 |   "setInterval": true,
 8 |   "setTimeout": true,
 9 |   "clearTimeout": true,
10 |   "Buffer": true
11 | }
12 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # EditorConfig is awesome: http://EditorConfig.org
 2 | 
 3 | # top-most EditorConfig file
 4 | root = false
 5 | 
 6 | # Unix-style newlines with a newline ending every file
 7 | [*]
 8 | end_of_line = lf
 9 | insert_final_newline = true
10 | 
11 | # Tab indentation
12 | [*.js]
13 | indent_style = tab
14 | indent_size = 4


--------------------------------------------------------------------------------
/lib/index.js:
--------------------------------------------------------------------------------
 1 | // SimpleCrawler
 2 | // Export interfaces
 3 | 
 4 | module.exports = require("./crawler.js");
 5 | 
 6 | // Aliasing for compatibility with legacy code.
 7 | module.exports.Crawler = module.exports;
 8 | 
 9 | module.exports.queue = require("./queue.js");
10 | module.exports.cache = require("./cache.js");
11 | 
12 | // Convenience function for small, fast crawls
13 | module.exports.crawl = require("./quickcrawl.js");
14 | 


--------------------------------------------------------------------------------
/example/quickcrawl-example.js:
--------------------------------------------------------------------------------
 1 | // Example demonstrating the simple (but less flexible) way of initiating
 2 | // a crawler.
 3 | 
 4 | var Crawler = require("../lib");
 5 | 
 6 | Crawler.crawl("http://deewr.gov.au/")
 7 | 	.on("fetchstart",function(queueItem){
 8 | 		console.log("Starting request for:",queueItem.url);
 9 | 	})
10 | 	.on("fetchcomplete",function(queueItem){
11 | 		console.log("Completed fetching resource:",queueItem.url);
12 | 	});
13 | 


--------------------------------------------------------------------------------
/example/testcrawler.js:
--------------------------------------------------------------------------------
 1 | var Crawler = require("../"),
 2 | 	crawler = new Crawler("127.0.0.1","/",3000);
 3 | 
 4 | crawler.on("crawlstart",function() {
 5 | 	console.log("Crawl starting");
 6 | });
 7 | 	
 8 | crawler.on("fetchstart",function(queueItem) {
 9 | 	console.log("fetchStart",queueItem);
10 | });
11 | 
12 | crawler.on("fetchcomplete",function(queueItem) {
13 | 	console.log("fetchcomplete",queueItem);
14 | });
15 | 
16 | crawler.on("complete",function() {
17 | 	console.log("Finished!");
18 | });
19 | 
20 | crawler.start();


--------------------------------------------------------------------------------
/test/init.js:
--------------------------------------------------------------------------------
 1 | // Ensures that the crawler object is requireable, and doesn't die
 2 | // horribly right off the bat
 3 | 
 4 | var chai = require("chai");
 5 | 	chai.should();
 6 | 
 7 | describe("Crawler object",function() {
 8 | 
 9 | 	it("should be able to be required",function() {
10 | 		var Crawler = require("../");
11 | 
12 | 		Crawler.should.be.a("function");
13 | 		Crawler.Crawler.should.be.a("function");
14 | 	});
15 | 
16 | 	it("should import the queue",function() {
17 | 		var Crawler = require("../");
18 | 
19 | 		Crawler.queue.should.be.a("function");
20 | 	});
21 | 
22 | 	it("should import the cache system",function() {
23 | 		var Crawler = require("../");
24 | 
25 | 		Crawler.cache.should.be.a("function");
26 | 	});
27 | 
28 | 	it("should be able to be initialised",function() {
29 | 		var Crawler = require("../"),
30 | 			myCrawler = new Crawler("127.0.0.1","/",3000);
31 | 
32 | 		myCrawler.should.be.an.instanceof(Crawler);
33 | 	});
34 | 
35 | })
36 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "simplecrawler",
 3 |   "description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
 4 |   "version": "0.3.11",
 5 |   "homepage": "http://github.com/cgiffard/node-simplecrawler",
 6 |   "author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
 7 |   "keywords": [
 8 |     "simple",
 9 |     "crawler",
10 |     "spider",
11 |     "cache",
12 |     "queue",
13 |     "simplecrawler",
14 |     "eventemitter"
15 |   ],
16 |   "scripts": {
17 |     "test": "mocha -R spec -t 5000"
18 |   },
19 |   "bin": {
20 |     "crawl": "./lib/cli.js"
21 |   },
22 |   "repository": {
23 |     "type": "git",
24 |     "url": "http://github.com/cgiffard/node-simplecrawler.git"
25 |   },
26 |   "bugs": {
27 |     "url": "https://github.com/cgiffard/node-simplecrawler/issues"
28 |   },
29 |   "main": "./lib/index.js",
30 |   "engines": {
31 |     "node": ">=0.8.0"
32 |   },
33 |   "devDependencies": {
34 |     "mocha": "~1.8.2",
35 |     "jshint": "~0.7.x",
36 |     "chai": "~1.2.0"
37 |   },
38 |   "dependencies": {
39 |     "URIjs": "~1.10.2"
40 |   }
41 | }
42 | 


--------------------------------------------------------------------------------
/test/jshint.js:
--------------------------------------------------------------------------------
 1 | // Tests to ensure crawler code is well formed
 2 | 
 3 | var chai = require("chai");
 4 | 	chai.should();
 5 | 
 6 | describe("Core code",function() {
 7 | 	var JSHINT = require("jshint").JSHINT,
 8 | 		fs = require("fs");
 9 | 
10 | 	function readCode(file) {
11 | 		file = __dirname + "/../lib/" + file + ".js";
12 | 		return fs.readFileSync(file).toString("utf8");
13 | 	}
14 | 
15 | 	[	"cache-backend-fs",
16 | 		"cache",
17 | 		"cli",
18 | 		"cookies",
19 | 		"crawler",
20 | 		"index",
21 | 		"queue",
22 | 		"quickcrawl"	].forEach(function(item) {
23 | 
24 | 		var code = readCode(item);
25 | 
26 | 		it("module `" + item + "` should pass JSHint with no errors",function() {
27 | 
28 | 			var slowThresholdMilliseconds = 200;
29 | 			this.slow(slowThresholdMilliseconds);
30 | 
31 | 			JSHINT(code,{
32 | 					"indent": 4,
33 | 					"undef": true
34 | 				},
35 | 				{
36 | 					// Don't want no errant logging statements going to production!
37 | 					// `console` has been deliberately omitted from this whitelist.
38 | 
39 | 					// All the regular node stuff
40 | 					"require": true,
41 | 					"module": true,
42 | 					"process": true,
43 | 					"setInterval": true,
44 | 					"clearInterval": true,
45 | 					"setTimeout": true,
46 | 					"clearTimeout": true,
47 | 					"Buffer": true
48 | 				});
49 | 
50 | 			if (JSHINT.errors.length) {
51 | 				throw new Error(
52 | 							"Line " +
53 | 							JSHINT.errors[0].line + ": " +
54 | 							JSHINT.errors[0].reason);
55 | 			}
56 | 		});
57 | 
58 | 	});
59 | });
60 | 


--------------------------------------------------------------------------------
/test/lib/testserver.js:
--------------------------------------------------------------------------------
 1 | // Server for testing HTTP crawls!
 2 | // Ultra simple - only for running with mocha tests.
 3 | 
 4 | // Include HTTP
 5 | var http = require("http");
 6 | 
 7 | // Create server for crawling
 8 | var httpServer = http.createServer();
 9 | 
10 | var testRoutes = require("./routes");
11 | 
12 | // Listen to events
13 | httpServer.on("request",function(req,res) {
14 | 
15 | 	function write(status,data,contentType) {
16 | 		res.writeHead(
17 | 			status,
18 | 			http.STATUS_CODES[status],
19 | 			{
20 | 				"Content-Type":		contentType || "text/html",
21 | 				"Content-Length":	Buffer.byteLength(data),
22 | 			});
23 | 
24 | 		res.write(data);
25 | 		res.end();
26 | 	}
27 | 
28 | 	function redir(to) {
29 | 		var data = "Redirecting you to " + to;
30 | 
31 | 		res.writeHead(
32 | 			301,
33 | 			http.STATUS_CODES[301],
34 | 			{
35 | 				"Content-Type":		"text/plain",
36 | 				"Content-Length":	Buffer.byteLength(data),
37 | 				"Location":			to
38 | 			});
39 | 
40 | 		res.write(data);
41 | 		res.end();
42 | 	}
43 | 
44 | 	if (testRoutes[req.url] &&
45 | 		testRoutes[req.url] instanceof Function) {
46 | 
47 | 		// Pass in a function that takes a status and some data to write back
48 | 		// out to the client
49 | 		testRoutes[req.url](write,redir);
50 | 
51 | 	} else {
52 | 
53 | 		// Otherwise, a 404
54 | 		res.writeHead(404,"Page Not Found");
55 | 		res.write("Page not found.");
56 | 		res.end();
57 | 	}
58 | });
59 | 
60 | httpServer.listen(3000);
61 | 
62 | module.exports = httpServer;
63 | module.exports.routes = testRoutes;
64 | 


--------------------------------------------------------------------------------
/lib/cache.js:
--------------------------------------------------------------------------------
 1 | // Simplecrawler - cache module
 2 | // Christopher Giffard, 2011
 3 | //
 4 | // http://www.github.com/cgiffard/node-simplecrawler
 5 | 
 6 | var fs = require("fs");
 7 | var EventEmitter = require('events').EventEmitter;
 8 | var FilesystemBackend = require("./cache-backend-fs.js");
 9 | // var RedisBackend = require("cache-backend-redis.js");
10 | // var MongoBackend = require("cache-backend-mongo.js");
11 | 
12 | // Init cache wrapper for backend...
13 | var Cache = function Cache(cacheLoadParameter,cacheBackend) {
14 | 
15 | 	// Ensure parameters are how we want them...
16 | 	cacheBackend = typeof cacheBackend === "object" ? cacheBackend : FilesystemBackend;
17 | 	cacheLoadParameter = cacheLoadParameter instanceof Array ? cacheLoadParameter : [cacheLoadParameter];
18 | 
19 | 	// Now we can just run the factory.
20 | 	this.datastore = cacheBackend.apply(cacheBackend,cacheLoadParameter);
21 | 
22 | 	// Instruct the backend to load up.
23 | 	this.datastore.load();
24 | };
25 | 
26 | Cache.prototype = new EventEmitter();
27 | 
28 | // Set up data import and export functions
29 | Cache.prototype.setCacheData = function(queueObject,data,callback) {
30 | 	this.datastore.setItem(queueObject,data,callback);
31 | 	this.emit("setcache",queueObject,data);
32 | };
33 | 
34 | Cache.prototype.getCacheData = function(queueObject,callback) {
35 | 	this.datastore.getItem(queueObject,callback);
36 | };
37 | 
38 | Cache.prototype.saveCache = function() {
39 | 	this.datastore.saveCache();
40 | };
41 | 
42 | module.exports = Cache;
43 | module.exports.Cache = Cache;
44 | module.exports.FilesystemBackend = FilesystemBackend;
45 | 


--------------------------------------------------------------------------------
/test/depth.js:
--------------------------------------------------------------------------------
 1 | // Runs a very simple crawl on an HTTP server with different depth
 2 | 
 3 | var chai = require("chai");
 4 |     chai.should();
 5 | 
 6 | var testserver = require("./lib/testserver.js");
 7 | 
 8 | var Crawler	= require("../");
 9 | 
10 | // Test the number of links discovered for the given "depth" and compare it to "linksToDiscover"
11 | var depthTest = function(depth, linksToDiscover) {
12 | 	depth = parseInt(depth); // Force depth to be a number
13 | 
14 | 	var crawler;
15 | 	var linksDiscovered;
16 | 
17 | 	describe("depth "+ depth, function() {
18 | 		before(function() {
19 | 			// Create a new crawler to crawl our local test server
20 | 			crawler = new Crawler("127.0.0.1","/depth/1",3000);
21 | 
22 | 			// Speed up tests. No point waiting for every request when we're running
23 | 			// our own server.
24 | 			crawler.interval = 1;
25 | 
26 | 			// Define max depth for this crawl
27 | 			crawler.maxDepth = depth;
28 | 
29 | 			linksDiscovered = 0;
30 | 
31 | 			crawler.on("fetchcomplete",function(queueItem) {
32 | 				linksDiscovered++;
33 | 			});
34 | 
35 | 			crawler.start();
36 | 		});
37 | 
38 | 		after(function() {
39 | 			// Clean listeners and crawler
40 | 			crawler.removeAllListeners("discoverycomplete");
41 | 			crawler.removeAllListeners("complete");
42 | 			crawler = null;
43 | 		});
44 | 
45 | 		it("should discover "+ linksToDiscover +" linked resources",function(done) {
46 | 			crawler.on("complete",function() {
47 | 				linksDiscovered.should.equal(linksToDiscover);
48 | 				done();
49 | 			});
50 | 		});
51 | 	});
52 | };
53 | 
54 | describe("Crawler max depth",function() {
55 | 
56 | 	// depth: linksToDiscover
57 | 	var linksToDiscover = {
58 | 		0: 11, // links for depth 0
59 | 		1: 6,  // links for depth 1
60 | 		2: 7,  // links for depth 2
61 | 		3: 11  // links for depth 3
62 | 	};
63 | 
64 | 	for(var depth in linksToDiscover) {
65 | 		depthTest(depth, linksToDiscover[depth]);
66 | 	}
67 | 
68 | });
69 | 


--------------------------------------------------------------------------------
/example/savetodisk.js:
--------------------------------------------------------------------------------
 1 | // Example use of simplecrawler, courtesy of @breck7! Thanks mate. :)
 2 | 
 3 | var fs = require('node-fs'),
 4 |     url = require('url'),
 5 |     wrench = require('wrench'),
 6 |     Crawler = require("simplecrawler").Crawler
 7 | 
 8 | /**
 9 |  * @param String. Domain to download.
10 |  * @Param Function. Callback when crawl is complete.
11 |  */
12 | var downloadSite = function (domain, callback) {
13 | 
14 |   // Where to save downloaded data
15 |   var outputDirectory = __dirname + '/' + domain
16 |   var myCrawler = new Crawler(domain)
17 |   myCrawler.interval = 250
18 |   myCrawler.maxConcurrency = 5
19 | 
20 |   myCrawler.on("fetchcomplete",function(queueItem, responseBuffer, response) {
21 | 
22 |     // Parse url
23 |     var parsed = url.parse(queueItem.url)
24 |     
25 |     // Rename / to index.html
26 |     if (parsed.pathname === '/')
27 |       parsed.pathname = '/index.html'
28 | 
29 |     // Get directory name in order to create any nested dirs
30 |     var dirname = outputDirectory + parsed.pathname.replace(/\/[^\/]+$/, '')
31 |     
32 |     // Path to save file
33 |     var filepath = outputDirectory + parsed.pathname
34 | 
35 |     // Check if DIR exists
36 |     fs.exists(dirname, function (exists) {
37 | 
38 |       // If DIR exists, write file
39 |       if (exists)
40 |         fs.writeFile(filepath, responseBuffer, function () {})
41 |       
42 |       // Else, recursively create dir using node-fs, then write file
43 |       else
44 |         fs.mkdir(dirname, 0755, true, function (err) {
45 |           fs.writeFile(filepath, responseBuffer, function () {})
46 |         })
47 |         
48 |     })
49 | 
50 |     console.log("I just received %s (%d bytes)",queueItem.url,responseBuffer.length)
51 |     console.log("It was a resource of type %s",response.headers['content-type'])
52 | 
53 |   })
54 | 
55 |   // Fire callback
56 |   myCrawler.on('complete', function () {
57 |     callback()
58 |   })
59 | 
60 |   // Start Crawl
61 |   myCrawler.start()
62 |   
63 | }
64 | 
65 | if (process.argv.length < 3) {
66 |   console.log('Usage: node downloadSiteExample.js mysite.com')
67 |   process.exit(1)
68 | }
69 | downloadSite(process.argv[2], function () {
70 |   console.log('Done!')
71 | })
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/lib/quickcrawl.js:
--------------------------------------------------------------------------------
 1 | var Crawler	= require("./crawler.js"),
 2 | 	URI		= require("URIjs");
 3 | 
 4 | 
 5 | /*
 6 | 	Public: Convenience function for really quick, simple crawls. It generates
 7 | 	a new crawler, parses the URL provided, and sets up the new crawler with
 8 | 	the host and path information extracted from the URL. It returns the crawler
 9 | 	object, so you can set up event handlers, and waits until `process.nextTick`
10 | 	before kicking off the crawl.
11 | 
12 | 	url					-	URL to begin crawl from.
13 | 	successCallback		-	Optional function called once an item is completely
14 | 							downloaded. Functionally identical to a fetchcomplete
15 | 							event listener.
16 | 	failCallback		-	Optional function to be called if an item fails to
17 | 							download. Functionally identical to a fetcherror
18 | 							event listener.
19 | 
20 | 	Examples
21 | 
22 | 		Crawler.crawl(
23 | 			"http://example.com:3000/start",
24 | 			function(queueItem,data) {
25 | 				console.log("I got a new item!");
26 | 			}
27 | 		);
28 | 
29 | 		Crawler
30 | 			.crawl("http://www.example.com/")
31 | 			.on("fetchstart",function(queueItem) {
32 | 				console.log("Beginning fetch for",queueItem.url);
33 | 			});
34 | 
35 | 	Returns the crawler object which has now been constructed.
36 | 
37 | */
38 | module.exports = function crawl(url,successCallback,failCallback) {
39 | 
40 | 	// Parse the URL first
41 | 	url = URI(url);
42 | 
43 | 	// If either the protocol, path, or hostname are unset, we can't really
44 | 	// do much. Die with error.
45 | 	if (!url.protocol())
46 | 		throw new Error("Can't crawl with unspecified protocol.");
47 | 
48 | 	if (!url.hostname())
49 | 		throw new Error("Can't crawl with unspecified hostname.");
50 | 
51 | 	if (!url.path())
52 | 		throw new Error("Can't crawl with unspecified path.");
53 | 
54 | 	var tmpCrawler =
55 | 			new Crawler(
56 | 				url.hostname(),
57 | 				url.path(),
58 | 				url.port() || 80);
59 | 
60 | 	// Attach callbacks if they were provided
61 | 	if (successCallback)	tmpCrawler.on("fetchcomplete",successCallback);
62 | 	if (failCallback)		tmpCrawler.on("fetcherror",failCallback);
63 | 
64 | 	// Start the crawler on the next runloop
65 | 	// This enables initial configuration options and event handlers to take
66 | 	// effect before the first resource is queued.
67 | 	process.nextTick(function() {
68 | 		tmpCrawler.start();
69 | 	});
70 | 
71 | 	// Return crawler
72 | 	return tmpCrawler;
73 | };
74 | 


--------------------------------------------------------------------------------
/test/lib/routes.js:
--------------------------------------------------------------------------------
 1 | // Routes for testing server
 2 | 
 3 | 
 4 | module.exports = {
 5 | 	"/": function(write) {
 6 | 		write(200,"Home. <a href='stage2'>stage2</a>");
 7 | 	},
 8 | 
 9 | 	"/stage2": function(write) {
10 | 		write(200,"Stage2. http://127.0.0.1:3000/stage/3");
11 | 	},
12 | 
13 | 	"/stage/3": function(write) {
14 | 		write(200,"Stage3. <a href='//127.0.0.1:3000/stage/4'>stage4</a>");
15 | 	},
16 | 
17 | 	"/stage/4": function(write) {
18 | 		write(200,"Stage4. <a href='../stage5'>stage5</a>");
19 | 	},
20 | 
21 | 	"/stage5": function(write,redir) {
22 | 		redir("/stage6");
23 | 	},
24 | 
25 | 	"/stage6": function(write) {
26 | 		write(200,"Crawl complete!");
27 | 	},
28 | 
29 | 	"/async-stage1": function(write) {
30 | 		write(200,"http://127.0.0.1:3000/async-stage2");
31 | 	},
32 | 
33 | 	"/async-stage2": function(write) {
34 | 		write(200,"http://127.0.0.1:3000/async-stage3");
35 | 	},
36 | 
37 | 	"/async-stage3": function(write) {
38 | 		write(200,"Complete!");
39 | 	},
40 | 
41 | 	"/timeout": function(write) {
42 | 		// We want to trigger a timeout. Never respond.
43 | 	},
44 | 
45 | 	// Routes for depth tests
46 | 	"/depth/1": function(write) {
47 | 		write(200,"<link rel='stylesheet' href='/css'> Home. <a href='/depth/2'>depth2</a>");
48 | 	},
49 | 
50 | 	"/depth/2": function(write) {
51 | 		write(200,"Depth 2. http://127.0.0.1:3000/depth/3");
52 | 	},
53 | 
54 | 	"/depth/3": function(write) {
55 | 		write(200,"Depth 3. <link rel='stylesheet' href='/css/2'> <link rel='stylesheet' href='/css/4'>");
56 | 	},
57 | 
58 | 	"/css": function(write) {
59 | 		write(200,"/* CSS 1 */ @import url('/css/2'); @font-face { url(/font/1) format('woff'); }", "text/css");
60 | 	},
61 | 
62 | 	"/css/2": function(write) {
63 | 		write(200,"/* CSS 2 */ @import url('/css/3'); .img1 { background-image:url('/img/1'); }", "text/css");
64 | 	},
65 | 
66 | 	"/css/3": function(write) {
67 | 		write(200,"/* CSS 3 */", "text/css");
68 | 	},
69 | 
70 | 	"/css/4": function(write) {
71 | 		write(200,"/* CSS 4 */ .img1 { background-image:url('/img/2'); } @font-face { url(/font/2) format('woff'); }", "text/css");
72 | 	},
73 | 
74 | 	"/img/1": function(write) {
75 | 		write(200,"", "image/png");
76 | 	},
77 | 
78 | 	"/img/2": function(write) {
79 | 		write(200,"", "image/png");
80 | 	},
81 | 
82 | 	"/font/1": function(write) {
83 | 		write(200,"", "font/woff");
84 | 	},
85 | 
86 | 	"/font/2": function(write) {
87 | 		write(200,"", "application/font-woff");
88 | 	}
89 | };
90 | 


--------------------------------------------------------------------------------
/test/testcrawl.js:
--------------------------------------------------------------------------------
 1 | // Runs a very simple crawl on an HTTP server
 2 | // This is more of an integration test than a unit test.
 3 | 
 4 | var chai = require("chai");
 5 |     chai.should();
 6 | 
 7 | var testserver = require("./lib/testserver.js");
 8 | 
 9 | describe("Test Crawl",function() {
10 | 
11 | 	var Crawler	= require("../");
12 | 
13 | 	// Create a new crawler to crawl this server
14 | 	var localCrawler = new Crawler("127.0.0.1","/",3000),
15 | 		asyncCrawler = new Crawler("127.0.0.1","/",3000);
16 | 
17 | 	// Speed up tests. No point waiting for every request when we're running
18 | 	// our own server.
19 | 	localCrawler.interval = asyncCrawler.interval = 1;
20 | 
21 | 	var linksDiscovered = 0;
22 | 
23 | 	it("should be able to be started",function(done) {
24 | 
25 | 		localCrawler.on("crawlstart",function() { done() });
26 | 		localCrawler.on("discoverycomplete",function() {
27 | 			linksDiscovered ++;
28 | 		});
29 | 
30 | 		localCrawler.start();
31 | 		localCrawler.running.should.be.truthy;
32 | 	});
33 | 
34 | 	it("should have a queue with at least the initial crawl path",function() {
35 | 
36 | 		localCrawler.queue.length.should.be.greaterThan(0);
37 | 	});
38 | 
39 | 	it("should discover all linked resources in the queue",function(done) {
40 | 
41 | 		localCrawler.on("complete",function() {
42 | 			linksDiscovered.should.equal(5);
43 | 			done();
44 | 		});
45 | 	});
46 | 
47 | 	it("should support async event listeners for manual discovery",function(done) {
48 | 
49 | 		this.slow('1s')
50 | 
51 | 		// Use a different crawler this time
52 | 		asyncCrawler.discoverResources = false;
53 | 		asyncCrawler.queueURL("http://127.0.0.1:3000/async-stage1");
54 | 		asyncCrawler.start();
55 | 
56 | 		asyncCrawler.on("fetchcomplete",function(queueItem,data,res) {
57 | 			var evtDone = this.wait();
58 | 
59 | 			setTimeout(function(){
60 | 				linksDiscovered ++;
61 | 
62 | 				if (String(data).match(/complete/i))
63 | 					return evtDone();
64 | 
65 | 				// Taking advantage of the fact that for these, the sum total
66 | 				// of the body data is a URL.
67 | 				asyncCrawler.queueURL(String(data)).should.be.true;
68 | 
69 | 				evtDone();
70 | 			},100);
71 | 		});
72 | 
73 | 		asyncCrawler.on("complete",function() {
74 | 			linksDiscovered.should.equal(8);
75 | 			done();
76 | 		});
77 | 	});
78 | 
79 | 	// TODO
80 | 
81 | 	// Test how simple error conditions, content types, and responses are handled.
82 | 
83 | 	// Test encodings.
84 | 
85 | 	// Test URL detection
86 | 
87 | 	// Test handling binary data
88 | 
89 | 	// Test bad content length
90 | 
91 | });
92 | 


--------------------------------------------------------------------------------
/test/reliability.js:
--------------------------------------------------------------------------------
 1 | // Runs a very simple crawl on an HTTP server
 2 | 
 3 | var chai = require("chai");
 4 | 	chai.should();
 5 | 
 6 | // Require the same server as in our previous tests...
 7 | var testserver = require("./lib/testserver.js");
 8 | 
 9 | describe("Crawler reliability",function() {
10 | 
11 | 	var Crawler	= require("../");
12 | 
13 | 	it("should be able to handle a timeout",function(done) {
14 | 
15 | 		this.slow('1s')
16 | 
17 | 		var localCrawler = Crawler.crawl("http://127.0.0.1:3000/timeout");
18 | 			localCrawler.timeout = 200;
19 | 
20 | 		localCrawler.on("fetchtimeout",function(queueItem) {
21 | 			queueItem.should.be.an("object");
22 | 			queueItem.url.should.equal("http://127.0.0.1:3000/timeout");
23 | 			done();
24 | 		});
25 | 	});
26 | 
27 | 	it("should be able to freeze and defrost the queue", function(done) {
28 | 
29 | 		var localCrawler = new Crawler("127.0.0.1", "/", 3000),
30 | 			newCrawler = new Crawler("127.0.0.1", "/", 3000),
31 | 			tmp = (process.env.TMPDIR || __dirname) + "/queue.json";
32 | 			localCrawler.start();
33 | 
34 | 		var test = function() {
35 | 			this.stop();
36 | 
37 | 			// Lets the queue be populated
38 | 			process.nextTick(function() {
39 | 				localCrawler.queue.length.should.equal(3);
40 | 				localCrawler.queue.oldestUnfetchedIndex.should.equal(1);
41 | 				localCrawler.queue.scanIndex["http://127.0.0.1:3000/"]
42 | 					.should.equal(true);
43 | 				localCrawler.queue.scanIndex["http://127.0.0.1:3000/stage2"]
44 | 					.should.equal(true);
45 | 				localCrawler.queue.scanIndex["http://127.0.0.1:3000/stage/3"]
46 | 					.should.equal(true);
47 | 
48 | 				localCrawler.queue[0].status.should.equal("downloaded");
49 | 				localCrawler.queue[1].status.should.equal("downloaded");
50 | 				localCrawler.queue[2].status.should.equal("queued");
51 | 
52 | 				localCrawler.queue.freeze(tmp, defrost);
53 | 			});
54 | 		};
55 | 
56 | 		var defrost = function() {
57 | 			newCrawler.queue.defrost(tmp, checkDefrost);
58 | 		};
59 | 
60 | 		var checkDefrost = function() {
61 | 			newCrawler.queue.length.should.equal(3);
62 | 			newCrawler.queue.oldestUnfetchedIndex.should.equal(2);
63 | 			newCrawler.queue.scanIndex["http://127.0.0.1:3000/"]
64 | 				.should.equal(true);
65 | 			newCrawler.queue.scanIndex["http://127.0.0.1:3000/stage2"]
66 | 				.should.equal(true);
67 | 			newCrawler.queue.scanIndex["http://127.0.0.1:3000/stage/3"]
68 | 				.should.equal(true);
69 | 
70 | 			newCrawler.queue[0].status.should.equal("downloaded");
71 | 			newCrawler.queue[1].status.should.equal("downloaded");
72 | 			newCrawler.queue[2].status.should.equal("queued");
73 | 
74 | 			newCrawler.queue.oldestUnfetchedItem(function(err, queueItem) {
75 | 				queueItem.url.should.equal("http://127.0.0.1:3000/stage/3");
76 | 				done();
77 | 			});
78 | 		};
79 | 
80 | 		localCrawler.once("fetchcomplete",
81 | 			localCrawler.once.bind(localCrawler, "fetchcomplete", test));
82 | 
83 | 	});
84 | });
85 | 


--------------------------------------------------------------------------------
/test/discovery.js:
--------------------------------------------------------------------------------
  1 | // Runs a very simple crawl on an HTTP server
  2 | 
  3 | var chai = require("chai");
  4 | 	chai.should();
  5 | 
  6 | describe("Crawler link discovery",function() {
  7 | 
  8 | 	var Crawler = null,
  9 | 		crawler = null,
 10 | 		discover = null;
 11 | 
 12 | 	beforeEach(function() {
 13 | 		Crawler	= require("../");
 14 | 		crawler = new Crawler();
 15 | 		discover = crawler.discoverResources.bind(crawler);
 16 | 	});
 17 | 
 18 | 	it("should discover http/s prefixed URLs in the document",function() {
 19 | 
 20 | 		var links =
 21 | 			discover("	blah blah http://google.com/ \
 22 | 						blah blah https://fish.com/resource blah \
 23 | 						//example.com");
 24 | 
 25 | 		links.should.be.an("array");
 26 | 		links.length.should.equal(2);
 27 | 		links[0].should.equal("http://google.com/");
 28 | 		links[1].should.equal("https://fish.com/resource");
 29 | 	});
 30 | 
 31 | 	it("should discover URLS in quoted attributes in the document",function() {
 32 | 
 33 | 		var links =
 34 | 			discover("	<a href='google.com'> \
 35 | 						<img src=\"http://example.com/resource with spaces.txt\"> \
 36 | 						url('thingo.com/test.html')");
 37 | 
 38 | 		links.should.be.an("array");
 39 | 		links.length.should.equal(4);
 40 | 		links[0].should.equal("google.com");
 41 | 		links[1].should.equal("http://example.com/resource%20with%20spaces.txt");
 42 | 		links[2].should.equal("thingo.com/test.html");
 43 | 	});
 44 | 
 45 | 	it("should discover URLS in unquoted attributes in the document",function() {
 46 | 
 47 | 		var links =
 48 | 			discover("	<a href=google.com> \
 49 | 						<img src=http://example.com/resource with spaces.txt> \
 50 | 						url(thingo.com/test.html)");
 51 | 
 52 | 		links.should.be.an("array");
 53 | 		links.length.should.equal(3);
 54 | 		links[0].should.equal("google.com");
 55 | 		links[1].should.equal("http://example.com/resource");
 56 | 		links[2].should.equal("thingo.com/test.html");
 57 | 	});
 58 | 
 59 | 	it("should replace all '&amp;'s with ampersands",function() {
 60 | 
 61 | 		var links =
 62 | 			discover("<a href='http://example.com/resource?with&amp;query=params&amp;and=entities'>");
 63 | 
 64 | 		links.should.be.an("array");
 65 | 		links.length.should.equal(2);
 66 | 		links[0].should.equal("http://example.com/resource?with&query=params&and=entities");
 67 | 		links[1].should.equal("http://example.com/resource");
 68 | 	});
 69 | 
 70 | 	it("should ignore HTML comments with parseHTMLComments = false",function() {
 71 | 
 72 | 		crawler.parseHTMLComments = false;
 73 | 
 74 | 		var links =
 75 | 			discover("	<!-- http://example.com/oneline_comment --> \
 76 | 						<a href=google.com> \
 77 | 						<!-- \
 78 | 						http://example.com/resource \
 79 | 						<a href=example.com> \
 80 | 						-->");
 81 | 
 82 | 		links.should.be.an("array");
 83 | 		links.length.should.equal(1);
 84 | 		links[0].should.equal("google.com");
 85 | 	});
 86 | 
 87 | 	it("should ignore script tags with parseScriptTags = false",function() {
 88 | 
 89 | 		crawler.parseScriptTags = false;
 90 | 
 91 | 		var links =
 92 | 			discover("	<script>var a = \"<a href='http://example.com/oneline_script'></a>\";</script> \
 93 | 						<a href=google.com> \
 94 | 						<script type='text/javascript'> \
 95 | 						http://example.com/resource \
 96 | 						<a href=example.com> \
 97 | 						</SCRIPT>");
 98 | 
 99 | 		links.should.be.an("array");
100 | 		links.length.should.equal(1);
101 | 		links[0].should.equal("google.com");
102 | 	});
103 | });
104 | 


--------------------------------------------------------------------------------
/test/resourcevalidity.js:
--------------------------------------------------------------------------------
  1 | // Tests whether a given resource is considered 'valid' for crawling under
  2 | // a number of different conditions.
  3 | 
  4 | var chai = require("chai");
  5 | 	chai.should();
  6 | 
  7 | describe("Resource validity checker",function() {
  8 | 
  9 | 	it("should be able to determine whether a domain is in crawl scope",
 10 | 		function() {
 11 | 
 12 | 		var crawler = new (require("../"))("example.com",3000);
 13 | 
 14 | 		// The domain itself should be allowed.
 15 | 		crawler.domainValid("example.com").should.equal(true);
 16 | 
 17 | 		// Whereas other domains should not be allowed.
 18 | 		crawler.domainValid("somethingelse").should.equal(false);
 19 | 		crawler.domainValid("microsoft.com").should.equal(false);
 20 | 		crawler.domainValid("a.really.complex.fqdn.").should.equal(false);
 21 | 
 22 | 	});
 23 | 
 24 | 	it("should be able to determine whether a domain is a subdomain of another",
 25 | 		function() {
 26 | 
 27 | 		var crawler = new (require("../"))("example.com",3000);
 28 | 
 29 | 		// Enable scanning subdomains, important for this test
 30 | 		crawler.scanSubdomains = true;
 31 | 
 32 | 		// The domain itself isn't a subdomain per-se, but should be allowed
 33 | 		crawler.domainValid("example.com").should.equal(true);
 34 | 
 35 | 		// WWW is a subdomain
 36 | 		crawler.domainValid("www.example.com").should.equal(true);
 37 | 
 38 | 		// More complex examples
 39 | 		crawler.domainValid("testing.example.com").should.equal(true);
 40 | 
 41 | 		// Multiple levels
 42 | 		crawler.domainValid("system.cache.example.com").should.equal(true);
 43 | 
 44 | 		// These aren't valid...
 45 | 		crawler.domainValid("com.example").should.equal(false);
 46 | 		crawler.domainValid("example.com.au").should.equal(false);
 47 | 		crawler.domainValid("example.us").should.equal(false);
 48 | 
 49 | 	});
 50 | 
 51 | 
 52 | 	it("should consider WWW domains and non-WWW domains alike by default",
 53 | 		function() {
 54 | 
 55 | 		var crawler = new (require("../"))("example.com",3000);
 56 | 
 57 | 		// Explicitly disallow crawling subdomains, important for this test
 58 | 		crawler.scanSubdomains = false;
 59 | 
 60 | 		// The domain itself isn't a subdomain per-se, but should be allowed
 61 | 		crawler.domainValid("example.com").should.equal(true);
 62 | 
 63 | 		// Its WWW domain should be allowed by default
 64 | 		crawler.domainValid("www.example.com").should.equal(true);
 65 | 
 66 | 	});
 67 | 
 68 | 	it("should consider WWW domains and non-WWW domains as separate if requested",
 69 | 		function() {
 70 | 
 71 | 		var crawler = new (require("../"))("example.com",3000);
 72 | 
 73 | 		// Explicitly disallow crawling subdomains, important for this test
 74 | 		crawler.scanSubdomains = false;
 75 | 
 76 | 		// Explicitly consider www a separate subdomain (ordinarily, true)
 77 | 		crawler.ignoreWWWDomain = false;
 78 | 
 79 | 		// The domain itself isn't a subdomain per-se, but should be allowed
 80 | 		crawler.domainValid("example.com").should.equal(true);
 81 | 
 82 | 		// Its WWW domain should be allowed by default
 83 | 		crawler.domainValid("www.example.com").should.equal(false);
 84 | 
 85 | 	});
 86 | 
 87 | 	it("should permit a specified set of domains based on the internal whitelist",
 88 | 		function() {
 89 | 
 90 | 		var crawler = new (require("../"))("example.com",3000);
 91 | 
 92 | 		// Add a few specific subdomains
 93 | 		crawler.domainWhitelist.push("foo.com");
 94 | 		crawler.domainWhitelist.push("bar.com");
 95 | 		crawler.domainWhitelist.push("abcdefg.net.nz");
 96 | 
 97 | 		// The domain itself isn't a subdomain per-se, but should be allowed
 98 | 		crawler.domainValid("example.com").should.equal(true);
 99 | 
100 | 		// The explicitly set domains should be permitted
101 | 		crawler.domainValid("foo.com").should.equal(true);
102 | 		crawler.domainValid("bar.com").should.equal(true);
103 | 		crawler.domainValid("abcdefg.net.nz").should.equal(true);
104 | 
105 | 		// These domains were never whitelisted, and should be denied
106 | 		crawler.domainValid("wumpus.com").should.equal(false);
107 | 		crawler.domainValid("fish.net").should.equal(false);
108 | 
109 | 	});
110 | 
111 | 	it("should permit fetching of specified protocols based on internal whitelist",
112 | 		function() {
113 | 
114 | 		var crawler = new (require("../"))("example.com",3000);
115 | 
116 | 		// Protocols supported by default
117 | 		crawler.protocolSupported("http://google.com").should.equal(true);
118 | 		crawler.protocolSupported("https://google.com").should.equal(true);
119 | 		crawler.protocolSupported("rss://google.com").should.equal(true);
120 | 		crawler.protocolSupported("feed://google.com").should.equal(true);
121 | 		crawler.protocolSupported("atom://google.com").should.equal(true);
122 | 
123 | 		// Protocols not supported
124 | 		crawler.protocolSupported("gopher://google.com").should.equal(false);
125 | 		crawler.protocolSupported("ws://google.com").should.equal(false);
126 | 		crawler.protocolSupported("wss://google.com").should.equal(false);
127 | 	});
128 | 
129 | 	it("should permit parsing of specified resources based on mimetype checks",
130 | 		function() {
131 | 
132 | 		this.supportedMimeTypes = [
133 | 			/^text\//i,
134 | 			/^application\/(rss)?[\+\/\-]?xml/i,
135 | 			/^application\/javascript/i,
136 | 			/^xml/i
137 | 		];
138 | 
139 | 		var crawler = new (require("../"))("example.com",3000);
140 | 
141 | 		// Protocols supported by default
142 | 		crawler.mimeTypeSupported("text/plain").should.equal(true);
143 | 
144 | 		// Crawler should be able to process all plain-text formats
145 | 		crawler.mimeTypeSupported("text/SomeFormat").should.equal(true);
146 | 		crawler.mimeTypeSupported("text/html").should.equal(true);
147 | 
148 | 		// XML based formats
149 | 		crawler.mimeTypeSupported("application/rss+xml").should.equal(true);
150 | 		crawler.mimeTypeSupported("application/html+xml").should.equal(true);
151 | 		crawler.mimeTypeSupported("application/xhtml+xml").should.equal(true);
152 | 
153 | 		// Some weird JS mimetypes
154 | 		crawler.mimeTypeSupported("application/javascript").should.equal(true);
155 | 
156 | 		// Anything with XML...
157 | 		crawler.mimeTypeSupported("xml/manifest").should.equal(true);
158 | 
159 | 		// And these should fail
160 | 		crawler.mimeTypeSupported("application/octet-stream").should.equal(false);
161 | 		crawler.mimeTypeSupported("img/png").should.equal(false);
162 | 		crawler.mimeTypeSupported("video/webm").should.equal(false);
163 | 		crawler.mimeTypeSupported("blah/blah").should.equal(false);
164 | 
165 | 	});
166 | 
167 | 
168 | 	describe("Link parser",function() {
169 | 
170 | 		var crawler = new (require("../"))("127.0.0.1",3000);
171 | 
172 | 		it("should throw out junky or invalid URLs without dying",function() {
173 | 
174 | 			var urlContext = {
175 | 				"url": "http://www.example.com"
176 | 			};
177 | 
178 | 			crawler.processURL("",urlContext).should.equal(false);
179 | 			crawler.processURL("\n\n",urlContext).should.equal(false);
180 | 			crawler.processURL("ur34nfie4985:s////dsf/",urlContext).should.equal(false);
181 | 
182 | 		});
183 | 
184 | 	});
185 | });
186 | 
187 | 


--------------------------------------------------------------------------------
/lib/cache-backend-fs.js:
--------------------------------------------------------------------------------
  1 | // Simplecrawler - FS cache backend
  2 | // Tries to ensure a local 'cache' of a website is as close as possible to a mirror of the website itself.
  3 | // The idea is that it is then possible to re-serve the website just using the cache.
  4 | 
  5 | var fs = require("fs");
  6 | var crypto = require("crypto");
  7 | 
  8 | // Factory for FSBackend
  9 | var backend = function backend(loadParameter) {
 10 | 	return new FSBackend(loadParameter);
 11 | };
 12 | 
 13 | module.exports = backend;
 14 | 
 15 | // Constructor for filesystem cache backend
 16 | var FSBackend = function FSBackend(loadParameter) {
 17 | 	this.loaded = false;
 18 | 	this.index = [];
 19 | 	this.location = typeof(loadParameter) === "string" && loadParameter.length > 0 ? loadParameter : process.cwd() + "/cache/";
 20 | 	this.location = this.location.substr(this.location.length-1) === "/" ? this.location : this.location + "/";
 21 | };
 22 | 
 23 | // Function for sanitising paths
 24 | // We try to get the most understandable, file-system friendly paths we can.
 25 | // An extension is added if not present or inappropriate - if a better one can be determined.
 26 | // Querystrings are hashed to truncate without (hopefully) collision.
 27 | 
 28 | function sanitisePath(path,queueObject) {
 29 | 	// Remove first slash (as we set one later.)
 30 | 	path = path.replace(/^\//,"");
 31 | 
 32 | 	var pathStack = [];
 33 | 
 34 | 	// Trim whitespace. If no path is present - assume index.html.
 35 | 	var sanitisedPath = path.length ? path.replace(/\s*$/ig,"") : "index.html";
 36 | 	var headers = queueObject.stateData.headers, sanitisedPathParts;
 37 | 
 38 | 	if (sanitisedPath.match(/\?/)) {
 39 | 		sanitisedPathParts = sanitisedPath.split(/\?/g);
 40 | 		var resource	= sanitisedPathParts.shift();
 41 | 		var hashedQS	= crypto.createHash("sha1").update(sanitisedPathParts.join("?")).digest("hex");
 42 | 		sanitisedPath	= resource + "?" + hashedQS;
 43 | 	}
 44 | 
 45 | 	pathStack = sanitisedPath.split(/\//g);
 46 | 	pathStack = pathStack.map(function(pathChunk,count) {
 47 | 		if (pathChunk.length >= 250) {
 48 | 			return crypto.createHash("sha1").update(pathChunk).digest("hex");
 49 | 		}
 50 | 
 51 | 		return pathChunk;
 52 | 	});
 53 | 
 54 | 	sanitisedPath = pathStack.join("/");
 55 | 
 56 | 	// Try to get a file extension for the file - for ease of identification
 57 | 	// We run through this if we either:
 58 | 	//	1) haven't got a file extension at all, or:
 59 | 	//	2) have an HTML file without an HTML file extension (might be .php, .aspx, .do, or some other server-processed type)
 60 | 
 61 | 	if (!sanitisedPath.match(/\.[a-z0-9]{1,6}$/i) || (headers["content-type"] && headers["content-type"].match(/text\/html/i) && !sanitisedPath.match(/\.htm[l]?$/i))) {
 62 | 		var subMimeType = "";
 63 | 		var mimeParts = [];
 64 | 
 65 | 		if (headers["content-type"] && headers["content-type"].match(/text\/html/i)) {
 66 | 			if (sanitisedPath.match(/\/$/)) {
 67 | 				sanitisedPath += "index.html";
 68 | 			} else {
 69 | 				sanitisedPath += ".html";
 70 | 			}
 71 | 
 72 | 		} else if (headers["content-type"] && (mimeParts = headers["content-type"].match(/(image|video|audio|application)\/([a-z0-9]+)/i))) {
 73 | 			subMimeType = mimeParts[2];
 74 | 			sanitisedPath += "." + subMimeType;
 75 | 		}
 76 | 	}
 77 | 
 78 | 	return sanitisedPath;
 79 | }
 80 | 
 81 | FSBackend.prototype.fileExists = function(location) {
 82 | 	try {
 83 | 		fs.statSync(location);
 84 | 		return true;
 85 | 	} catch (er) {
 86 | 		return false;
 87 | 	}
 88 | };
 89 | 
 90 | FSBackend.prototype.isDirectory = function(location) {
 91 | 	try {
 92 | 		if (fs.statSync(location).isDirectory()) {
 93 | 			return true;
 94 | 		}
 95 | 
 96 | 		return false;
 97 | 	} catch (er) {
 98 | 		return false;
 99 | 	}
100 | };
101 | 
102 | FSBackend.prototype.load = function() {
103 | 	var backend = this;
104 | 
105 | 	if (!this.fileExists(this.location) && this.isDirectory(this.location)) {
106 | 		throw new Error("Unable to verify cache location exists.");
107 | 	}
108 | 
109 | 	try {
110 | 		var fileData;
111 | 		if ((fileData = fs.readFileSync(this.location + "cacheindex.json")) && fileData.length) {
112 | 			this.index = JSON.parse(fileData.toString("utf8"));
113 | 			this.loaded = true;
114 | 		}
115 | 	} catch(error) {
116 | 		if (error.code === "ENOENT") {
117 | 			// Cache index doesn't exist. Assume this is a new cache.
118 | 			// Just leave the memory index empty for now.
119 | 			this.loaded = true;
120 | 		} else {
121 | 			throw error;
122 | 		}
123 | 	}
124 | 
125 | 	// Flush store to disk when closing.
126 | 	process.on("exit",function() {
127 | 		backend.saveCache.apply(backend);
128 | 	});
129 | };
130 | 
131 | FSBackend.prototype.saveCache = function(callback) {
132 | 	fs.writeFile(this.location + "cacheindex.json", JSON.stringify(this.index), callback);
133 | };
134 | 
135 | FSBackend.prototype.setItem = function(queueObject,data,callback) {
136 | 	callback = callback instanceof Function ? callback : function(){};
137 | 
138 | 	var backend = this;
139 | 	var pathStack = [queueObject.protocol, queueObject.domain, queueObject.port];
140 | 	pathStack = pathStack.concat(sanitisePath(queueObject.path,queueObject).split(/\/+/g));
141 | 
142 | 	var cacheItemExists = false;
143 | 	var firstInstanceIndex = NaN;
144 | 	if (this.index.reduce(function(prev,current,index,array) {
145 | 			firstInstanceIndex = !isNaN(firstInstanceIndex) ? firstInstanceIndex : index;
146 | 			return prev || current.url === queueObject.url;
147 | 		},false)) {
148 | 		cacheItemExists = true;
149 | 	}
150 | 
151 | 	var writeFileData = function(currentPath,data) {
152 | 		fs.writeFile(currentPath,data,function(error) {
153 | 			if (error) throw error;
154 | 			fs.writeFile(currentPath + ".cacheData.json",JSON.stringify(queueObject),function(error) {
155 | 				if (error) throw error;
156 | 
157 | 				var cacheObject = {
158 | 					url: queueObject.url,
159 | 					etag: queueObject.stateData.headers.etag,
160 | 					lastModified: queueObject.stateData.headers['last-modified'],
161 | 					dataFile: currentPath,
162 | 					metaFile: currentPath + ".cacheData.json"
163 | 				};
164 | 
165 | 				if (cacheItemExists) {
166 | 					backend.index[firstInstanceIndex] = cacheObject;
167 | 				} else {
168 | 					backend.index.push(cacheObject);
169 | 				}
170 | 
171 | 				callback(cacheObject);
172 | 			});
173 | 		});
174 | 	};
175 | 
176 | 	pathStack.forEach(function(pathChunk,count) {
177 | 		var currentPath = backend.location + pathStack.slice(0,count+1).join("/");
178 | 		if (backend.fileExists(backend.location + pathStack.slice(0,count+1).join("/"))) {
179 | 			if (!backend.isDirectory(currentPath)) {
180 | 				if (count === pathStack.length -1) {
181 | 					// Just overwrite the file...
182 | 					writeFileData(currentPath,data);
183 | 				} else {
184 | 					throw new Error("Cache storage of resource (%s) blocked by file: %s",queueObject.url,currentPath);
185 | 				}
186 | 			}
187 | 		} else {
188 | 			if (count === pathStack.length -1) {
189 | 				// Write the file data in
190 | 				writeFileData(currentPath,data);
191 | 			} else {
192 | 				fs.mkdirSync(currentPath);
193 | 			}
194 | 		}
195 | 	});
196 | };
197 | 
198 | FSBackend.prototype.getItem = function(queueObject,callback) {
199 | 	var cacheItemResult = this.index.filter(function(item) {
200 | 			return item.url === queueObject.url;
201 | 		});
202 | 
203 | 	if (cacheItemResult.length) {
204 | 		var cacheItem = cacheItemResult.shift();
205 | 
206 | 		callback({
207 | 			"url": cacheItem.url,
208 | 			"etag": cacheItem.etag,
209 | 			"lastModified": cacheItem.lastModified,
210 | 			"getData": function(callback) {
211 | 				fs.readFile(cacheItem.dataFile,function(error,data) {
212 | 					if (error) {
213 | 						callback(error);
214 | 						return false;
215 | 					}
216 | 
217 | 					callback(null,data);
218 | 				});
219 | 			},
220 | 			"getMetadata": function(callback) {
221 | 				fs.readFile(cacheItem.metaFile,function(error,data) {
222 | 					if (error) {
223 | 						callback(error);
224 | 						return false;
225 | 					}
226 | 
227 | 					callback(null,JSON.parse(data.toString("utf8")));
228 | 				});
229 | 			}
230 | 		});
231 | 
232 | 	} else {
233 | 		callback(null);
234 | 	}
235 | 
236 | 	return false;
237 | };
238 | 
239 | 


--------------------------------------------------------------------------------
/test/cookies.js:
--------------------------------------------------------------------------------
  1 | // Ensures that cookie support is functional and reliable across
  2 | // a variety of different cookie formats. The more cookies I can add to this
  3 | // cookies array, the better the tests!
  4 | 
  5 | var chai = require("chai");
  6 | 	chai.should();
  7 | 
  8 | var cookies = [
  9 | 	"Set-Cookie: RMID=007f010019155170d6ca005f; Expires=Sat, 19 Apr 2020 05:31:54 GMT; Path=/; Domain=.nytimes.com;",
 10 | 	"Set-cookie: adxcs=-; path=/; domain=.nytimes.com",
 11 | 	"Set-Cookie: PREF=ID=8c63f2522e22574d:FF=0:TM=1366349569:LM=1366349569:S=p1Urbmfwfs-R573P; expires=Sun, 19-Apr-2015 05:32:49 GMT; path=/; domain=.google.com",
 12 | 	"Set-Cookie: NID=67=DhLO04YPAMlhETrTIe2oFPqWZfypQXLZfCIPItOvf70zhtUEMEItYfdVh6aROEzRHqtd9jHT6HJ7Oo93eqP3cjYNp8GgShfa6r0WVbsmQQRUvutbjBOPwzo7ybwYcWdB; expires=Sat, 19-Oct-2015 05:32:49 GMT; path=/; domain=.google.com; HttpOnly",
 13 | 	"Set-Cookie: fpc=d=Yq1z8hbA9WextmPFlb7suMTfMRgtSc2FyzAB7now1ExfUZ.eW7s4QSwSKlB6ZB0juN8OLZxWf_XXEIcspYaQmVVD0mD0xJ.xpXBCSw5Dl_Ql6n.RLoM.7CnTbNSsiSr2fkNiCN47tRUB4j8iWevNwQdFDn1hB8z8t1xwWt76n.sLIRY9p2_jTBhukfSD4SBpBkJhI1o-&v=2; expires=Sat, 19-Apr-2020 05:48:42 GMT; path=/; domain=www.yahoo.com",
 14 | 	"Set-Cookie: test=test; path=/test; domain=test.com"
 15 | ];
 16 | 
 17 | describe("Cookies",function() {
 18 | 
 19 | 	var CookieJar	= require("../lib/cookies.js"),
 20 | 		Cookie		= CookieJar.Cookie;
 21 | 
 22 | 	it("should be able parse from string properly",function() {
 23 | 
 24 | 		Cookie.should.be.a("function");
 25 | 		Cookie.fromString.should.be.a("function");
 26 | 		Cookie.fromString(cookies[0]).should.be.an("object");
 27 | 		Cookie.fromString(cookies[0]).should.be.an.instanceof(Cookie);
 28 | 
 29 | 		var tmpCookie = Cookie.fromString(cookies[0]);
 30 | 
 31 | 		tmpCookie.name.should.equal("RMID");
 32 | 		tmpCookie.value.should.equal("007f010019155170d6ca005f");
 33 | 		tmpCookie.expires.should.equal(1587274314000);
 34 | 		tmpCookie.path.should.equal("/");
 35 | 		tmpCookie.domain.should.equal(".nytimes.com");
 36 | 
 37 | 		// Test the next cookie...
 38 | 		tmpCookie = Cookie.fromString(cookies[1]);
 39 | 
 40 | 		tmpCookie.name.should.equal("adxcs");
 41 | 		tmpCookie.value.should.equal("-");
 42 | 		tmpCookie.expires.should.equal(-1);
 43 | 		tmpCookie.path.should.equal("/");
 44 | 		tmpCookie.domain.should.equal(".nytimes.com");
 45 | 
 46 | 	});
 47 | 
 48 | 	it("should be able to test for expiry",function() {
 49 | 
 50 | 		// Create a new cookie that should already have expired...
 51 | 		var tmpCookie = new Cookie("test","test",Date.now()-1000);
 52 | 
 53 | 		tmpCookie.isExpired().should.equal(true);
 54 | 
 55 | 		// Create a new cookie with an expiry 20 seconds in the future
 56 | 		tmpCookie = new Cookie("test","test",Date.now()+20000);
 57 | 
 58 | 		tmpCookie.isExpired().should.equal(false);
 59 | 	});
 60 | 
 61 | 	it("should be able to output the cookie object as a string",function() {
 62 | 
 63 | 		cookies.forEach(function(cookie) {
 64 | 			var tmpCookie		= Cookie.fromString(cookie),
 65 | 				outputString	= tmpCookie.toString(true),
 66 | 				reParsedCookie	= Cookie.fromString(outputString);
 67 | 
 68 | 			tmpCookie.name.should.equal(reParsedCookie.name);
 69 | 			tmpCookie.value.should.equal(reParsedCookie.value);
 70 | 			tmpCookie.expires.should.equal(reParsedCookie.expires);
 71 | 			tmpCookie.path.should.equal(reParsedCookie.path);
 72 | 			tmpCookie.domain.should.equal(reParsedCookie.domain);
 73 | 			tmpCookie.httponly.should.equal(reParsedCookie.httponly);
 74 | 		})
 75 | 	});
 76 | 
 77 | 	describe("Cookie Jar",function() {
 78 | 
 79 | 		it("should be able to be instantiated",function() {
 80 | 			var cookieJar = new CookieJar();
 81 | 		});
 82 | 
 83 | 		it("should be able to add cookies",function() {
 84 | 			var cookieJar = new CookieJar();
 85 | 
 86 | 			cookies.forEach(function(cookie) {
 87 | 				var parsedCookie = Cookie.fromString(cookie);
 88 | 
 89 | 				cookieJar.add(
 90 | 						parsedCookie.name,
 91 | 						parsedCookie.value,
 92 | 						parsedCookie.expires,
 93 | 						parsedCookie.path,
 94 | 						parsedCookie.domain,
 95 | 						parsedCookie.httponly);
 96 | 
 97 | 				var cookiesAdded = cookieJar.get(parsedCookie.name),
 98 | 					parsedCookie2 = cookiesAdded.pop();
 99 | 
100 | 				parsedCookie2.name.should.equal(parsedCookie.name);
101 | 				parsedCookie2.value.should.equal(parsedCookie.value);
102 | 				parsedCookie2.expires.should.equal(parsedCookie.expires);
103 | 				parsedCookie2.path.should.equal(parsedCookie.path);
104 | 				parsedCookie2.domain.should.equal(parsedCookie.domain);
105 | 				parsedCookie2.httponly.should.equal(parsedCookie.httponly);
106 | 			});
107 | 
108 | 			cookieJar.cookies.length.should.equal(cookies.length);
109 | 		});
110 | 
111 | 		it("should be able to remove cookies by name",function() {
112 | 			var cookieJar = new CookieJar();
113 | 
114 | 			cookies.forEach(function(cookie) {
115 | 				var parsedCookie = Cookie.fromString(cookie);
116 | 
117 | 				cookieJar.add(
118 | 						parsedCookie.name,
119 | 						parsedCookie.value,
120 | 						parsedCookie.expires,
121 | 						parsedCookie.path,
122 | 						parsedCookie.domain,
123 | 						parsedCookie.httponly);
124 | 			});
125 | 
126 | 			cookieJar.cookies.length.should.equal(cookies.length);
127 | 
128 | 			cookies.forEach(function(cookie,index) {
129 | 				var parsedCookie = Cookie.fromString(cookie);
130 | 
131 | 				cookieJar.remove(parsedCookie.name);
132 | 
133 | 				cookieJar.cookies.length.should.equal(
134 | 										cookies.length - (index+1));
135 | 			});
136 | 		});
137 | 
138 | 		it("should be able to retrieve cookies by name",function() {
139 | 			var cookieJar = new CookieJar();
140 | 
141 | 			cookies.forEach(function(cookie) {
142 | 				var parsedCookie = Cookie.fromString(cookie);
143 | 
144 | 				cookieJar.add(
145 | 						parsedCookie.name,
146 | 						parsedCookie.value,
147 | 						parsedCookie.expires,
148 | 						parsedCookie.path,
149 | 						parsedCookie.domain,
150 | 						parsedCookie.httponly);
151 | 
152 | 				var returnedCookies = cookieJar.get(parsedCookie.name),
153 | 					parsedCookie2 = returnedCookies.pop();
154 | 
155 | 				parsedCookie2.name.should.equal(parsedCookie.name);
156 | 				parsedCookie2.value.should.equal(parsedCookie.value);
157 | 				parsedCookie2.expires.should.equal(parsedCookie.expires);
158 | 				parsedCookie2.path.should.equal(parsedCookie.path);
159 | 				parsedCookie2.domain.should.equal(parsedCookie.domain);
160 | 				parsedCookie2.httponly.should.equal(parsedCookie.httponly);
161 | 			});
162 | 		});
163 | 
164 | 		it("should be able to accept cookies from a header/s",function() {
165 | 			var cookieJar = new CookieJar();
166 | 			cookieJar.addFromHeaders(cookies);
167 | 
168 | 			cookies.forEach(function(cookie) {
169 | 				var parsedCookie = Cookie.fromString(cookie);
170 | 				var returnedCookies = cookieJar.get(parsedCookie.name),
171 | 					parsedCookie2 = returnedCookies.slice(0,1).pop();
172 | 
173 | 				returnedCookies.length.should.equal(1);
174 | 				parsedCookie2.name.should.equal(parsedCookie.name);
175 | 				parsedCookie2.value.should.equal(parsedCookie.value);
176 | 				parsedCookie2.expires.should.equal(parsedCookie.expires);
177 | 				parsedCookie2.path.should.equal(parsedCookie.path);
178 | 				parsedCookie2.domain.should.equal(parsedCookie.domain);
179 | 				parsedCookie2.httponly.should.equal(parsedCookie.httponly);
180 | 			});
181 | 		});
182 | 
183 | 		it("should be able to generate a header from internal storage",function() {
184 | 			var cookieJar = new CookieJar();
185 | 			cookieJar.addFromHeaders(cookies);
186 | 			var comparisonHeaderList = cookieJar.getAsHeader();
187 | 
188 | 			comparisonHeaderList.should.be.an("array");
189 | 			comparisonHeaderList.length.should.equal(cookies.length);
190 | 
191 | 			comparisonHeaderList.forEach(function(header,index) {
192 | 				var parsedCookie = Cookie.fromString(cookies[index]);
193 | 				var parsedCookie2 = Cookie.fromString(header);
194 | 
195 | 				parsedCookie2.name.should.equal(parsedCookie.name);
196 | 				parsedCookie2.value.should.equal(parsedCookie.value);
197 | 				parsedCookie2.expires.should.equal(parsedCookie.expires);
198 | 				parsedCookie2.path.should.equal(parsedCookie.path);
199 | 				parsedCookie2.domain.should.equal(parsedCookie.domain);
200 | 				parsedCookie2.httponly.should.equal(parsedCookie.httponly);
201 | 			});
202 | 		});
203 | 
204 | 		it("should be able to filter generated headers by domain and path",function() {
205 | 			var cookieJar = new CookieJar();
206 | 			cookieJar.addFromHeaders(cookies);
207 | 			var comparisonHeaderList = cookieJar.getAsHeader("nytimes.com");
208 | 
209 | 			comparisonHeaderList.length.should.equal(2);
210 | 
211 | 			comparisonHeaderList = cookieJar.getAsHeader(null,"/");
212 | 
213 | 			// Even though there's 6 cookies.
214 | 			comparisonHeaderList.length.should.equal(5);
215 | 		});
216 | 
217 | 		it("should be able to filter generated headers by expiry",function() {
218 | 			var cookieJar = new CookieJar();
219 | 			cookieJar.addFromHeaders(cookies);
220 | 
221 | 			// set the expiry on one of the headers to some point far in the past
222 | 			cookieJar.cookies[0].expires /= 2;
223 | 
224 | 			// Get the headers...
225 | 			var comparisonHeaderList = cookieJar.getAsHeader();
226 | 
227 | 			comparisonHeaderList.length.should.equal(cookies.length-1);
228 | 		});
229 | 	});
230 | });
231 | 


--------------------------------------------------------------------------------
/lib/queue.js:
--------------------------------------------------------------------------------
  1 | // Simplecrawler - queue module
  2 | // Christopher Giffard, 2011
  3 | //
  4 | // http://www.github.com/cgiffard/node-simplecrawler
  5 | 
  6 | 
  7 | var fs = require("fs");
  8 | 
  9 | var allowedStatistics = [
 10 | 	"requestTime",
 11 | 	"requestLatency",
 12 | 	"downloadTime",
 13 | 	"contentLength",
 14 | 	"actualDataSize"
 15 | ];
 16 | 
 17 | var FetchQueue = function(){
 18 | 	this.oldestUnfetchedIndex = 0;
 19 | 	this.completeCache = 0;
 20 | 	this.scanIndex = {};
 21 | };
 22 | 
 23 | module.exports = FetchQueue;
 24 | 
 25 | FetchQueue.prototype = [];
 26 | FetchQueue.prototype.add = function(protocol, domain, port, path, depth, callback) {
 27 | 
 28 | 	// For legacy reasons
 29 | 	if (depth instanceof Function) {
 30 | 		callback = depth;
 31 | 		depth = 1;
 32 | 	}
 33 | 	
 34 | 	depth = depth || 1;
 35 | 	callback = callback && callback instanceof Function ? callback : function(){};
 36 | 	var self = this;
 37 | 
 38 | 	// Ensure all variables conform to reasonable defaults
 39 | 	protocol = protocol === "https" ? "https" : "http";
 40 | 
 41 | 	if (isNaN(port) || !port) {
 42 | 		return callback(new Error("Port must be numeric!"));
 43 | 	}
 44 | 
 45 | 	var url = protocol + "://" + domain + (port !== 80 ? ":" + port : "") + path;
 46 | 
 47 | 	this.exists(protocol,domain,port,path,
 48 | 		function(err,exists) {
 49 | 			if (err) return callback(err);
 50 | 
 51 | 			if (!exists) {
 52 | 				var queueItem = {
 53 | 					"url": url,
 54 | 					"protocol": protocol,
 55 | 					"host": domain,
 56 | 					"port": port,
 57 | 					"path": path,
 58 | 					"depth": depth,
 59 | 					"fetched": false,
 60 | 					"status": "queued",
 61 | 					"stateData": {}
 62 | 				};
 63 | 
 64 | 				self.push(queueItem);
 65 | 				callback(null, queueItem);
 66 | 			} else {
 67 | 				var error = new Error("Resource already exists in queue!");
 68 | 				error.code = "DUP";
 69 | 
 70 | 				callback(error);
 71 | 			}
 72 | 		});
 73 | };
 74 | 
 75 | // Check if an item already exists in the queue...
 76 | FetchQueue.prototype.exists = function(protocol, domain, port, path, callback) {
 77 | 	callback = callback && callback instanceof Function ? callback : function(){};
 78 | 
 79 | 	port = (port !== 80 ? ":" + port : "");
 80 | 
 81 | 	var url =
 82 | 		(protocol + "://" + domain + port + path)
 83 | 			.toLowerCase();
 84 | 
 85 | 	if (!!this.scanIndex[url]) {
 86 | 		callback(null, 1);
 87 | 		return 1;
 88 | 	} else {
 89 | 		this.scanIndex[url] = true;
 90 | 		callback(null, 0);
 91 | 		return 0;
 92 | 	}
 93 | };
 94 | 
 95 | // Get last item in queue...
 96 | FetchQueue.prototype.last = function(callback) {
 97 | 	callback = callback && callback instanceof Function ? callback : function(){};
 98 | 	var item, self = this;
 99 | 	item = self[self.length-1];
100 | 	callback(null, item);
101 | 	return item;
102 | };
103 | 
104 | // Get item from queue
105 | FetchQueue.prototype.get = function(id, callback) {
106 | 	callback = callback && callback instanceof Function ? callback : function(){};
107 | 	var item, self = this;
108 | 
109 | 	if (!isNaN(id) && self.length > id) {
110 | 		item = self[id];
111 | 		callback(null, item);
112 | 		return item;
113 | 	}
114 | };
115 | 
116 | // Get first unfetched item in the queue (and return its index)
117 | FetchQueue.prototype.oldestUnfetchedItem = function(callback) {
118 | 	callback = callback && callback instanceof Function ? callback : function(){};
119 | 	var item, self = this;
120 | 
121 | 	for (var itemIndex = self.oldestUnfetchedIndex; itemIndex < self.length; itemIndex ++) {
122 | 		if (self[itemIndex].status === "queued") {
123 | 			self.oldestUnfetchedIndex = itemIndex;
124 | 			item = self[itemIndex];
125 | 			callback(null, item);
126 | 			return item;
127 | 		}
128 | 	}
129 | 
130 | 	callback(new Error("No unfetched items remain."));
131 | };
132 | 
133 | // Gets the maximum total request time, request latency, or download time
134 | FetchQueue.prototype.max = function(statisticName, callback) {
135 | 	callback = callback && callback instanceof Function ? callback : function(){};
136 | 	var maxStatisticValue = 0, self = this;
137 | 
138 | 	if (allowedStatistics.join().indexOf(statisticName) === -1) {
139 | 		// Not a recognised statistic!
140 | 		return callback(new Error("Invalid statistic."));
141 | 	}
142 | 
143 | 	self.forEach(function(item) {
144 | 		if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] > maxStatisticValue) {
145 | 			maxStatisticValue = item.stateData[statisticName];
146 | 		}
147 | 	});
148 | 
149 | 	callback(null, maxStatisticValue);
150 | 	return maxStatisticValue;
151 | };
152 | 
153 | // Gets the minimum total request time, request latency, or download time
154 | FetchQueue.prototype.min = function(statisticName, callback) {
155 | 	callback = callback && callback instanceof Function ? callback : function(){};
156 | 	var minimum, minStatisticValue = Infinity, self = this;
157 | 
158 | 	if (allowedStatistics.join().indexOf(statisticName) === -1) {
159 | 		// Not a recognised statistic!
160 | 		return callback(new Error("Invalid statistic."));
161 | 	}
162 | 
163 | 	self.forEach(function(item) {
164 | 		if (item.fetched && item.stateData[statisticName] !== null && item.stateData[statisticName] < minStatisticValue) {
165 | 			minStatisticValue = item.stateData[statisticName];
166 | 		}
167 | 	});
168 | 	
169 | 	minimum = minStatisticValue === Infinity? 0 : minStatisticValue;
170 | 	callback(null, minimum);
171 | 	return minimum;
172 | };
173 | 
174 | // Gets the minimum total request time, request latency, or download time
175 | FetchQueue.prototype.avg = function(statisticName, callback) {
176 | 	callback = callback && callback instanceof Function ? callback : function(){};
177 | 	var average, NumberSum = 0, NumberCount = 0, self = this;
178 | 
179 | 	if (allowedStatistics.join().indexOf(statisticName) === -1) {
180 | 		// Not a recognised statistic!
181 | 		return callback(new Error("Invalid statistic."));
182 | 	}
183 | 
184 | 	self.forEach(function(item) {
185 | 		if (item.fetched && item.stateData[statisticName] !== null && !isNaN(item.stateData[statisticName])) {
186 | 			NumberSum += item.stateData[statisticName];
187 | 			NumberCount ++;
188 | 		}
189 | 	});
190 | 	average = NumberSum / NumberCount;
191 | 	callback(null, average);
192 | 	return average;
193 | };
194 | 
195 | // Gets the number of requests which have been completed.
196 | FetchQueue.prototype.complete = function(callback) {
197 | 	callback = callback && callback instanceof Function ? callback : function(){};
198 | 	var NumberComplete = 0, self = this;
199 | 
200 | 	self.forEach(function(item) {
201 | 		if (item.fetched) {
202 | 			NumberComplete ++;
203 | 		}
204 | 	});
205 | 
206 | 	callback(null, NumberComplete);
207 | 	return NumberComplete;
208 | };
209 | 
210 | // Gets the number of queue items with the given status
211 | FetchQueue.prototype.countWithStatus = function(status, callback) {
212 | 	callback = callback && callback instanceof Function ? callback : function(){};
213 | 	var queueItemsMatched = 0, self = this;
214 | 
215 | 	self.forEach(function(item) {
216 | 		if (item.status === status) {
217 | 			queueItemsMatched ++;
218 | 		}
219 | 	});
220 | 
221 | 	callback(null,queueItemsMatched);
222 | 	return queueItemsMatched;
223 | };
224 | 
225 | // Gets the number of queue items with the given status
226 | FetchQueue.prototype.getWithStatus = function(status, callback) {
227 | 	callback = callback && callback instanceof Function ? callback : function(){};
228 | 	var subqueue = [], self = this;
229 | 
230 | 	self.forEach(function(item,index) {
231 | 		if (item.status === status) {
232 | 			subqueue.push(item);
233 | 			subqueue[subqueue.length-1].queueIndex = index;
234 | 		}
235 | 	});
236 | 
237 | 	callback(null,subqueue);
238 | 	return subqueue;
239 | };
240 | 
241 | // Gets the number of requests which have failed for some reason
242 | FetchQueue.prototype.errors = function(callback) {
243 | 	callback = callback && callback instanceof Function ? callback : function(){};
244 | 	var total, failedCount, notFoundCount, self = this;
245 | 
246 | 	failedCount = self.countWithStatus("failed");
247 | 	notFoundCount = self.countWithStatus("notfound");
248 | 	total = failedCount + notFoundCount;
249 | 	callback(null, total);
250 | 	return total;
251 | };
252 | 
253 | // Gets the number of items in the queue
254 | FetchQueue.prototype.getLength = function(callback) {
255 | 	return callback(null, this.length);
256 | };
257 | 
258 | // Writes the queue to disk
259 | FetchQueue.prototype.freeze = function(filename,callback) {
260 | 	callback = callback && callback instanceof Function ? callback : function(){};
261 | 	var self = this;
262 | 
263 | 	// Re-queue in-progress items before freezing...
264 | 	self.forEach(function(item) {
265 | 		if (item.fetched !== true) {
266 | 			item.status = "queued";
267 | 		}
268 | 	});
269 | 
270 | 	fs.writeFile(filename,JSON.stringify(self),function(err) {
271 | 		callback(err, self);
272 | 	});
273 | };
274 | 
275 | // Reads the queue from disk
276 | FetchQueue.prototype.defrost = function(filename, callback) {
277 | 	callback = callback && callback instanceof Function ? callback : function(){};
278 | 	var fileData, self = this, defrostedQueue = [];
279 | 
280 | 	fs.readFile(filename,function(err,fileData) {
281 | 		if (err) return callback(err);
282 | 
283 | 		if (!fileData.toString("utf8").length) {
284 | 			return callback(new Error("Failed to defrost queue from zero-length JSON."));
285 | 		}
286 | 
287 | 		try {
288 | 			defrostedQueue = JSON.parse(fileData.toString("utf8"));
289 | 		} catch(error) {
290 | 			return callback(error);
291 | 		}
292 | 
293 | 		self.oldestUnfetchedIndex = Infinity;
294 | 		self.scanIndex = {};
295 | 
296 | 		for (var index in defrostedQueue) {
297 | 			if (defrostedQueue.hasOwnProperty(index) && !isNaN(index)) {
298 | 				var queueItem = defrostedQueue[index];
299 | 				self.push(queueItem);
300 | 
301 | 				if (queueItem.status !== "downloaded")
302 | 					self.oldestUnfetchedIndex = Math.min(
303 | 							self.oldestUnfetchedIndex, index);
304 | 
305 | 				self.scanIndex[queueItem.url] = true;
306 | 			}
307 | 		}
308 | 
309 | 		if (self.oldestUnfetchedIndex === Infinity)
310 | 			self.oldestUnfetchedIndex = 0;
311 | 
312 | 		callback(null,self);
313 | 	});
314 | };
315 | 


--------------------------------------------------------------------------------
/lib/cookies.js:
--------------------------------------------------------------------------------
  1 | // Cookie Jar Functionality
  2 | var EventEmitter	= require("events").EventEmitter,
  3 | 	util			= require("util");
  4 | 
  5 | /*
  6 | 	Public: Constructor for the cookie jar.
  7 | 
  8 | 	Examples
  9 | 
 10 | 		var cookieJar = new CookieJar();
 11 | 
 12 | 	Returns the cookie jar object which has now been constructed.
 13 | 
 14 | */
 15 | function CookieJar() {
 16 | 	var cookies = [];
 17 | 	this.__defineGetter__("cookies",function() {
 18 | 		return cookies;
 19 | 	});
 20 | 
 21 | 	// Run the EventEmitter constructor
 22 | 	EventEmitter.call(this);
 23 | }
 24 | 
 25 | util.inherits(CookieJar,EventEmitter);
 26 | 
 27 | /*
 28 | 	Public: Adds a new cookie to the jar, either by creating a new Cookie() object
 29 | 	from specific details such as name, value, etc., accepting a string from a
 30 | 	Set-Cookie header, or by passing in an existing Cookie() object.
 31 | 
 32 | 	name				-	The name of the cookie to add. Alternately, set-cookie
 33 | 							header as string, or an existing cookie object.
 34 | 	value				-	The value of the cookie.
 35 | 	expiry				-	Expiry timestamp in milliseconds.
 36 | 	path				-	Limit cookie to path (defaults to "/")
 37 | 	domain				-	Limit cookie to domain
 38 | 	httponly			-	Boolean value specifying httponly
 39 | 	cb					-	Optional callback.
 40 | 
 41 | 	Emits
 42 | 
 43 | 		addcookie		-	Emitted with new cookie object as an argument.
 44 | 
 45 | 	Examples
 46 | 
 47 | 		cookieJar.add("mycookie","myValue",Date.now(),"/","test.com",false);
 48 | 
 49 | 	Returns the cookie jar object for chaining.
 50 | 
 51 | */
 52 | CookieJar.prototype.add = function(name,value,expiry,path,domain,httponly,cb) {
 53 | 
 54 | 	var existingIndex = -1, newCookie;
 55 | 
 56 | 	if (arguments.length > 1) {
 57 | 		newCookie = new Cookie(name,value,expiry,path,domain,httponly);
 58 | 	} else if (name instanceof Cookie) {
 59 | 		newCookie = name;
 60 | 	} else {
 61 | 		newCookie = Cookie.fromString(name);
 62 | 	}
 63 | 
 64 | 	// Are we updating an existing cookie or adding a new one?
 65 | 	this.cookies.forEach(function(cookie,index) {
 66 | 		if (cookie.name === newCookie.name &&
 67 | 			cookie.matchDomain(newCookie.domain)) {
 68 | 
 69 | 			existingIndex = index;
 70 | 		}
 71 | 	});
 72 | 
 73 | 	if (existingIndex < 0) {
 74 | 		this.cookies.push(newCookie);
 75 | 	} else {
 76 | 		this.cookies[existingIndex] = newCookie;
 77 | 	}
 78 | 
 79 | 	this.emit("addcookie",newCookie);
 80 | 
 81 | 	if (cb && cb instanceof Function)
 82 | 		cb(null,newCookie);
 83 | 
 84 | 	return this;
 85 | };
 86 | 
 87 | /*
 88 | 	Public: Removes cookies from the cookie jar. If no domain and name are
 89 | 	specified, all cookies in the jar are removed.
 90 | 
 91 | 	name				-	The name of the cookie(s) to remove
 92 | 	domain				-	The domain from which to remove cookies.
 93 | 	cb					-	Optional callback.
 94 | 
 95 | 	Emits
 96 | 
 97 | 		removecookie	-	Emitted with array of removed cookies.
 98 | 
 99 | 	Examples
100 | 
101 | 		cookieJar.remove(null,"nytimes.com");
102 | 
103 | 	Returns an array of removed cookies.
104 | 
105 | */
106 | CookieJar.prototype.remove = function(name,domain,cb) {
107 | 	var cookiesRemoved = [], jar = this;
108 | 
109 | 	this.cookies.forEach(function(cookie,index) {
110 | 
111 | 		// If the names don't match, we're not removing this cookie
112 | 		if (!!name && cookie.name !== name)
113 | 			return false;
114 | 
115 | 		// If the domains don't match, we're not removing this cookie
116 | 		if (!!domain && !cookie.matchDomain(domain))
117 | 			return false;
118 | 
119 | 		// Matched. Remove!
120 | 		cookiesRemoved.push(jar.cookies.splice(index,1));
121 | 	});
122 | 
123 | 	jar.emit("removecookie",cookiesRemoved);
124 | 
125 | 	if (cb && cb instanceof Function)
126 | 		cb(null,cookiesRemoved);
127 | 
128 | 	return cookiesRemoved;
129 | };
130 | 
131 | /*
132 | 	Public: Gets an array of cookies based on name and domain.
133 | 
134 | 	name				-	The name of the cookie(s) to retrieve
135 | 	domain				-	The domain from which to retrieve cookies.
136 | 	cb					-	Optional callback.
137 | 
138 | 	Examples
139 | 
140 | 		cookieJar.get(null,"nytimes.com");
141 | 
142 | 	Returns an array of cookies.
143 | 
144 | */
145 | CookieJar.prototype.get = function(name,domain,cb) {
146 | 
147 | 	var cookies =
148 | 		this.cookies.filter(function(cookie,index) {
149 | 
150 | 			// If the names don't match, we're not returning this cookie
151 | 			if (!!name && cookie.name !== name)
152 | 				return false;
153 | 
154 | 			// If the domains don't match, we're not returning this cookie
155 | 			if (!!domain && !cookie.matchDomain(domain))
156 | 				return false;
157 | 
158 | 			return true;
159 | 		});
160 | 
161 | 	if (cb && cb instanceof Function)
162 | 		cb(null,cookies);
163 | 
164 | 	return cookies;
165 | };
166 | 
167 | /*
168 | 	Public: Generates an array of headers based on the value of the cookie jar.
169 | 
170 | 	domain				-	The domain from which to generate cookies.
171 | 	path				-	Filter headers to cookies applicable to this path.
172 | 	cb					-	Optional callback.
173 | 
174 | 	Examples
175 | 
176 | 		cookieJar.getAsHeader("nytimes.com","/myaccount");
177 | 
178 | 	Returns an array of cookie headers.
179 | 
180 | */
181 | CookieJar.prototype.getAsHeader = function(domain,path,cb) {
182 | 
183 | 	var headers =
184 | 		this.cookies
185 | 			.filter(function(cookie) {
186 | 				if (cookie.isExpired()) return false;
187 | 				if (!domain && !path) return true;
188 | 				if (domain) return cookie.matchDomain(domain);
189 | 				if (path) return cookie.matchPath(path);
190 | 			})
191 | 			.map(function(cookie) {
192 | 				return cookie.toString();
193 | 			});
194 | 
195 | 	if (cb && cb instanceof Function)
196 | 		cb(null,headers);
197 | 
198 | 	return headers;
199 | };
200 | 
201 | /*
202 | 	Public: Adds cookies to the cookie jar based on an array of 'set-cookie'
203 | 	headers provided by a webserver. Duplicate cookies are overwritten.
204 | 
205 | 	headers				-	An array of 'set-cookie' headers
206 | 	cb					-	Optional callback.
207 | 
208 | 	Examples
209 | 
210 | 		cookieJar.addFromHeaders(res.headers["set-cookie"]);
211 | 
212 | 	Returns the cookie jar for chaining.
213 | 
214 | */
215 | CookieJar.prototype.addFromHeaders = function(headers,cb) {
216 | 	var jar = this;
217 | 
218 | 	if (!(headers instanceof Array))
219 | 		headers = [headers];
220 | 
221 | 	headers.forEach(function(header) {
222 | 		jar.add(header);
223 | 	});
224 | 
225 | 	if (cb && cb instanceof Function)
226 | 		cb(jar);
227 | 
228 | 	return jar;
229 | };
230 | 
231 | /*
232 | 	Public: Outputs a linefeed-separated list of set-cookie headers representing
233 | 	the entire contents of the cookie jar.
234 | 
235 | 	Examples
236 | 
237 | 		cookieJar.toString();
238 | 
239 | 	Returns a list of headers in string form.
240 | 
241 | */
242 | CookieJar.prototype.toString = function() {
243 | 	return this.getAsHeader().join("\n");
244 | };
245 | 
246 | 
247 | /*
248 | 	Public: Constructor for the Cookie() object: create a new cookie.
249 | 
250 | 	name				-	The name of the cookie to add.
251 | 	value				-	The value of the cookie.
252 | 	expires				-	Expiry timestamp in milliseconds.
253 | 	path				-	Limit cookie to path (defaults to "/")
254 | 	domain				-	Limit cookie to domain
255 | 	httponly			-	Boolean value specifying httponly
256 | 
257 | 	Examples
258 | 
259 | 		var myCookie = new Cookie("mycookie","myValue",Date.now(),"/","test.com",false);
260 | 
261 | 	Returns the newly created Cookie object.
262 | 
263 | */
264 | function Cookie(name,value,expires,path,domain,httponly) {
265 | 
266 | 	if (!name) throw new Error("A name is required to create a cookie.");
267 | 
268 | 	// Parse date to timestamp - consider it never expiring if timestamp is not
269 | 	// passed to the function
270 | 	if (expires) {
271 | 
272 | 		if (typeof expires !== "number")
273 | 			expires = (new Date(expires)).getTime();
274 | 
275 | 	} else {
276 | 		expires = -1;
277 | 	}
278 | 
279 | 	this.name		= name;
280 | 	this.value		= value || "";
281 | 	this.expires	= expires;
282 | 	this.path		= path || "/";
283 | 	this.domain		= domain || "*";
284 | 	this.httponly	= !!httponly;
285 | }
286 | 
287 | /*
288 | 	Public, Static: Returns a new Cookie() object based on a header string.
289 | 
290 | 	string				-	A set-cookie header string
291 | 
292 | 	Examples
293 | 
294 | 		var myCookie = Cookie.fromString(response.headers["set-cookie"][0]);
295 | 
296 | 	Returns the newly created Cookie object.
297 | 
298 | */
299 | Cookie.fromString = function(string) {
300 | 
301 | 	if (!string || typeof string !== "string")
302 | 		throw new Error("String must be supplied to generate a cookie.");
303 | 
304 | 	function parseKeyVal(input) {
305 | 		var key = input.split(/\=/).shift(),
306 | 			val	= input.split(/\=/).slice(1).join("=");
307 | 
308 | 		return [key,val];
309 | 	}
310 | 
311 | 	string = string.replace(/^\s*set\-cookie\s*\:\s*/i,"");
312 | 
313 | 	var parts		= string.split(/\s*\;\s*/i),
314 | 		name		= parseKeyVal(parts.shift()),
315 | 		keyValParts	= {};
316 | 
317 | 	keyValParts.name = name[0];
318 | 	keyValParts.value = name[1];
319 | 
320 | 	parts
321 | 		.filter(function(input) {
322 | 			return !!input.replace(/\s+/ig,"").length;
323 | 		})
324 | 		.map(parseKeyVal)
325 | 		.forEach(function(keyval) {
326 | 			var key = String(keyval[0]).toLowerCase().replace(/[^a-z0-9]/ig,"");
327 | 			keyValParts[key] = keyval[1];
328 | 		});
329 | 
330 | 	return new Cookie(
331 | 		keyValParts.name,
332 | 		keyValParts.value,
333 | 		keyValParts.expires || keyValParts.expiry,
334 | 		keyValParts.path,
335 | 		keyValParts.domain,
336 | 		keyValParts.hasOwnProperty("httponly")
337 | 	);
338 | };
339 | 
340 | /*
341 | 	Public: Outputs the cookie as a string, in the form of a set-cookie header.
342 | 
343 | 	includeHeader		-	Boolean value specifying whether to include the
344 | 							'Set-Cookie: ' header name at the beginning of the
345 | 							string.
346 | 
347 | 	Examples
348 | 
349 | 		var header = myCookie.toString(true);
350 | 
351 | 	Returns the header string.
352 | 
353 | */
354 | Cookie.prototype.toString = function(includeHeader) {
355 | 	var string = "";
356 | 
357 | 	if (includeHeader) string = "Set-Cookie: ";
358 | 
359 | 	string += this.name + "=" + this.value + "; ";
360 | 
361 | 	if (this.expires > 0)
362 | 		string += "Expires=" + (new Date(this.expires)).toGMTString() + "; ";
363 | 
364 | 	if (!!this.path)
365 | 		string += "Path=" + this.path + "; ";
366 | 
367 | 	if (!!this.domain)
368 | 		string += "Domain=" + this.domain + "; ";
369 | 
370 | 	if (!!this.httponly)
371 | 		string += "Httponly; ";
372 | 
373 | 	return string;
374 | };
375 | 
376 | /*
377 | 	Public: Determines whether a cookie has expired or not.
378 | 
379 | 	Examples
380 | 
381 | 		if (myCookie.isExpired()) { ... }
382 | 
383 | 	Returns a boolean value specifying whether the cookie has expired (true) or
384 | 	whether it is still valid (false.)
385 | 
386 | */
387 | Cookie.prototype.isExpired = function() {
388 | 	if (this.expires < 0) return false;
389 | 	return (this.expires < Date.now());
390 | };
391 | 
392 | /*
393 | 	Public: Determines whether a cookie matches a given domain.
394 | 
395 | 	Examples
396 | 
397 | 		if (myCookie.matchDomain("example.com")) { ... }
398 | 
399 | 	Returns a boolean value specifying whether the cookie matches (true) or
400 | 	doesn't match (false.)
401 | 
402 | */
403 | Cookie.prototype.matchDomain = function(domain) {
404 | 	var reverseDomain = this.domain.split("").reverse().join(""),
405 | 		reverseDomainComp = domain.split("").reverse().join("");
406 | 
407 | 	return reverseDomain.indexOf(reverseDomainComp) === 0;
408 | };
409 | 
410 | /*
411 | 	Public: Determines whether a cookie matches a given path.
412 | 
413 | 	Examples
414 | 
415 | 		if (myCookie.matchPath("/test/account")) { ... }
416 | 
417 | 	Returns a boolean value specifying whether the cookie matches (true) or
418 | 	doesn't match (false.)
419 | 
420 | */
421 | Cookie.prototype.matchPath = function(path) {
422 | 	if (!this.path) return true;
423 | 
424 | 	return path.indexOf(this.path) === 0;
425 | };
426 | 
427 | module.exports = CookieJar;
428 | module.exports.Cookie = Cookie;
429 | 


--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
  1 | # Simple web-crawler for node.js [![Build Status](https://travis-ci.org/cgiffard/node-simplecrawler.png?branch=master)](https://travis-ci.org/cgiffard/node-simplecrawler)
  2 | 
  3 | Simplecrawler is designed to provide the most basic possible API for crawling
  4 | websites, while being as flexible and robust as possible. I wrote simplecrawler
  5 | to archive, analyse, and search some very large websites. It has happily chewed
  6 | through 50,000 pages and written tens of gigabytes to disk without issue.
  7 | 
  8 | #### Example (simple mode)
  9 | 
 10 | ```javascript
 11 | var Crawler = require("simplecrawler");
 12 | 
 13 | Crawler.crawl("http://example.com/")
 14 | 	.on("fetchcomplete",function(queueItem){
 15 | 		console.log("Completed fetching resource:",queueItem.url);
 16 | 	});
 17 | ```
 18 | 
 19 | ### What does simplecrawler do?
 20 | 
 21 | * Provides a very simple event driven API using `EventEmitter`
 22 | * Extremely configurable base for writing your own crawler
 23 | * Provides some simple logic for autodetecting linked resources - which you can
 24 | replace or augment
 25 | * Has a flexible queue system which can be frozen to disk and defrosted
 26 | * Provides basic statistics on network performance
 27 | * Uses buffers for fetching and managing data, preserving binary data (except
 28 | when discovering links)
 29 | 
 30 | ### Installation
 31 | 
 32 | ```
 33 | npm install simplecrawler
 34 | ```
 35 | 
 36 | ### Getting Started
 37 | 
 38 | There are two ways of instantiating a new crawler - a simple but less flexible
 39 | method inspired by [anemone](http://anemone.rubyforge.org), and the traditional
 40 | method which provides a little more room to configure crawl parameters.
 41 | 
 42 | Regardless of wether you use the simple or traditional methods of instantiation,
 43 | you'll need to require simplecrawler:
 44 | 
 45 | ```javascript
 46 | var Crawler = require("simplecrawler");
 47 | ```
 48 | 
 49 | #### Simple Mode
 50 | 
 51 | Simple mode generates a new crawler for you, preconfigures it based on a URL you
 52 | provide, and returns the crawler to you for further configuration and so you can
 53 | attach event handlers.
 54 | 
 55 | Simply call `Crawler.crawl`, with a URL first parameter, and two optional
 56 | functions that will be added as event listeners for `fetchcomplete` and
 57 | `fetcherror` respectively.
 58 | 
 59 | ```javascript
 60 | Crawler.crawl("http://example.com/", function(queueItem){
 61 | 	console.log("Completed fetching resource:",queueItem.url);
 62 | });
 63 | ```
 64 | 
 65 | Alternately, if you decide to omit these functions, you can use the returned
 66 | crawler object to add the event listeners yourself, and tweak configuration
 67 | options:
 68 | 
 69 | ```javascript
 70 | var crawler = Crawler.crawl("http://example.com/");
 71 | 
 72 | crawler.interval = 500;
 73 | 
 74 | crawler.on("fetchcomplete",function(queueItem){
 75 | 	console.log("Completed fetching resource:",queueItem.url);
 76 | });
 77 | ```
 78 | 
 79 | #### Advanced Mode
 80 | 
 81 | The alternative method of creating a crawler is to call the `simplecrawler`
 82 | constructor yourself, and to initiate the crawl manually.
 83 | 
 84 | ```javascript
 85 | var myCrawler = new Crawler("www.example.com");
 86 | ```
 87 | 
 88 | Nonstandard port? HTTPS? Want to start archiving a specific path? No problem:
 89 | 
 90 | ```javascript
 91 | myCrawler.initialPath = "/archive";
 92 | myCrawler.initialPort = 8080;
 93 | myCrawler.initialProtocol = "https";
 94 | 
 95 | // Or:
 96 | var myCrawler = new Crawler("www.example.com","/archive",8080);
 97 | 
 98 | ```
 99 | 
100 | And of course, you're probably wanting to ensure you don't take down your web
101 | server. Decrease the concurrency from five simultaneous requests - and increase
102 | the request interval from the default 250ms like this:
103 | 
104 | ```javascript
105 | myCrawler.interval = 10000; // Ten seconds
106 | myCrawler.maxConcurrency = 1;
107 | ```
108 | 
109 | You can also define a max depth for links to fetch :
110 | ```javascript
111 | myCrawler.maxDepth = 1; // Only first page is fetched (with linked CSS & images)
112 | // Or: 
113 | myCrawler.maxDepth = 2; // First page and discovered links from it are fetched
114 | // Or: 
115 | myCrawler.maxDepth = 3; // Etc.
116 | ```
117 | 
118 | For brevity, you may also specify the initial path and request interval when
119 | creating the crawler:
120 | 
121 | ```javascript
122 | var myCrawler = new Crawler("www.example.com","/",8080,300);
123 | ```
124 | 
125 | ### Running the crawler
126 | 
127 | First, you'll need to set up an event listener to get the fetched data:
128 | 
129 | ```javascript
130 | myCrawler.on("fetchcomplete",function(queueItem, responseBuffer, response) {
131 | 	console.log("I just received %s (%d bytes)",queueItem.url,responseBuffer.length);
132 | 	console.log("It was a resource of type %s",response.headers['content-type']);
133 | 	
134 | 	// Do something with the data in responseBuffer
135 | });
136 | ```
137 | 
138 | Then, when you're satisfied you're ready to go, start the crawler! It'll run
139 | through its queue finding linked resources on the domain to download, until it
140 | can't find any more.
141 | 
142 | ```javascript
143 | myCrawler.start();
144 | ```
145 | 
146 | Of course, once you've got that down pat, there's a fair bit more you can listen for...
147 | 
148 | ### Events
149 | 
150 | * `crawlstart`
151 | Fired when the crawl begins or is restarted.
152 | * `queueadd` ( queueItem )
153 | Fired when a new item is automatically added to the queue (not when you manually
154 | queue an item yourself.)
155 | * `queueduplicate` ( URLData )
156 | Fired when an item cannot be added to the queue because it is already present in
157 | the queue. Frequent firing of this event is normal and expected.
158 | * `queueerror` ( errorData , URLData )
159 | Fired when an item cannot be added to the queue due to error.
160 | * `fetchstart` ( queueItem , requestOptions )
161 | Fired when an item is spooled for fetching. If your event handler is synchronous,
162 | you can modify the crawler request options (including headers and request method.)
163 | * `fetchheaders` ( queueItem , responseObject )
164 | Fired when the headers for a resource are received from the server. The node http
165 | response object is returned for your perusal.
166 | * `fetchcomplete` ( queueItem , responseBuffer , response )
167 | Fired when the resource is completely downloaded. The entire file data is provided
168 | as a buffer, as well as the response object.
169 | * `fetchdataerror` ( queueItem, response )
170 | Fired when a resource can't be downloaded, because it exceeds the maximum size
171 | we're prepared to receive (16MB by default.)
172 | * `fetchredirect` ( queueItem, parsedURL, response )
173 | Fired when a redirect header is encountered. The new URL is validated and returned
174 | as a complete canonical link to the new resource.
175 | * `fetch404` ( queueItem, response )
176 | Fired when a 404 HTTP status code is returned for a request.
177 | * `fetcherror` ( queueItem, response )
178 | Fired when an alternate 400 or 500 series HTTP status code is returned for a
179 | request.
180 | * `fetchtimeout` ( queueItem, crawlerTimeoutValue )
181 | Fired when a request time exceeds the internal crawler threshold.
182 | * `fetchclienterror` ( queueItem, errorData )
183 | Fired when a request dies locally for some reason. The error data is returned as
184 | the second parameter.
185 | * `discoverycomplete` ( queueItem, resources )
186 | Fired when linked resources have been discovered. Passes an array of resources
187 | (as URLs) as the second parameter.
188 | * `complete`
189 | Fired when the crawler completes processing all the items in its queue, and does
190 | not find any more to add. This event returns no arguments.
191 | 
192 | #### A note about HTTP error conditions
193 | By default, simplecrawler does not download the response body when it encounters
194 | an HTTP error status in the response. If you need this information, you can listen
195 | to simplecrawler's error events, and through node's native `data` event
196 | (`response.on("data",function(chunk) {...})`) you can save the information yourself.
197 | 
198 | If this is annoying, and you'd really like to retain error pages by default, let
199 | me know. I didn't include it because I didn't need it - but if it's important to
200 | people I might put it back in. :)
201 | 
202 | #### Waiting for Asynchronous Event Listeners
203 | 
204 | Sometimes, you might want to wait for simplecrawler to wait for you while you
205 | perform sone asynchronous tasks in an event listener, instead of having it
206 | racing off and firing the `complete` event, halting your crawl. For example,
207 | if you're doing your own link discovery using an asynchronous library method.
208 | 
209 | Simplecrawler provides a `wait` method you can call at any time. It is available
210 | via `this` from inside listeners, and on the crawler object itself. It returns
211 | a callback function.
212 | 
213 | Once you've called this method, simplecrawler will not fire the `complete` event
214 | until either you execute the callback it returns, or a timeout is reached
215 | (configured in `crawler.listenerTTL`, by default 10000 msec.)
216 | 
217 | ##### Example Asynchronous Event Listener
218 | 
219 | ```javascript
220 | crawler.on("fetchcomplete",function(queueItem,data,res) {
221 | 	var continue = this.wait();
222 | 	doSomeDiscovery(data,function(foundURLs){
223 | 		foundURLs.forEach(crawler.queueURL.bind(crawler));
224 | 		continue();
225 | 	});
226 | });
227 | ```
228 | 
229 | ### Configuring the crawler
230 | 
231 | Here's a complete list of what you can stuff with at this stage:
232 | 
233 | *	`crawler.host` -
234 | 	The domain to scan. By default, simplecrawler will restrict all requests to
235 | 	this domain.
236 | *	`crawler.initialPath` -
237 | 	The initial path with which the crawler will formulate its first request.
238 | 	Does not restrict subsequent requests.
239 | *	`crawler.initialPort` -
240 | 	The initial port with which the crawler will formulate its first request.
241 | 	Does not restrict subsequent requests.
242 | *	`crawler.initialProtocol` -
243 | 	The initial protocol with which the crawler will formulate its first request.
244 | 	Does not restrict subsequent requests.
245 | *	`crawler.interval` -
246 | 	The interval with which the crawler will spool up new requests (one per
247 | 	tick.) Defaults to 250ms.
248 | *	`crawler.maxConcurrency` -
249 | 	The maximum number of requests the crawler will run simultaneously. Defaults
250 | 	to 5 - the default number of http agents node will run.
251 | *	`crawler.timeout` -
252 | 	The maximum time in milliseconds the crawler will wait for headers before
253 | 	aborting the request.
254 | *	`crawler.listenerTTL` -
255 | 	The maximum time in milliseconds the crawler will wait for async listeners.
256 | *	`crawler.userAgent` -
257 | 	The user agent the crawler will report. Defaults to
258 | 	`Node/SimpleCrawler <version> (http://www.github.com/cgiffard/node-simplecrawler)`.
259 | *	`crawler.queue` -
260 | 	The queue in use by the crawler (Must implement the `FetchQueue` interface)
261 | *	`crawler.filterByDomain` -
262 | 	Specifies whether the crawler will restrict queued requests to a given
263 | 	domain/domains.
264 | *	`crawler.scanSubdomains` -
265 | 	Enables scanning subdomains (other than www) as well as the specified domain.
266 | 	Defaults to false.
267 | *	`crawler.ignoreWWWDomain` -
268 | 	Treats the `www` domain the same as the originally specified domain.
269 | 	Defaults to true.
270 | *	`crawler.stripWWWDomain` -
271 | 	Or go even further and strip WWW subdomain from requests altogether!
272 | *	`crawler.stripQuerystring` -
273 | 	Specify to strip querystring parameters from URLs. Defaults to false.
274 | *	`crawler.discoverResources` -
275 | 	Use simplecrawler's internal resource discovery function. Defaults to true.
276 | 	(switch it off if you'd prefer to discover and queue resources yourself!)
277 | *	`crawler.discoverRegex` -
278 | 	Array of regex objects that simplecrawler uses to discover resources.
279 | *	`crawler.cache` -
280 | 	Specify a cache architecture to use when crawling. Must implement
281 | 	`SimpleCache` interface.
282 | *	`crawler.useProxy` -
283 | 	The crawler should use an HTTP proxy to make its requests.
284 | *	`crawler.proxyHostname` -
285 | 	The hostname of the proxy to use for requests.
286 | *	`crawler.proxyPort` -
287 | 	The port of the proxy to use for requests.
288 | *	`crawler.proxyUser` -
289 | 	The username for HTTP/Basic proxy authentication (leave unset for unauthenticated proxies.)
290 | *	`crawler.proxyPass` -
291 | 	The password for HTTP/Basic proxy authentication (leave unset for unauthenticated proxies.)
292 | *	`crawler.domainWhitelist` -
293 | 	An array of domains the crawler is permitted to crawl from. If other settings
294 | 	are more permissive, they will override this setting.
295 | *	`crawler.supportedMimeTypes` -
296 | 	An array of RegEx objects used to determine supported MIME types (types of
297 | 	data simplecrawler will scan for links.) If you're  not using simplecrawler's
298 | 	resource discovery function, this won't have any effect.
299 | *	`crawler.allowedProtocols` -
300 | 	An array of RegEx objects used to determine whether a URL protocol is supported.
301 | 	This is to deal with nonstandard protocol handlers that regular HTTP is
302 | 	sometimes given, like `feed:`. It does not provide support for non-http
303 | 	protocols (and why would it!?)
304 | *	`crawler.maxResourceSize` - 
305 | 	The maximum resource size, in bytes, which will be downloaded. Defaults to 16MB.
306 | *	`crawler.downloadUnsupported` -
307 | 	Simplecrawler will download files it can't parse. Defaults to true, but if
308 | 	you'd rather save the RAM and GC lag, switch it off.
309 | *	`crawler.needsAuth` -
310 | 	Flag to specify if the domain you are hitting requires basic authentication
311 | *	`crawler.authUser` -
312 | 	Username provided for needsAuth flag
313 | *	`crawler.authPass` -
314 | 	Password provided for needsAuth flag
315 | *	`crawler.customHeaders` -
316 | 	An object specifying a number of custom headers simplecrawler will add to
317 | 	every request. These override the default headers simplecrawler sets, so
318 | 	be careful with them. If you want to tamper with headers on a per-request basis,
319 | 	see the `fetchqueue` event.
320 | *	`crawler.acceptCookies` -
321 | 	Flag to indicate if the crawler should hold on to cookies
322 | *	`crawler.urlEncoding` -
323 | 	Set this to `iso8859` to trigger URIjs' re-encoding of iso8859 URLs to unicode.
324 | 	Defaults to `unicode`.
325 | *	`crawler.parseHTMLComments` -
326 | 	Whether to scan for URLs inside HTML comments.
327 | 	Defaults to `true`.
328 | *	`crawler.parseScriptTags` -
329 | 	Whether to scan for URLs inside script tags.
330 | 	Defaults to `true`.
331 | *	`myCrawler.maxDepth` -
332 | 	Defines a maximum distance from the original request at which resources will
333 | 	be downloaded. Asset files are excluded from this distance condition.
334 | 	Defaults to `0` — no max depth.
335 | 
336 | #### Excluding certain resources from downloading
337 | 
338 | Simplecrawler has a mechanism you can use to prevent certain resources from being
339 | fetched, based on the URL, called *Fetch Conditions**. A fetch condition is just
340 | a function, which, when given a parsed URL object, will return a true or a false
341 | value, indicating whether a given resource should be downloaded.
342 | 
343 | You may add as many fetch conditions as you like, and remove them at runtime.
344 | Simplecrawler will evaluate every single condition against every queued URL, and
345 | should just one of them return a falsy value (this includes null and undefined,
346 | so remember to always return a value!) then the resource in question will not be
347 | fetched.
348 | 
349 | ##### Adding a fetch condition
350 | 
351 | This example fetch condition prevents URLs ending in `.pdf` from downloading.
352 | Adding a fetch condition assigns it an ID, which the `addFetchCondition` function
353 | returns. You can use this ID to remove the condition later.
354 | 
355 | ```javascript
356 | var conditionID = myCrawler.addFetchCondition(function(parsedURL) {
357 | 	return !parsedURL.path.match(/\.pdf$/i);
358 | });
359 | ```
360 | 
361 | NOTE: simplecrawler uses slightly different terminology to URIjs. `parsedURL.path`
362 | includes the query string too. If you want the path without the query string,
363 | use `parsedURL.uriPath`.
364 | 
365 | ##### Removing a fetch condition
366 | 
367 | If you stored the ID of the fetch condition you added earlier, you can remove it
368 | from the crawler:
369 | 
370 | ```javascript
371 | myCrawler.removeFetchCondition(conditionID);
372 | ```
373 | 
374 | ### The Simplecrawler Queue
375 | 
376 | Simplecrawler has a queue like any other web crawler. It can be directly accessed
377 | at `crawler.queue` (assuming you called your Crawler() object `crawler`.) It
378 | provides array access, so you can get to queue items just with array notation
379 | and an index.
380 | 
381 | ```javascript
382 | crawler.queue[5];
383 | ```
384 | 
385 | For compatibility with different backing stores, it now provides an alternate
386 | interface which the crawler core makes use of:
387 | 
388 | ```javascript
389 | crawler.queue.get(5);
390 | ```
391 | 
392 | It's not just an array though.
393 | 
394 | #### Adding to the queue
395 | 
396 | The simplest way to add to the queue is to use the crawler's own method,
397 | `crawler.queueURL`. This method takes a complete URL, validates and deconstructs
398 | it, and adds it to the queue.
399 | 
400 | If you instead want to add a resource by its components, you may call the
401 | `queue.add` method directly:
402 | 
403 | ```javascript
404 | crawler.queue.add(protocol,hostname,port,path);
405 | ```
406 | 
407 | That's it! It's basically just a URL, but comma separated (that's how you can
408 | remember the order.)
409 | 
410 | #### Queue items
411 | 
412 | Because when working with simplecrawler, you'll constantly be handed queue items,
413 | it helps to know what's inside them. These are the properties every queue item
414 | is expected to have:
415 | 
416 | * `url` - The complete, canonical URL of the resource.
417 | * `protocol` - The protocol of the resource (http, https)
418 | * `host` - The full domain/hostname of the resource
419 | * `port` - The port of the resource
420 | * `path` - The bit of the URL after the domain - includes the querystring.
421 | * `fetched` - Has the request for this item been completed? You can monitor this as requests are processed.
422 | * `status` - The internal status of the item, always a string. This can be one of:
423 | 	* `queued` - The resource is in the queue to be fetched, but nothing's happened to it yet.
424 | 	* `spooled` - A request has been made to the remote server, but we're still waiting for a response.
425 | 	* `headers` - The headers for the resource have been received.
426 | 	* `downloaded` - The item has been entirely downloaded.
427 | 	* `redirected` - The resource request returned a 300 series response, with a Location header and a new URL.
428 | 	* `notfound` - The resource could not be found. (404)
429 | 	* `failed` - An error occurred when attempting to fetch the resource.
430 | * `stateData` - An object containing state data and other information about the request:
431 | 	* `requestLatency` - The time taken for headers to be received after the request was made.
432 | 	* `requestTime` - The total time taken for the request (including download time.)
433 | 	* `downloadTime` - The total time taken for the resource to be downloaded.
434 | 	* `contentLength` - The length (in bytes) of the returned content. Calculated based on the `content-length` header.
435 | 	* `contentType` - The MIME type of the content.
436 | 	* `code` - The HTTP status code returned for the request.
437 | 	* `headers` - An object containing the header information returned by the server. This is the object node returns as part of the `response` object.
438 | 	* `actualDataSize` - The length (in bytes) of the returned content. Calculated based on what is actually received, not the `content-length` header.
439 | 	* `sentIncorrectSize` - True if the data length returned by the server did not match what we were told to expect by the `content-length` header.
440 | 
441 | You can address these properties like you would any other object:
442 | 
443 | ```javascript
444 | crawler.queue[52].url;
445 | queueItem.stateData.contentLength;
446 | queueItem.status === "queued";
447 | ```
448 | 
449 | As you can see, you can get a lot of meta-information out about each request. The
450 | upside is, the queue actually has some convenient functions for getting simple
451 | aggregate data about the queue...
452 | 
453 | #### Queue Statistics and Reporting
454 | 
455 | First of all, the queue can provide some basic statistics about the network
456 | performance of your crawl (so far.) This is done live, so don't check it thirty
457 | times a second. You can test the following properties:
458 | 
459 | * `requestTime`
460 | * `requestLatency`
461 | * `downloadTime`
462 | * `contentLength`
463 | * `actualDataSize`
464 | 
465 | And you can get the maximum, minimum, and average values for each with the
466 | `crawler.queue.max`, `crawler.queue.min`, and `crawler.queue.avg` functions
467 | respectively. Like so:
468 | 
469 | ```javascript
470 | console.log("The maximum request latency was %dms.",crawler.queue.max("requestLatency"));
471 | console.log("The minimum download time was %dms.",crawler.queue.min("downloadTime"));
472 | console.log("The average resource size received is %d bytes.",crawler.queue.avg("actualDataSize"));
473 | ```
474 | 
475 | You'll probably often need to determine how many items in the queue have a given
476 | status at any one time, and/or retreive them. That's easy with
477 | `crawler.queue.countWithStatus` and `crawler.queue.getWithStatus`.
478 | 
479 | `crawler.queue.countWithStatus` returns the number of queued items with a given
480 | status, while `crawler.queue.getWithStatus` returns an array of the queue items
481 | themselves.
482 | 
483 | ```javascript
484 | var redirectCount = crawler.queue.countWithStatus("redirected");
485 | 
486 | crawler.queue.getWithStatus("failed").forEach(function(queueItem) {
487 | 	console.log("Whoah, the request for %s failed!",queueItem.url);
488 | 	
489 | 	// do something...
490 | });
491 | ```
492 | 
493 | Then there's some even simpler convenience functions:
494 | 
495 | *	`crawler.queue.complete` - returns the number of queue items which have been
496 | 	completed (marked as fetched)
497 | *	`crawler.queue.errors` - returns the number of requests which have failed
498 | 	(404s and other 400/500 errors, as well as client errors)
499 | 
500 | #### Saving and reloading the queue (freeze/defrost)
501 | 
502 | You'll probably want to be able to save your progress and reload it later, if
503 | your application fails or you need to abort the crawl for some reason. (Perhaps
504 | you just want to finish off for the night and pick it up tomorrow!) The
505 | `crawler.queue.freeze` and `crawler.queue.defrost` functions perform this task.
506 | 
507 | **A word of warning though** - they are not CPU friendly as they rely on
508 | JSON.parse and JSON.stringify. Use them only when you need to save the queue -
509 | don't call them every request or your application's performance will be incredibly
510 | poor - they block like *crazy*. That said, using them when your crawler commences
511 | and stops is perfectly reasonable.
512 | 
513 | Note that the methods themselves are asynchronous, so if you are going to exit the
514 | process after you do the freezing, make sure you wait for callback - otherwise
515 | you'll get an empty file.
516 | 
517 | ```javascript
518 | // Freeze queue
519 | crawler.queue.freeze("mysavedqueue.json", function() {
520 | 	process.exit();
521 | });
522 | 
523 | // Defrost queue
524 | crawler.queue.defrost("mysavedqueue.json");
525 | ```
526 | 
527 | ## Cookies
528 | 
529 | Simplecrawler now has an internal cookie jar, which collects and resends cookies
530 | automatically, and by default.
531 | 
532 | If you want to turn this off, set the `crawler.acceptCookies` option to `false`.
533 | 
534 | The cookie jar is accessible via `crawler.cookies`, and is an event emitter itself:
535 | 
536 | ### Cookie Events
537 | 
538 | * `addcookie` ( cookie )
539 | Fired when a new cookie is added to the jar.
540 | * `removecookie` ( cookie array )
541 | Fired when one or more cookies are removed from the jar.
542 | 
543 | ## Building and Testing
544 | 
545 | #### Build Status:
546 | 
547 | * Master: [![Build Status](https://travis-ci.org/cgiffard/node-simplecrawler.png?branch=master)](https://travis-ci.org/cgiffard/node-simplecrawler)
548 | * Development: [![Build Status](https://travis-ci.org/cgiffard/node-simplecrawler.png?branch=development)](https://travis-ci.org/cgiffard/node-simplecrawler)
549 | 
550 | ## Contributors
551 | 
552 | I'd like to extend sincere thanks to:
553 | 
554 | *	[Nick Crohn](https://github.com/ncrohn) for the HTTP Basic auth support, and
555 | 	initial cookie support.
556 | *	[Mike Moulton](https://github.com/mmoulton) for
557 | 	[fixing a bug in the URL discovery mechanism]
558 | 	(https://github.com/cgiffard/node-simplecrawler/pull/3), as well as
559 | 	[adding the `discoverycomplete` event]
560 | 	(https://github.com/cgiffard/node-simplecrawler/pull/10),
561 | *	[Mike Iannacone](https://github.com/mikeiannacone) for correcting a keyword
562 | 	naming collision with node 0.8's EventEmitter.
563 | *	[Greg Molnar](https://github.com/gregmolnar) for
564 | 	[adding a querystring-free path parameter to parsed URL objects.]
565 | 	(https://github.com/cgiffard/node-simplecrawler/pull/31)
566 | *	[Breck Yunits](https://github.com/breck7) for contributing a useful code
567 | 	sample demonstrating using simplecrawler for caching a website to disk!
568 | *	[Luke Plaster](https://github.com/notatestuser) for enabling protocol-agnostic
569 | 	link discovery
570 | *	[Zeus](https://github.com/distracteddev) for fixing a bug where [default port
571 | 	info was wrongly specified in requests]
572 | 	(https://github.com/cgiffard/node-simplecrawler/pull/40)
573 | 	and for fixing the missing request timeout handling!
574 | *	[Graham Hutchinson](https://github.com/ghhutch) for adding
575 | 	querystring-stripping option 
576 | *	[Jellyfrog](https://github.com/jellyfrog) for assisting in diagnosing some
577 | 	nasty EventEmitter issues.
578 | *	[Brian Moeskau](https://github.com/bmoeskau) for helping to fix the confusing
579 | 	'async' events API, and providing invaluable feedback.
580 | 
581 | And everybody else who has helped out in some way! :)
582 | 
583 | ## Licence
584 | 
585 | Copyright (c) 2013, Christopher Giffard.
586 | 
587 | All rights reserved.
588 | 
589 | Redistribution and use in source and binary forms, with or without modification, 
590 | are permitted provided that the following conditions are met:
591 | 
592 | * Redistributions of source code must retain the above copyright notice, this
593 |   list of conditions and the following disclaimer.
594 | * Redistributions in binary form must reproduce the above copyright notice, this
595 |   list of conditions and the following disclaimer in the documentation and/or
596 |   other materials provided with the distribution.
597 | 
598 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
599 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
600 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
601 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
602 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
603 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
604 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
605 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
606 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
607 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
608 | 


--------------------------------------------------------------------------------
/lib/crawler.js:
--------------------------------------------------------------------------------
   1 | // Simplecrawler
   2 | // Christopher Giffard, 2011 - 2013+
   3 | //
   4 | // http://www.github.com/cgiffard/node-simplecrawler
   5 | 
   6 | // Queue Dependency
   7 | var FetchQueue		= require("./queue.js"),
   8 | 	Cache			= require("./cache.js"),
   9 | 	CookieJar		= require("./cookies.js"),
  10 | 	MetaInfo		= require("../package.json");
  11 | 
  12 | var http			= require("http"),
  13 | 	https			= require("https"),
  14 | 	EventEmitter	= require('events').EventEmitter,
  15 | 	URI				= require("URIjs"),
  16 | 	zlib			= require("zlib"),
  17 | 	util			= require("util");
  18 | 	
  19 | var QUEUE_ITEM_INITIAL_DEPTH = 1;
  20 | 
  21 | /*
  22 | 	Public: Constructor for the crawler.
  23 | 
  24 | 	host				-	Initial hostname/domain to begin crawling from. By
  25 | 							default, the crawl will be locked to this hostname.
  26 | 	initialPath			-	Initial path to begin crawling from.
  27 | 	initialPort			-	Port to begin crawling from.
  28 | 	interval			-	Request interval for the crawler. Defaults to 250ms.
  29 | 
  30 | 	Examples
  31 | 
  32 | 		var crawler = new Crawler("example.com","/",80,500);
  33 | 
  34 | 		var crawler = new Crawler("example.com");
  35 | 
  36 | 	Returns the crawler object which has now been constructed.
  37 | 
  38 | */
  39 | var Crawler = function(host,initialPath,initialPort,interval) {
  40 | 	var crawler = this;
  41 | 
  42 | 	// Data integrity checks
  43 | 	if (initialPort && isNaN(initialPort))
  44 | 		throw new Error("Port must be a number!");
  45 | 
  46 | 	// SETTINGS TO STUFF WITH
  47 | 	// (not here! Do it when you create a `new Crawler()`)
  48 | 
  49 | 	// Domain to crawl
  50 | 	crawler.host			= host || "";
  51 | 
  52 | 	// Gotta start crawling *somewhere*
  53 | 	crawler.initialPath		= initialPath || "/";
  54 | 	crawler.initialPort		= initialPort || 80;
  55 | 	crawler.initialProtocol	= "http";
  56 | 
  57 | 	// Internal 'tick' interval for spawning new requests
  58 | 	// (as long as concurrency is under cap)
  59 | 	// One request will be spooled per tick, up to the concurrency threshold.
  60 | 	crawler.interval		= interval || 250;
  61 | 
  62 | 	// Maximum request concurrency. Be sensible. Five ties in with node's
  63 | 	// default maxSockets value.
  64 | 	crawler.maxConcurrency	= 5;
  65 | 
  66 | 	// Maximum time we'll wait for headers
  67 | 	crawler.timeout			= 5 * 60 * 1000;
  68 | 
  69 | 	// Maximum time we'll wait for async listeners.
  70 | 	crawler.listenerTTL		= 10 * 1000;
  71 | 
  72 | 	// User Agent
  73 | 	crawler.userAgent =
  74 | 			"Node/" + MetaInfo.name + " " + MetaInfo.version +
  75 | 			" (" + MetaInfo.repository.url + ")";
  76 | 
  77 | 	// Queue for requests - FetchQueue gives us stats and other sugar
  78 | 	// (but it's basically just an array)
  79 | 	crawler.queue			= new FetchQueue();
  80 | 
  81 | 	// Do we filter by domain?
  82 | 	// Unless you want to be crawling the entire internet, I would
  83 | 	// recommend leaving this on!
  84 | 	crawler.filterByDomain	= true;
  85 | 
  86 | 	// Do we scan subdomains?
  87 | 	crawler.scanSubdomains	= false;
  88 | 
  89 | 	// Treat WWW subdomain the same as the main domain (and don't count
  90 | 	// it as a separate subdomain)
  91 | 	crawler.ignoreWWWDomain	= true;
  92 | 
  93 | 	// Or go even further and strip WWW subdomain from domains altogether!
  94 | 	crawler.stripWWWDomain	= false;
  95 | 
  96 | 	// Internal cachestore
  97 | 	crawler.cache			= null;
  98 | 
  99 | 	// Use an HTTP Proxy?
 100 | 	crawler.useProxy		= false;
 101 | 	crawler.proxyHostname	= "127.0.0.1";
 102 | 	crawler.proxyPort		= 8123;
 103 | 	crawler.proxyUser		= null;
 104 | 	crawler.proxyPass		= null;
 105 | 
 106 | 	// Support for HTTP basic auth
 107 | 	crawler.needsAuth		= false;
 108 | 	crawler.authUser		= "";
 109 | 	crawler.authPass		= "";
 110 | 
 111 | 	// Support for retaining cookies for parse duration
 112 | 	crawler.acceptCookies	= true;
 113 | 	crawler.cookies			= new CookieJar();
 114 | 
 115 | 	// Support for custom headers...
 116 | 	crawler.customHeaders	= {};
 117 | 
 118 | 	// Domain Whitelist
 119 | 	// We allow domains to be whitelisted, so cross-domain requests can be made.
 120 | 	crawler.domainWhitelist	= [];
 121 | 
 122 | 	// Supported Protocols
 123 | 	crawler.allowedProtocols = [
 124 | 		/^http(s)?$/i,					// HTTP & HTTPS
 125 | 		/^(rss|atom|feed)(\+xml)?$/i	// RSS / XML
 126 | 	];
 127 | 
 128 | 	// Max file size to download/store
 129 | 	crawler.maxResourceSize	= 1024 * 1024 * 16; // 16mb
 130 | 
 131 | 	// Supported MIME-types
 132 | 	// Matching MIME-types will be scanned for links
 133 | 	crawler.supportedMimeTypes = [
 134 | 		/^text\//i,
 135 | 		/^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,
 136 | 		/^application\/javascript/i,
 137 | 		/^xml/i
 138 | 	];
 139 | 
 140 | 	// Download linked, but unsupported files (binary - images, documents, etc)
 141 | 	crawler.downloadUnsupported = true;
 142 | 
 143 | 	// URL Encoding setting...
 144 | 	crawler.urlEncoding = "unicode";
 145 | 
 146 | 	// Strip Querystring Parameters from URL
 147 | 	crawler.stripQuerystring = false;
 148 | 
 149 | 	// Regular expressions for finding URL items in HTML and text
 150 | 	crawler.discoverRegex = [
 151 | 		/(\shref\s?=\s?|\ssrc\s?=\s?|url\()([^\"\'\s>\)]+)/ig,
 152 | 		/(\shref\s?=\s?|\ssrc\s?=\s?|url\()['"]([^"']+)/ig,
 153 | 		/http(s)?\:\/\/[^?\s><\'\"]+/ig,
 154 | 		/url\([^\)]+/ig,
 155 | 
 156 | 		// This might be a bit of a gamble... but get hard-coded
 157 | 		// strings out of javacript: URLs. They're often popup-image
 158 | 		// or preview windows, which would otherwise be unavailable to us.
 159 | 		// Worst case scenario is we make some junky requests.
 160 | 		/^javascript\:[a-z0-9\$\_\.]+\(['"][^'"\s]+/ig
 161 | 	];
 162 | 
 163 | 	// Whether to parse inside HTML comments
 164 | 	crawler.parseHTMLComments = true;
 165 | 
 166 | 	// Whether to parse inside script tags
 167 | 	crawler.parseScriptTags = true;
 168 | 
 169 | 	// Max depth parameter
 170 | 	crawler.maxDepth = 0;
 171 | 
 172 | 	// STATE (AND OTHER) VARIABLES NOT TO STUFF WITH
 173 | 	var hiddenProps = {
 174 | 		"_openRequests":	0,
 175 | 		"_fetchConditions":	[],
 176 | 		"_openListeners":	0
 177 | 	};
 178 | 
 179 | 	// Run the EventEmitter constructor
 180 | 	EventEmitter.call(crawler);
 181 | 
 182 | 	// Apply all the hidden props
 183 | 	Object.keys(hiddenProps).forEach(function(key) {
 184 | 		Object.defineProperty(crawler, key, {
 185 | 			"writable": true,
 186 | 			"enumerable": false,
 187 | 			"value": hiddenProps[key]
 188 | 		});
 189 | 	});
 190 | };
 191 | 
 192 | util.inherits(Crawler,EventEmitter);
 193 | 
 194 | /*
 195 | 	Public: Starts or resumes the crawl. If the queue is empty, it adds a new
 196 | 	queue item from which to begin crawling based on the initial configuration
 197 | 	of the crawler itself. The crawler waits for process.nextTick to begin, so
 198 | 	handlers and other properties can be altered or addressed before the crawl
 199 | 	commences.
 200 | 
 201 | 	Examples
 202 | 
 203 | 		crawler.start();
 204 | 
 205 | 	Returns the crawler object, to enable chaining.
 206 | 
 207 | */
 208 | Crawler.prototype.start = function() {
 209 | 	var crawler = this;
 210 | 
 211 | 	// only if we haven't already got stuff in our queue...
 212 | 	crawler.queue.getLength(function(err, length) {
 213 | 		if (err) throw err;
 214 | 
 215 | 		if (!length) {
 216 | 
 217 | 			// Initialise our queue by pushing the initial request data into it...
 218 | 			crawler.queue.add(
 219 | 				crawler.initialProtocol,
 220 | 				crawler.host,
 221 | 				crawler.initialPort,
 222 | 				crawler.initialPath,
 223 | 				QUEUE_ITEM_INITIAL_DEPTH,
 224 | 				function(error) {
 225 | 					if (error) throw error;
 226 | 				});
 227 | 		}
 228 | 
 229 | 		crawler.crawlIntervalID =
 230 | 			setInterval(
 231 | 				function() {
 232 | 					crawler.crawl.call(crawler);
 233 | 				},
 234 | 				crawler.interval);
 235 | 
 236 | 		crawler.emit("crawlstart");
 237 | 		crawler.running = true;
 238 | 
 239 | 		// Now kick off the initial crawl
 240 | 		process.nextTick(function() {
 241 | 			crawler.crawl();
 242 | 		});
 243 | 	});
 244 | 
 245 | 	return crawler;
 246 | };
 247 | 
 248 | /*
 249 | 	Public: Determines whether the protocol is supported, given a URL.
 250 | 
 251 | 	URL	- URL with a protocol, for testing.
 252 | 
 253 | 	Examples
 254 | 
 255 | 		crawler.protocolSupported("http://google.com/") // true, by default
 256 | 		crawler.protocolSupported("wss://google.com/") // false, by default
 257 | 
 258 | 	Returns a boolean, true if the protocol is supported - false if not.
 259 | 
 260 | */
 261 | Crawler.prototype.protocolSupported = function(URL) {
 262 | 	var protocol, crawler = this;
 263 | 
 264 | 	try {
 265 | 		protocol = URI(URL).protocol();
 266 | 
 267 | 		// Unspecified protocol. Assume http
 268 | 		if (!protocol)
 269 | 			protocol = "http";
 270 | 
 271 | 	} catch(e) {
 272 | 		// If URIjs died, we definitely /do not/ support the protocol.
 273 | 		return false;
 274 | 	}
 275 | 
 276 | 	return crawler.allowedProtocols.reduce(function(prev,protocolCheck) {
 277 | 		return prev || !!protocolCheck.exec(protocol);
 278 | 	},false);
 279 | };
 280 | 
 281 | /*
 282 | 	Public: Determines whether the mimetype is supported, given a mimetype
 283 | 
 284 | 	MIMEType	- String containing MIME type to test
 285 | 
 286 | 	Examples
 287 | 
 288 | 		crawler.mimeTypeSupported("text/html") // true, by default
 289 | 		crawler.mimeTypeSupported("application/octet-stream") // false, by default
 290 | 
 291 | 	Returns a boolean, true if the MIME type is supported - false if not.
 292 | 
 293 | */
 294 | Crawler.prototype.mimeTypeSupported = function(MIMEType) {
 295 | 	var crawler = this;
 296 | 
 297 | 	return (
 298 | 		crawler.supportedMimeTypes.reduce(function(prev,mimeCheck) {
 299 | 			return prev || !!mimeCheck.exec(MIMEType);
 300 | 		},false)
 301 | 	);
 302 | };
 303 | 
 304 | /*
 305 | 	Public: Determines whether the queueItem can be fetched from its depth
 306 | 
 307 | 	In fact, the queueItem need to be fetched before calling this (because we need its MIMEType).
 308 | 	This will just determine if we need to send an event for this item & if we need to fetch linked
 309 | 	resources.
 310 | 
 311 | 	If the queue item is a CSS or JS file, it will always be fetched (we need all images in CSS files,
 312 | 	even if max depth is already reached). If it's an HTML page, we will check if max depth is reached
 313 | 	or not.
 314 | 
 315 | 	queueItem	- Queue item object to check
 316 | 
 317 | 	Returns a boolean, true if the queue item can be fetched - false if not.
 318 | 
 319 | */
 320 | Crawler.prototype.depthAllowed = function(queueItem) {
 321 | 	var crawler = this;
 322 | 
 323 | 	// Items matching this pattern will always be fetched, even if max depth is reached
 324 | 	var mimeTypesWhitelist = [
 325 | 		/^text\/(css|javascript|ecmascript)/i,
 326 | 		/^application\/javascript/i,
 327 | 		/^application\/x-font/i,
 328 | 		/^application\/font/i,
 329 | 		/^image\//i,
 330 | 		/^font\//i
 331 | 	];
 332 | 
 333 | 	return (
 334 | 		crawler.maxDepth === 0 ||
 335 | 		queueItem.depth <= crawler.maxDepth || 
 336 | 		mimeTypesWhitelist.reduce(function(prev,mimeCheck) {
 337 | 			return prev || !!mimeCheck.exec(queueItem.stateData.contentType);
 338 | 		}, false)
 339 | 	);
 340 | };
 341 | 
 342 | /*
 343 | 	Public: Extracts protocol, host, port and resource (path) given a URL string.
 344 | 
 345 | 	URL	- String containing URL to process
 346 | 
 347 | 	Examples
 348 | 
 349 | 		var URLInfo = crawler.processURL("http://www.google.com/fish");
 350 | 
 351 | 	Returns an object containing keys and values for "protocol", "host", "port",
 352 | 	and "path".
 353 | 
 354 | */
 355 | Crawler.prototype.processURL = function(URL,context) {
 356 | 	var newURL, crawler = this;
 357 | 
 358 | 	if (!context || typeof(context) !== "object")
 359 | 		context = {
 360 | 			url: (
 361 | 				crawler.initialProtocol + "://" +
 362 | 				crawler.host + ":" +
 363 | 				crawler.initialPort + "/"
 364 | 			),
 365 | 			depth: QUEUE_ITEM_INITIAL_DEPTH
 366 | 		};
 367 | 
 368 | 	// If the URL didn't contain anything, don't fetch it.
 369 | 	if (!URL.replace(/\s+/ig,"").length) return false;
 370 | 
 371 | 	// Check if querystring should be ignored
 372 | 	if (crawler.stripQuerystring === true)
 373 | 		URL = crawler.removeQuerystring(URL);
 374 | 
 375 | 	try {
 376 | 		newURL =
 377 | 			URI(URL)
 378 | 				.absoluteTo(context.url)
 379 | 				.normalize();
 380 | 
 381 | 		if (crawler.urlEncoding === "iso8859") {
 382 | 			newURL = newURL.iso8859();
 383 | 		}
 384 | 
 385 | 	} catch(e) {
 386 | 		// Couldn't process the URL, since URIjs choked on it.
 387 | 		return false;
 388 | 	}
 389 | 
 390 | 	// simplecrawler uses slightly different terminology to URIjs. Sorry!
 391 | 	return {
 392 | 		"protocol": newURL.protocol() || "http",
 393 | 		"host":		newURL.hostname(),
 394 | 		"port":		newURL.port() || 80,
 395 | 		"path":		newURL.resource(),
 396 | 		"uriPath":	newURL.path(),
 397 | 		"depth":	context.depth + 1
 398 | 	};
 399 | };
 400 | 
 401 | /*
 402 | 	Public: Discovers linked resources in an HTML, XML or text document.
 403 | 
 404 | 	resourceData	- String containing document with linked resources.
 405 | 	queueItem		- Queue item corresponding to document being searched.
 406 | 
 407 | 	Examples
 408 | 
 409 | 		crawler.discoverResources("http://www.google.com")
 410 | 		crawler.discoverResources("<a href='...'>test</a>")
 411 | 
 412 | 	Returns an array of the (string) resource URLs found in the document. If none
 413 | 	were found, the array will be empty.
 414 | 
 415 | */
 416 | Crawler.prototype.discoverResources = function(resourceData,queueItem) {
 417 | 	// Convert to UTF-8
 418 | 	// TODO: account for text-encoding.
 419 | 	var resources = [],
 420 | 		resourceText = resourceData.toString("utf8"),
 421 | 		crawler = this;
 422 | 
 423 | 	if (!queueItem)
 424 | 		queueItem = {};
 425 | 
 426 | 	if (!queueItem.protocol)
 427 | 		queueItem.protocol = "http";
 428 | 
 429 | 	if (!crawler.parseHTMLComments) {
 430 | 		resourceText = resourceText.replace(/<!--([\s\S]+?)-->/g, "");
 431 | 	}
 432 | 
 433 | 	if (!crawler.parseScriptTags) {
 434 | 		resourceText = resourceText.replace(/<script(.*?)>([\s\S]+?)<\/script>/gi, "");
 435 | 	}
 436 | 
 437 | 	function cleanURL(URL) {
 438 | 		return URL
 439 | 				.replace(/^(\s?href|\s?src)=['"]?/i,"")
 440 | 				.replace(/^\s*/,"")
 441 | 				.replace(/^url\(['"]*/i,"")
 442 | 				.replace(/^javascript\:[a-z0-9]+\(['"]/i,"")
 443 | 				.replace(/["'\)]$/i,"")
 444 | 				.replace(/^\/\//, queueItem.protocol + "://")
 445 | 				.replace(/\&amp;/gi,"&")
 446 | 				.split("#")
 447 | 				.shift();
 448 | 	}
 449 | 
 450 | 	// Clean links
 451 | 	function cleanAndQueue(urlMatch) {
 452 | 		if (!urlMatch) return [];
 453 | 
 454 | 		return urlMatch
 455 | 			.map(cleanURL)
 456 | 			.reduce(function(list,URL) {
 457 | 
 458 | 				// Ensure URL is whole and complete
 459 | 				try {
 460 | 					URL = URI(URL)
 461 | 							.absoluteTo(queueItem.url)
 462 | 							.normalize()
 463 | 							.toString();
 464 | 				} catch(e) {
 465 | 
 466 | 					// But if URI.js couldn't parse it - nobody can!
 467 | 					return list;
 468 | 				}
 469 | 
 470 | 				// If we hit an empty item, don't add return it
 471 | 				if (!URL.length) return list;
 472 | 
 473 | 				// If we don't support the protocol in question
 474 | 				if (!crawler.protocolSupported(URL)) return list;
 475 | 
 476 | 				// Does the item already exist in the list?
 477 | 				if (resources.reduce(function(prev,current) {
 478 | 						return prev || current === URL;
 479 | 					},false))
 480 | 						return list;
 481 | 
 482 | 				return list.concat(URL);
 483 | 			},[]);
 484 | 	}
 485 | 
 486 | 	// Rough scan for URLs
 487 | 	return crawler.discoverRegex
 488 | 		.reduce(function(list,regex) {
 489 | 			return list.concat(
 490 | 				cleanAndQueue(
 491 | 					resourceText.match(regex)));
 492 | 		},[])
 493 | 		.reduce(function(list,check) {
 494 | 			if (list.indexOf(check) < 0)
 495 | 				return list.concat([check]);
 496 | 
 497 | 			return list;
 498 | 		},[]);
 499 | };
 500 | 
 501 | /*
 502 | 	Public: Determines based on crawler state whether a domain is valid for
 503 | 	crawling.
 504 | 
 505 | 	host - String containing the hostname of the resource to be fetched.
 506 | 
 507 | 	Examples
 508 | 
 509 | 		crawler.domainValid("127.0.0.1");
 510 | 		crawler.domainValid("google.com");
 511 | 		crawler.domainValid("test.example.com");
 512 | 
 513 | 	Returns an true if the domain is valid for crawling, false if not.
 514 | 
 515 | */
 516 | Crawler.prototype.domainValid = function(host) {
 517 | 	var crawler = this,
 518 | 		crawlerHost = crawler.host;
 519 | 
 520 | 	// If we're ignoring the WWW domain, remove the WWW for comparisons...
 521 | 	if (crawler.ignoreWWWDomain)
 522 | 		host = host.replace(/^www\./i,"");
 523 | 
 524 | 	function domainInWhitelist(host) {
 525 | 
 526 | 		// If there's no whitelist, or the whitelist is of zero length,
 527 | 		// just return false.
 528 | 		if (!crawler.domainWhitelist ||
 529 | 			!crawler.domainWhitelist.length) return false;
 530 | 
 531 | 		// Otherwise, scan through it.
 532 | 		return !!crawler.domainWhitelist.reduce(function(prev,cur,index,array) {
 533 | 
 534 | 			// If we already located the relevant domain in the whitelist...
 535 | 			if (prev) return prev;
 536 | 
 537 | 			// If the domain is just equal, return true.
 538 | 			if (host === cur) return true;
 539 | 
 540 | 			// If we're ignoring WWW subdomains, and both domains,
 541 | 			// less www. are the same, return true.
 542 | 			if (crawler.ignoreWWWDomain && host === cur.replace(/^www\./i,""))
 543 | 				return true;
 544 | 
 545 | 			// Otherwise, sorry. No dice.
 546 | 			return false;
 547 | 		},false);
 548 | 	}
 549 | 
 550 | 	// Checks if the first domain is a subdomain of the second
 551 | 	function isSubdomainOf(subdomain,host) {
 552 | 
 553 | 		// Comparisons must be case-insensitive
 554 | 		subdomain	= subdomain.toLowerCase();
 555 | 		host		= host.toLowerCase();
 556 | 
 557 | 		// If we're ignoring www, remove it from both
 558 | 		// (if www is the first domain component...)
 559 | 		if (crawler.ignoreWWWDomain) {
 560 | 			subdomain.replace(/^www./ig,"");
 561 | 			host.replace(/^www./ig,"");
 562 | 		}
 563 | 
 564 | 		// They should be the same flipped around!
 565 | 		return (
 566 | 			subdomain.split("").reverse().join("").substr(0,host.length) ===
 567 | 				host.split("").reverse().join(""));
 568 | 	}
 569 | 
 570 | 			// If we're not filtering by domain, just return true.
 571 | 	return	(!crawler.filterByDomain	||
 572 | 			// Or if the domain is just the right one, return true.
 573 | 			(host === crawler.host)	||
 574 | 			// Or if we're ignoring WWW subdomains, and both domains,
 575 | 			// less www. are the same, return true.
 576 | 			(
 577 | 				crawler.ignoreWWWDomain &&
 578 | 				crawler.host.replace(/^www\./i,"") ===
 579 | 					host.replace(/^www\./i,"")
 580 | 			) ||
 581 | 			// Or if the domain in question exists in the domain whitelist,
 582 | 			// return true.
 583 | 			domainInWhitelist(host) ||
 584 | 			// Or if we're scanning subdomains, and this domain is a subdomain
 585 | 			// of the crawler's set domain, return true.
 586 | 			(crawler.scanSubdomains && isSubdomainOf(host,crawler.host)));
 587 | };
 588 | 
 589 | /*
 590 | 	Public: Given a text or HTML document, initiates discovery of linked
 591 | 	resources in the text, and queues the resources if applicable. Emits
 592 | 	"discoverycomplete". Not to be confused with `crawler.discoverResources`,
 593 | 	despite the `discoverResources` function being the main component of this
 594 | 	one, since this function queues the resources in addition to
 595 | 	discovering them.
 596 | 
 597 | 	resourceData	- Text document containing linked resource URLs.
 598 | 	queueItem		- Queue item from which the resource document was derived.
 599 | 	decompressed	- Content is already decompressed (default: false)
 600 | 
 601 | 	Examples
 602 | 
 603 | 		crawler.queueLinkedItems("<a href='...'>test</a>",queueItem);
 604 | 
 605 | 	Returns the crawler object for chaining.
 606 | 
 607 | */
 608 | Crawler.prototype.queueLinkedItems = function(resourceData,queueItem,decompressed) {
 609 | 	var crawler = this,
 610 | 		resources = [];
 611 | 
 612 | 	if (!decompressed &&
 613 | 		queueItem.stateData &&
 614 | 		queueItem.stateData.headers['content-encoding'] && (
 615 | 		queueItem.stateData.headers['content-encoding'].match(/gzip/) ||
 616 | 		queueItem.stateData.headers['content-encoding'].match(/deflate/))) {
 617 | 
 618 | 		return zlib.unzip(resourceData,function(err,newData) {
 619 | 			if (err) {
 620 | 				return crawler.emit("fetcherror",queueItem);
 621 | 			}
 622 | 
 623 | 			crawler.queueLinkedItems(newData,queueItem,true);
 624 | 		});
 625 | 	}
 626 | 
 627 | 	resources = crawler.discoverResources(resourceData,queueItem);
 628 | 
 629 | 	// Emit discovered resources. ie: might be useful in building a graph of
 630 | 	// page relationships.
 631 | 	crawler.emit("discoverycomplete",queueItem,resources);
 632 | 
 633 | 	resources.forEach(function(url){ crawler.queueURL(url,queueItem); });
 634 | 
 635 | 	return crawler;
 636 | };
 637 | 
 638 | /*
 639 | 	Public: Given a single URL, this function cleans, validates, parses it and
 640 | 	adds it to the queue. This is the best and simplest way to add an item to
 641 | 	the queue.
 642 | 
 643 | 	url			- URL to be queued.
 644 | 	queueItem	- Queue item from which the resource was linked.
 645 | 
 646 | 	Emits
 647 | 
 648 | 		queueduplicate
 649 | 		queueerror
 650 | 		queueadd
 651 | 
 652 | 	Examples
 653 | 
 654 | 		crawler.queueURL("http://www.google.com/",queueItem);
 655 | 
 656 | 	Returns a boolean value indicating whether the URL was successfully queued
 657 | 	or not.
 658 | 
 659 | */
 660 | Crawler.prototype.queueURL = function(url,queueItem) {
 661 | 	var crawler = this;
 662 | 	var parsedURL =
 663 | 		typeof(url) === "object" ? url : crawler.processURL(url,queueItem);
 664 | 
 665 | 	// URL Parser decided this URL was junky. Next please!
 666 | 	if (!parsedURL) {
 667 | 		return false;
 668 | 	}
 669 | 
 670 | 	// Pass this URL past fetch conditions to ensure the user thinks it's valid
 671 | 	var fetchDenied = false;
 672 | 	fetchDenied = crawler._fetchConditions.reduce(function(prev,callback) {
 673 | 		return prev || !callback(parsedURL);
 674 | 	},false);
 675 | 
 676 | 	if (fetchDenied) {
 677 | 		// Fetch Conditions conspired to block URL
 678 | 		return false;
 679 | 	}
 680 | 
 681 | 	// Check the domain is valid before adding it to the queue
 682 | 	if (crawler.domainValid(parsedURL.host)) {
 683 | 		crawler.queue.add(
 684 | 			parsedURL.protocol,
 685 | 			parsedURL.host,
 686 | 			parsedURL.port,
 687 | 			parsedURL.path,
 688 | 			parsedURL.depth,
 689 | 			function queueAddCallback(error,newQueueItem) {
 690 | 				if (error) {
 691 | 					// We received an error condition when adding the callback
 692 | 					if (error.code && error.code === "DUP")
 693 | 						return crawler.emit("queueduplicate",parsedURL);
 694 | 
 695 | 					return crawler.emit("queueerror",error,parsedURL);
 696 | 				}
 697 | 
 698 | 				crawler.emit("queueadd",newQueueItem,parsedURL);
 699 | 				newQueueItem.referrer = queueItem ? queueItem.url : null;
 700 | 			}
 701 | 		);
 702 | 	}
 703 | 
 704 | 	return true;
 705 | };
 706 | 
 707 | /*
 708 | 	Public: The guts of the crawler: takes a queue item and spools a request for
 709 | 	it, downloads, caches, and fires events based on the result of the request.
 710 | 	It kicks off resource discovery and queues any new resources found.
 711 | 
 712 | 	queueItem	- Queue item to be fetched.
 713 | 
 714 | 	Emits
 715 | 		fetchstart
 716 | 		fetchheaders
 717 | 		fetchcomplete
 718 | 		fetchdataerror
 719 | 		notmodified
 720 | 		fetchredirect
 721 | 		fetch404
 722 | 		fetcherror
 723 | 		fetchclienterror
 724 | 
 725 | 	Examples
 726 | 
 727 | 		crawler.fetchQueueItem(queueItem);
 728 | 
 729 | 	Returns the crawler object for chaining.
 730 | 
 731 | */
 732 | Crawler.prototype.fetchQueueItem = function(queueItem) {
 733 | 	var crawler = this;
 734 | 	crawler._openRequests ++;
 735 | 
 736 | 	// Variable declarations
 737 | 	var fetchData = false,
 738 | 		requestOptions,
 739 | 		clientRequest,
 740 | 		timeCommenced;
 741 | 
 742 | 	// Mark as spooled
 743 | 	queueItem.status = "spooled";
 744 | 	var client = (queueItem.protocol === "https" ? https : http);
 745 | 
 746 | 	// Up the socket limit if required.
 747 | 	if (client.globalAgent.maxSockets < crawler.maxConcurrency) {
 748 | 		client.globalAgent.maxSockets = crawler.maxConcurrency;
 749 | 	}
 750 | 
 751 | 	// Extract request options from queue;
 752 | 	var requestHost = queueItem.host,
 753 | 		requestPort = queueItem.port,
 754 | 		requestPath = queueItem.path;
 755 | 
 756 | 	// Are we passing through an HTTP proxy?
 757 | 	if (crawler.useProxy) {
 758 | 		requestHost = crawler.proxyHostname;
 759 | 		requestPort = crawler.proxyPort;
 760 | 		requestPath = queueItem.url;
 761 | 	}
 762 | 
 763 | 	// Load in request options
 764 | 	requestOptions = {
 765 | 		method:	"GET",
 766 | 		host:	requestHost,
 767 | 		port:	requestPort,
 768 | 		path:	requestPath,
 769 | 		headers: {
 770 | 			"User-Agent":	crawler.userAgent,
 771 | 			"Host":			queueItem.host + (
 772 | 							queueItem.port !== 80 ?
 773 | 								":" + queueItem.port :
 774 | 								""
 775 | 							),
 776 | 			"Referer":		queueItem.referrer
 777 | 		}
 778 | 	};
 779 | 
 780 | 	// If port is one of the HTTP/HTTPS defaults, delete the option to avoid conflicts
 781 | 	if (requestOptions.port === 80 || requestOptions.port === 443) {
 782 | 		delete requestOptions.port;
 783 | 	}
 784 | 
 785 | 	// Add cookie header from cookie jar if we're configured to
 786 | 	// send/accept cookies
 787 | 	if (crawler.acceptCookies && crawler.cookies.getAsHeader()) {
 788 | 		requestOptions.headers.cookie =
 789 | 			crawler.cookies.getAsHeader(queueItem.host,queueItem.path);
 790 | 	}
 791 | 
 792 | 	// Add auth headers if we need them
 793 | 	if (crawler.needsAuth) {
 794 | 		var auth = crawler.authUser + ":" + crawler.authPass;
 795 | 
 796 | 		// Generate auth header
 797 | 		auth = 'Basic ' + (new Buffer(auth).toString('base64'));
 798 | 		requestOptions.headers.Authorization = auth;
 799 | 	}
 800 | 
 801 | 	// Add proxy auth if we need it
 802 | 	if (crawler.proxyUser !== null && crawler.proxyPass !== null) {
 803 | 		var proxyAuth = crawler.proxyUser + ":" + crawler.proxyPass;
 804 | 
 805 | 		// Generate auth header
 806 | 		proxyAuth = 'Basic ' + (new Buffer(proxyAuth).toString('base64'));
 807 | 		requestOptions.headers["Proxy-Authorization"] = proxyAuth;
 808 | 	}
 809 | 
 810 | 	// And if we've got any custom headers available
 811 | 	if (crawler.customHeaders) {
 812 | 		for (var header in crawler.customHeaders) {
 813 | 			if (!crawler.customHeaders.hasOwnProperty(header)) continue;
 814 | 
 815 | 			requestOptions.headers[header] = crawler.customHeaders[header];
 816 | 		}
 817 | 	}
 818 | 
 819 | 	// Emit fetchstart event - gives the user time to mangle the request options
 820 | 	// if required.
 821 | 	crawler.emit("fetchstart", queueItem, requestOptions);
 822 | 
 823 | 	process.nextTick(function() {
 824 | 		// Record what time we started this request
 825 | 		timeCommenced = Date.now();
 826 | 
 827 | 		// Get the resource!
 828 | 		clientRequest =
 829 | 			client.request(requestOptions,function(response) {
 830 | 				crawler.handleResponse(queueItem,response,timeCommenced);
 831 | 			});
 832 | 
 833 | 		clientRequest.end();
 834 | 
 835 | 		clientRequest.setTimeout(crawler.timeout, function() {
 836 | 			clientRequest.abort();
 837 | 			crawler.emit("fetchtimeout",queueItem,crawler.timeout);
 838 | 		});
 839 | 
 840 | 		clientRequest.on("error",function(errorData) {
 841 | 			crawler._openRequests --;
 842 | 
 843 | 			// Emit 5xx / 4xx event
 844 | 			crawler.emit("fetchclienterror",queueItem,errorData);
 845 | 			queueItem.fetched = true;
 846 | 			queueItem.stateData.code = 599;
 847 | 			queueItem.status = "failed";
 848 | 		});
 849 | 
 850 | 		return crawler;
 851 | 	});
 852 | };
 853 | 
 854 | 
 855 | /*
 856 | 	Public: Given a queueItem and a matching response object, the crawler will
 857 | 	handle downloading the resource, queueing of linked items, etc.
 858 | 
 859 | 	Examples
 860 | 
 861 | 		// Passing in a response from `request`
 862 | 		request(queueItem.url,function(err,res,body) {
 863 | 			crawler.handleResponse(queueItem,res);
 864 | 		});
 865 | 
 866 | 	Returns the crawler object for chaining.
 867 | 
 868 | */
 869 | Crawler.prototype.handleResponse = function(queueItem,response,timeCommenced) {
 870 | 	var crawler = this,
 871 | 		dataReceived = false,
 872 | 		timeHeadersReceived,
 873 | 		timeDataReceived,
 874 | 		parsedURL,
 875 | 		responseBuffer,
 876 | 		responseLength,
 877 | 		responseLengthReceived = 0,
 878 | 		contentType,
 879 | 		stateData = queueItem.stateData;
 880 | 
 881 | 	// Record what time we first received the header information
 882 | 	timeHeadersReceived = Date.now();
 883 | 
 884 | 	// If we weren't passed a time of commencement, assume Now()
 885 | 	timeCommenced = timeCommenced || Date.now();
 886 | 
 887 | 	responseLength = parseInt(response.headers["content-length"],10);
 888 | 	responseLength = !isNaN(responseLength) ? responseLength : 0;
 889 | 
 890 | 	// Save timing and content some header information into queue
 891 | 	stateData.requestLatency	= (timeHeadersReceived - timeCommenced);
 892 | 	stateData.requestTime		= (timeHeadersReceived - timeCommenced);
 893 | 	stateData.contentLength		= responseLength;
 894 | 	stateData.contentType		= contentType = response.headers["content-type"];
 895 | 	stateData.code				= response.statusCode;
 896 | 	stateData.headers			= response.headers;
 897 | 
 898 | 	// Do we need to save cookies? Were we sent any?
 899 | 	if (crawler.acceptCookies &&
 900 | 		response.headers.hasOwnProperty('set-cookie'))
 901 | 			crawler.cookies.addFromHeaders(response.headers["set-cookie"]);
 902 | 
 903 | 	// Emit header receive event
 904 | 	crawler.emit("fetchheaders",queueItem,response);
 905 | 
 906 | 	// Ensure response length is reasonable...
 907 | 	responseLength =
 908 | 		responseLength > 0 ? responseLength : crawler.maxResourceSize;
 909 | 
 910 | 	queueItem.stateData.contentLength = responseLength;
 911 | 
 912 | 	// Function for dealing with 200 responses
 913 | 	function processReceivedData() {
 914 | 		if (queueItem.fetched) return;
 915 | 
 916 | 		timeDataReceived = (new Date().getTime());
 917 | 
 918 | 		queueItem.fetched = true;
 919 | 		queueItem.status = "downloaded";
 920 | 
 921 | 		// Save state information
 922 | 		stateData.downloadTime		= (timeDataReceived - timeHeadersReceived);
 923 | 		stateData.requestTime		= (timeDataReceived - timeCommenced);
 924 | 		stateData.actualDataSize	= responseBuffer.length;
 925 | 		stateData.sentIncorrectSize = responseBuffer.length !== responseLength;
 926 | 
 927 | 		// First, save item to cache (if we're using a cache!)
 928 | 		if (crawler.cache !== null &&
 929 | 			crawler.cache.setCacheData instanceof Function) {
 930 | 
 931 | 			crawler.cache.setCacheData(queueItem,responseBuffer);
 932 | 		}
 933 | 
 934 | 		// Is the item allowed by depth conditions ?
 935 | 		if(crawler.depthAllowed(queueItem)) {
 936 | 			crawler.emit("fetchcomplete",queueItem,responseBuffer,response);
 937 | 
 938 | 			// We only process the item if it's of a valid mimetype
 939 | 			// and only if the crawler is set to discover its own resources
 940 | 			if (crawler.mimeTypeSupported(contentType) && crawler.discoverResources) {
 941 | 				crawler.queueLinkedItems(responseBuffer,queueItem);
 942 | 			}
 943 | 		}
 944 | 
 945 | 		crawler._openRequests --;
 946 | 	}
 947 | 
 948 | 	function receiveData(chunk) {
 949 | 		if (chunk && chunk.length && !dataReceived) {
 950 | 			if (responseLengthReceived + chunk.length > responseBuffer.length) {
 951 | 				// Oh dear. We've been sent more data than we were initially told.
 952 | 				// This could be a mis-calculation, or a streaming resource.
 953 | 				// Let's increase the size of our buffer to match, as long as it isn't
 954 | 				// larger than our maximum resource size.
 955 | 
 956 | 				if (responseLengthReceived + chunk.length <= crawler.maxResourceSize) {
 957 | 
 958 | 					// Start by creating a new buffer, which will be our main
 959 | 					// buffer from now on...
 960 | 
 961 | 					var tmpNewBuffer = new Buffer(responseLengthReceived + chunk.length);
 962 | 
 963 | 					// Copy all our old data into it...
 964 | 					responseBuffer.copy(tmpNewBuffer,0,0,responseBuffer.length);
 965 | 
 966 | 					// And now the new chunk
 967 | 					chunk.copy(tmpNewBuffer,responseBuffer.length,0,chunk.length);
 968 | 
 969 | 					// And now make the response buffer our new buffer,
 970 | 					// leaving the original for GC
 971 | 					responseBuffer = tmpNewBuffer;
 972 | 
 973 | 				} else {
 974 | 					// Oh dear oh dear! The response is not only more data
 975 | 					// than we were initially told, but it also exceeds the
 976 | 					// maximum amount of data we're prepared to download per
 977 | 					// resource.
 978 | 					//
 979 | 					// Throw error event and ignore.
 980 | 					//
 981 | 					// We'll then deal with the data that we have.
 982 | 
 983 | 					crawler.emit("fetchdataerror",queueItem,response);
 984 | 				}
 985 | 			} else {
 986 | 				// Copy the chunk data into our main buffer
 987 | 				chunk.copy(responseBuffer,responseLengthReceived,0,chunk.length);
 988 | 			}
 989 | 
 990 | 			// Increment our data received counter
 991 | 			responseLengthReceived += chunk.length;
 992 | 		}
 993 | 
 994 | 
 995 | 		if ((responseLengthReceived >= responseLength || response.complete) &&
 996 | 			!dataReceived) {
 997 | 
 998 | 			// Slice the buffer to chop off any unused space
 999 | 			responseBuffer = responseBuffer.slice(0,responseLengthReceived);
1000 | 
1001 | 			dataReceived = true;
1002 | 			processReceivedData();
1003 | 		}
1004 | 	}
1005 | 
1006 | 	// If we should just go ahead and get the data
1007 | 	if (response.statusCode >= 200 && response.statusCode < 300 &&
1008 | 		responseLength <= crawler.maxResourceSize) {
1009 | 
1010 | 		queueItem.status = "headers";
1011 | 
1012 | 		// Create a buffer with our response length
1013 | 		responseBuffer = new Buffer(responseLength);
1014 | 
1015 | 		response.on("data",receiveData);
1016 | 		response.on("end",receiveData);
1017 | 
1018 | 	// We've got a not-modified response back
1019 | 	} else if (response.statusCode === 304) {
1020 | 
1021 | 		if (crawler.cache !== null && crawler.cache.getCacheData) {
1022 | 			// We've got access to a cache
1023 | 			crawler.cache.getCacheData(queueItem,function(cacheObject) {
1024 | 				crawler.emit("notmodified",queueItem,response,cacheObject);
1025 | 			});
1026 | 		} else {
1027 | 			// Emit notmodified event. We don't have a cache available, so
1028 | 			// we don't send any data.
1029 | 			crawler.emit("notmodified",queueItem,response);
1030 | 		}
1031 | 
1032 | 	// If we should queue a redirect
1033 | 	} else if (response.statusCode >= 300 && response.statusCode < 400 &&
1034 | 					response.headers.location) {
1035 | 
1036 | 		queueItem.fetched = true;
1037 | 		queueItem.status = "redirected";
1038 | 
1039 | 		// Parse the redirect URL ready for adding to the queue...
1040 | 		parsedURL = crawler.processURL(response.headers.location,queueItem);
1041 | 
1042 | 		// Emit redirect event
1043 | 		crawler.emit("fetchredirect",queueItem,parsedURL,response);
1044 | 
1045 | 		// Clean URL, add to queue...
1046 | 		crawler.queueURL(parsedURL,queueItem);
1047 | 
1048 | 		crawler._openRequests --;
1049 | 
1050 | 	// Ignore this request, but record that we had a 404
1051 | 	} else if (response.statusCode === 404) {
1052 | 		queueItem.fetched = true;
1053 | 		queueItem.status = "notfound";
1054 | 
1055 | 		// Emit 404 event
1056 | 		crawler.emit("fetch404",queueItem,response);
1057 | 
1058 | 		crawler._openRequests --;
1059 | 
1060 | 	// And oh dear. Handle this one as well. (other 400s, 500s, etc)
1061 | 	} else {
1062 | 		queueItem.fetched = true;
1063 | 		queueItem.status = "failed";
1064 | 
1065 | 		// Emit 5xx / 4xx event
1066 | 		crawler.emit("fetcherror",queueItem,response);
1067 | 
1068 | 		crawler._openRequests --;
1069 | 	}
1070 | 
1071 | 	return crawler;
1072 | };
1073 | 
1074 | /*
1075 | 	Public: The main crawler runloop. Fires at the interval specified in the
1076 | 	crawler configuration, when the crawl is running. May be manually fired.
1077 | 	This function initiates fetching of a queue item if there are enough workers
1078 | 	to do so and there are unfetched items in the queue.
1079 | 
1080 | 	Examples
1081 | 
1082 | 		crawler.crawl();
1083 | 
1084 | 	Returns the crawler object for chaining.
1085 | 
1086 | */
1087 | Crawler.prototype.crawl = function() {
1088 | 	var crawler = this;
1089 | 
1090 | 	if (crawler._openRequests > crawler.maxConcurrency) return;
1091 | 
1092 | 	crawler.queue.oldestUnfetchedItem(function(err, queueItem) {
1093 | 
1094 | 		if (queueItem) {
1095 | 			crawler.fetchQueueItem(queueItem);
1096 | 
1097 | 		} else if (	!crawler._openRequests &&
1098 | 					!crawler._openListeners) {
1099 | 
1100 | 			crawler.queue.complete(function(err, completeCount) {
1101 | 				if (err) throw err;
1102 | 
1103 | 				crawler.queue.getLength(function(err, length) {
1104 | 					if (err) throw err;
1105 | 
1106 | 					if (completeCount === length) {
1107 | 						crawler.emit("complete");
1108 | 						crawler.stop();
1109 | 					}
1110 | 				});
1111 | 			});
1112 | 		}
1113 | 	});
1114 | 
1115 | 	return crawler;
1116 | };
1117 | 
1118 | /*
1119 | 	Public: Stops the crawler, terminating the crawl runloop.
1120 | 
1121 | 	Examples
1122 | 
1123 | 		crawler.stop();
1124 | 
1125 | 	Returns the crawler object for chaining.
1126 | 
1127 | */
1128 | Crawler.prototype.stop = function() {
1129 | 	var crawler = this;
1130 | 	clearInterval(crawler.crawlIntervalID);
1131 | 	crawler.running = false;
1132 | 	return crawler;
1133 | };
1134 | 
1135 | /*
1136 | 	Public: Holds the crawler in a 'running' state, preventing the `complete`
1137 | 	event from firing until the callback this function returns has been executed,
1138 | 	or a predetermined timeout (as specified by `crawler.listenerTTL`) has
1139 | 	elapsed.
1140 | 
1141 | 	Examples
1142 | 
1143 | 		crawler.on("fetchcomplete",function(queueItem,data) {
1144 | 			continue = this.wait();
1145 | 			doSomethingThatTakesAlongTime(function callback() {
1146 | 				continue();
1147 | 			});
1148 | 		});
1149 | 
1150 | 	Returns callback which will allow the crawler to continue.
1151 | 
1152 | */
1153 | Crawler.prototype.wait = function() {
1154 | 	var crawler = this,
1155 | 		cleared = false,
1156 | 		timeout =
1157 | 			setTimeout(function() {
1158 | 				if (cleared) return;
1159 | 				cleared = true;
1160 | 				crawler._openListeners --;
1161 | 			}, crawler.listenerTTL);
1162 | 
1163 | 	crawler._openListeners ++;
1164 | 
1165 | 	return function() {
1166 | 		if (cleared) return;
1167 | 		cleared = true;
1168 | 		crawler._openListeners --;
1169 | 		clearTimeout(timeout);
1170 | 	};
1171 | };
1172 | 
1173 | /*
1174 | 	Public: Given a function, this method adds it to an internal list maintained
1175 | 	by the crawler to be executed against each URL to determine whether it should
1176 | 	be fetched or not.
1177 | 
1178 | 	callback -	Function to be called when evaluating a URL. This function is
1179 | 				passed an object containing the protocol, hostname, port, and path
1180 | 				of a resource to be fetched. It can determine whether it should
1181 | 				be requested or not by returning a boolean - false for no, true
1182 | 				for yes.
1183 | 
1184 | 	Examples
1185 | 
1186 | 		crawler.addFetchCondition(function(parsedURL) {
1187 | 			return (parsedURL.host !== "evildomain.com");
1188 | 		});
1189 | 
1190 | 	Returns the ID of the fetch condition - used for removing it from the crawler
1191 | 	later.
1192 | 
1193 | */
1194 | Crawler.prototype.addFetchCondition = function(callback) {
1195 | 	var crawler = this;
1196 | 	if (callback instanceof Function) {
1197 | 		crawler._fetchConditions.push(callback);
1198 | 		return crawler._fetchConditions.length - 1;
1199 | 	} else {
1200 | 		throw new Error("Fetch Condition must be a function.");
1201 | 	}
1202 | };
1203 | 
1204 | /*
1205 | 	Public: Given the ID of an existing fetch condition, this function removes
1206 | 	it from the crawler's internal list of conditions.
1207 | 
1208 | 	index - ID of fetch condition to be removed.
1209 | 
1210 | 	Examples
1211 | 
1212 | 		crawler.removeFetchCondition(3);
1213 | 
1214 | 	Returns true if the fetch condition was removed, and throws an error if it
1215 | 	could not be found.
1216 | 
1217 | */
1218 | Crawler.prototype.removeFetchCondition = function(index) {
1219 | 	var crawler = this;
1220 | 	if (crawler._fetchConditions[index] &&
1221 | 		crawler._fetchConditions[index] instanceof Function) {
1222 | 
1223 | 		return !!crawler._fetchConditions.splice(index,1);
1224 | 	} else {
1225 | 		throw new Error("Unable to find indexed Fetch Condition.");
1226 | 	}
1227 | };
1228 | 
1229 | /*
1230 | 	Public: Given a URL it will remove the querstring if it exists.
1231 | 
1232 | 	url - URL from which to remove the querystring
1233 | 
1234 | 	Examples
1235 | 
1236 | 		crawler.removeQuerystring(url);
1237 | 
1238 | 	Returns URL without querystring if it exists
1239 | 
1240 | */
1241 | Crawler.prototype.removeQuerystring = function(url) {
1242 | 	if (url.indexOf("?") > -1) {
1243 | 		return url.substr(0,url.indexOf("?"));
1244 | 	} else {
1245 | 		return url;
1246 | 	}
1247 | };
1248 | 
1249 | module.exports = Crawler;
1250 | 


--------------------------------------------------------------------------------