├── .gitignore
├── README.md
├── jquery.js
├── main.js
├── package.json
└── tests
    ├── memory_leak.js
    ├── test_foodie.js
    └── test_nytimes.js

/.gitignore:
--------------------------------------------------------------------------------
# Logs
logs
*.log
npm-debug.log*

# Runtime data
pids
*.pid
*.seed

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage

# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# node-waf configuration
.lock-wscript

# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release

# Dependency directory
# https://docs.npmjs.com/misc/faq#should-i-check-my-node-modules-folder-into-git
node_modules
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Spider -- Programmable spidering of web sites with node.js and jQuery

## Install

From source:

<pre>
  git clone git://github.com/mikeal/spider.git
  cd spider
  npm link ../spider
</pre>

## API

### Creating a Spider

<pre>
  var spider = require('spider');
  var s = spider();
</pre>

#### spider(options)

The `options` object can have the following fields:

* `maxSockets` - Integer containing the maximum number of sockets in the pool. Defaults to `4`.
* `userAgent` - The User Agent string to be sent to the remote server along with our requests. Defaults to `Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7` (note: although the code names this variable `firefox`, it is a Chrome User Agent string).
* `cache` - The cache object to use. Defaults to `NoCache`; see the `MemoryCache` implementation in `main.js` for the interface a custom cache must provide.
* `pool` - A hash object containing the agents for the requests. If omitted, requests use the global pool, which is sized by `maxSockets`.

### Adding a Route Handler

#### spider.route(hosts, pattern, cb)

Where the parameters are the following:

* `hosts` - A string -- or an array of strings -- representing the `host` part of the targeted URL(s).
* `pattern` - The pattern against which spider tries to match the remainder (`pathname` + `search` + `hash`) of the URL(s).
* `cb` - A function of the form `function(window, $)` where:
  * `this` - References the object returned by `Routes.match`, with a few extra properties added by spider (`spider`, `response`, and `url`). For more info see https://github.com/aaronblohowiak/routes.js
  * `window` - References the document's window.
  * `$` - References the jQuery object.

### Queuing a URL for spider to fetch

`spider.get(url)`, where `url` is the URL to fetch.

### Extending / Replacing the MemoryCache

Currently a replacement cache must provide the following methods (an end-to-end sketch follows this list):

* `get(url, cb)` - Passes `url`'s cached `headers` and `body` to the `cb` callback if the entry exists; passes `null` otherwise.
  * `cb` - Must be of the form `function(retval) {...}`
* `getHeaders(url, cb)` - Passes `url`'s cached `headers` to `cb` if the entry exists; passes `null` otherwise.
  * `cb` - Must be of the form `function(retval) {...}`
* `set(url, headers, body)` - Sets/saves `url`'s `headers` and `body` in the cache.
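
As a rough end-to-end illustration of the API above, here is a sketch that plugs a custom cache into a spider, registers a route handler, and starts a crawl. The `SimpleCache` name, the `example.com` host, the `/articles/*` pattern, and the start URL are all invented for this example; only the `spider()`, `route`, `get`, `log`, and `$(...).spider()` calls come from this module.

<pre>
  var spider = require('spider');

  // Minimal cache implementing the get/getHeaders/set contract described above.
  function SimpleCache () { this.entries = {}; }
  SimpleCache.prototype.get = function (url, cb) {
    var e = this.entries[url];
    cb(e ? {headers: e.headers, body: e.body} : null);
  };
  SimpleCache.prototype.getHeaders = function (url, cb) {
    var e = this.entries[url];
    cb(e ? e.headers : null);
  };
  SimpleCache.prototype.set = function (url, headers, body) {
    this.entries[url] = {headers: headers, body: body};
  };

  var s = spider({maxSockets: 2, cache: new SimpleCache()});

  // Handle every page under /articles/ on example.com (hypothetical site).
  s.route('example.com', '/articles/*', function (window, $) {
    // `this` is the route match object; this.url is the parsed URL.
    console.log('visited', this.url.href);
    // Queue every link on the page; URLs with no matching route are skipped.
    $('a').spider();
  });

  s.log('info');
  s.get('http://example.com/articles/index.html');
</pre>

Any object exposing the same three methods (a file-backed or database-backed store, for example) can be swapped in the same way.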

### Setting the verbose/log level

`spider.log(level)` - Where `level` is a string: one of `"debug"`, `"info"`, or `"error"`.

--------------------------------------------------------------------------------
/main.js:
--------------------------------------------------------------------------------
var request = require('request')
  , fs = require('fs')
  , path = require('path')
  , vm = require('vm')
  , jsdom = require('jsdom')
  , util = require('util')
  , urlParse = require('url').parse
  , urlResolve = require('url').resolve
  , routes = require('routes')
  , events = require('events')
  , cookiejar = require('cookiejar')
  ;

// Default request headers sent with every fetch.
var headers =
  { 'accept': "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"
  , 'accept-language': 'en-US,en;q=0.8'
  , 'accept-charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'
  }

// Default User Agent string (a Chrome-on-Mac UA, despite the variable name).
var firefox = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) ' +
              'AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7'


var jqueryFilename = path.join(__dirname, 'jquery.js')
  , jquery = fs.readFileSync(jqueryFilename).toString()
  ;

// Shallow copy of a plain object.
var copy = function (obj) {
  var n = {}
  for (var i in obj) {
    n[i] = obj[i];
  }
  return n
}

// Keep jsdom lightweight: never fetch or execute external resources.
jsdom.defaultDocumentFeatures =
  { FetchExternalResources : []
  , ProcessExternalResources : false
  , MutationEvents : false
  , QuerySelector : false
  }

var debug = 1
  , info = 50
  , error = 100
  ;

var isUrl = /^https?:/;

// Numeric log levels, plus a reverse lookup from number back to name.
var logLevels = {debug:debug, info:info, error:error, 1:'debug', 50:'info', 100:'error'}

// Simple in-memory cache of fetched pages, keyed by URL.
function MemoryCache () {
  this.cache = {};
}
MemoryCache.prototype.get = function (url, cb) {
  if (!this.cache[url]) return cb(null);
  cb({headers:this.cache[url].headers, body:this.cache[url].body.toString()});
}
MemoryCache.prototype.set = function (url, headers, body) {
  this.cache[url] = {headers:headers, body:new Buffer(body)};
}
MemoryCache.prototype.getHeaders = function (url, cb) {
  if (!this.cache[url]) return cb(null);
  cb(this.cache[url].headers);
}

// Default cache: never stores anything.
function NoCache () {};
NoCache.prototype.get = function (url, cb) { cb(null) };
NoCache.prototype.getHeaders = function (url, cb) { cb(null) };
NoCache.prototype.set = function (url, headers, body) {};

function Spider (options) {
  this.maxSockets = options.maxSockets || 4;
  this.userAgent = options.userAgent || firefox;
  this.cache = options.cache || new NoCache();
  this.pool = options.pool || {maxSockets: this.maxSockets};
  this.options = options;
  this.currentUrl = null;
  this.routers = {};
  this.urls = [];
  this.jar = cookiejar.CookieJar();
}
util.inherits(Spider, events.EventEmitter)
Spider.prototype.get = function (url, referer) {
  var self = this
    , h = copy(headers)
    ;
  referer = referer || this.currentUrl;

  // Strip any fragment before queuing the URL.
  url = url.slice(0, (url.indexOf('#') === -1) ? url.length : url.indexOf('#'))

  if (this.urls.indexOf(url) !== -1) {
    // Already handled this request
    this.emit('log', debug, 'Already received one get request for '+url+'. skipping.')
    return this;
  }
  this.urls.push(url);

  var u = urlParse(url);
  if (!this.routers[u.host]) {
    this.emit('log', debug, 'No routes for host: '+u.host+'. skipping.')
    return this;
  }
  if (!this.routers[u.host].match(u.href.slice(u.href.indexOf(u.host)+u.host.length))) {
    this.emit('log', debug, 'No routes for path '+u.href.slice(u.href.indexOf(u.host)+u.host.length)+'. skipping.')
    return this;
  }

  if (referer) h.referer = referer;
  h['user-agent'] = this.userAgent;

  this.cache.getHeaders(url, function (c) {
    if (c) {
      // Send conditional request headers when we already have a cached copy.
      if (c['last-modified']) {
        h['if-modified-since'] = c['last-modified'];
      }
      if (c.etag) {
        h['if-none-match'] = c.etag;
      }
    }

    var cookies = self.jar.getCookies(cookiejar.CookieAccessInfo(u.host, u.pathname));
    if (cookies) {
      h.cookie = cookies.join(";");
    }

    request.get({url:url, headers:h, pool:self.pool}, function (e, resp, body) {
      self.emit('log', debug, 'Response received for '+url+'.')
      if (e) {
        self.emit('log', error, e);
        return;
      }
      if (resp.statusCode === 304) {
        // Not modified: serve the cached copy.
        self.cache.get(url, function (c_) {
          self._handler(url, referer, {fromCache:true, headers:c_.headers, body:c_.body})
        });
        return;
      } else if (resp.statusCode !== 200) {
        self.emit('log', debug, 'Request did not return 200. '+url);
        return;
      } else if (!resp.headers['content-type'] || resp.headers['content-type'].indexOf('html') === -1) {
        self.emit('log', debug, 'Content-Type does not match. '+url);
        return;
      }
      if (resp.headers['set-cookie']) {
        try { self.jar.setCookies(resp.headers['set-cookie']) }
        catch (e) {}
      }
      self.cache.set(url, resp.headers, body);
      self._handler(url, referer, {fromCache:false, headers:resp.headers, body:body});
    })
  });
  return this;
}
Spider.prototype.route = function (hosts, pattern, cb) {
  var self = this;
  if (typeof hosts === 'string') {
    hosts = [hosts];
  }
  hosts.forEach(function (host) {
    if (!self.routers[host]) self.routers[host] = new routes.Router();
    self.routers[host].addRoute(pattern, cb);
  })
  return self;
}
Spider.prototype._handler = function (url, referer, response) {
  var u = urlParse(url)
    , self = this
    ;
  if (this.routers[u.host]) {
    // Match the route again and decorate the match object for the handler.
    var r = this.routers[u.host].match(u.href.slice(u.href.indexOf(u.host)+u.host.length));
    r.spider = this;
    r.response = response
    r.url = u;

    var document = jsdom.jsdom(response.body, null, {})
    var window = document.parentWindow;
    window.run(jquery, jqueryFilename)

    // $(...).spider() queues the href of every matched element for fetching.
    window.$.fn.spider = function () {
      this.each(function () {
        var h = window.$(this).attr('href');
        if (!isUrl.test(h)) {
          h = urlResolve(url, h);
        }
        self.get(h, url);
      })
    }

    this.currentUrl = url;
    if (jsdom.defaultDocumentFeatures.ProcessExternalResources) {
      window.$(function () { r.fn.call(r, window, window.$); })
    } else {
      r.fn.call(r, window, window.$);
    }
    this.currentUrl = null;
    window.close(); //fix suggested by
  }
}
Spider.prototype.log = function (level) {
  if (typeof level === 'string') level = logLevels[level];
  this.on('log', function (l, text) {
    if (l >= level) {
      console.log('['+(logLevels[l] || l)+']', text)
    }
  })
  return this;
}

// Experimental zombie.js-backed spider; get() is still a stub and the
// exported factory below only ever constructs a plain Spider.
function ZombieSpider (options) {
  var zombie = require('zombie');
  this.browser = new zombie.Browser({ debug: options.debug });
  if (typeof options.runScripts === 'undefined') {
    options.runScripts = false;
  }
  this.browser.runScripts = options.runScripts;

  this.get = function () {};
}
util.inherits(ZombieSpider, Spider);

module.exports = function (options) {return new Spider(options || {})}
module.exports.jsdom = jsdom;


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{ "name" : "spider"
, "description" : "Programmable spidering of web sites with node.js and jQuery"
, "tags" : ["dom","javascript","crawling","jquery", "spider", "spidering"]
, "version" : "0.1.0"
, "author" : "Mikeal Rogers