├── .gitignore
├── README.md
├── jquery.js
├── main.js
├── package.json
└── tests
    ├── memory_leak.js
    ├── test_foodie.js
    └── test_nytimes.js

/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | 
6 | # Runtime data
7 | pids
8 | *.pid
9 | *.seed
10 | 
11 | # Directory for instrumented libs generated by jscoverage/JSCover
12 | lib-cov
13 | 
14 | # Coverage directory used by tools like istanbul
15 | coverage
16 | 
17 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
18 | .grunt
19 | 
20 | # node-waf configuration
21 | .lock-wscript
22 | 
23 | # Compiled binary addons (http://nodejs.org/api/addons.html)
24 | build/Release
25 | 
26 | # Dependency directory
27 | # https://docs.npmjs.com/misc/faq#should-i-check-my-node-modules-folder-into-git
28 | node_modules
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spider -- Programmable spidering of web sites with node.js and jQuery
2 | 
3 | ## Install
4 | 
5 | From source:
6 | 
7 | <pre>
8 |   git clone https://github.com/mikeal/spider.git
 9 |   cd spider
10 |   npm link
11 | 
12 | 
13 | ## Using the API
14 | 
15 | ### Creating a Spider
16 | <pre>
17 |   var spider = require('spider');
18 |   var s = spider();
19 | 
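For a quick sense of the flow, here is a minimal sketch that chains `route`, `get` and `log` (all documented in the sections below); the host, pattern and selector are made up for illustration:

<pre>
  var spider = require('spider');

  spider()
    .route('example.com', '/articles/*', function (window, $) {
      // `this` is the route match object, `$` is jQuery bound to the fetched page
      console.log('visited', this.url.href);
      // queue every link on the page; only URLs with a matching route are fetched
      $('a').spider();
    })
    .get('http://example.com/articles/')  // seed URL
    .log('info');
</pre>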
20 | 
21 | #### spider(options)
22 | 
23 | The `options` object can have the following fields:
24 | 
25 | * `maxSockets` - Integer giving the maximum number of sockets in the pool. Defaults to `4`.
26 | * `userAgent` - The User-Agent string sent to the remote server with each request. Defaults to `Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7` (a desktop Chrome User-Agent; the constant is named `firefox` in `main.js`).
27 | * `cache` - The cache object to use. Defaults to `NoCache`; see `main.js` (and the cache section below) for the methods a cache object must implement.
28 | * `pool` - A hash object containing the agents for the requests. If omitted, requests use a pool whose size is set by `maxSockets`.
29 | 
30 | ### Adding a Route Handler
31 | 
32 | #### spider.route(hosts, pattern, cb)
33 | Where the parameters are the following:
34 | 
35 | * `hosts` - A string -- or an array of strings -- giving the `host` part of the targeted URL(s).
36 | * `pattern` - The pattern that spider matches against the remainder (`pathname` + `search` + `hash`) of the URL(s).
37 | * `cb` - A function of the form `function(window, $)` where
38 |   * `this` - References the object returned by `Routes.match`, with some extra fields added by spider. For more info see https://github.com/aaronblohowiak/routes.js
39 |   * `window` - References the document's window.
40 |   * `$` - References the jQuery object.
41 | 
42 | ### Queueing a URL for spider to fetch
43 | 
44 | `spider.get(url)` where `url` is the URL to fetch.
45 | 
46 | ### Extending / Replacing the MemoryCache
47 | 
48 | Any replacement for the MemoryCache must provide the following methods (a minimal sketch follows this list):
49 | 
50 | * `get(url, cb)` - Passes `url`'s cached `headers` and `body` to the `cb` callback/continuation if the entry exists, `null` otherwise.
51 |   * `cb` - Must be of the form `function(retval) {...}`
52 | * `getHeaders(url, cb)` - Passes `url`'s cached `headers` to the `cb` callback/continuation if the entry exists, `null` otherwise.
53 |   * `cb` - Must be of the form `function(retval) {...}`
54 | * `set(url, headers, body)` - Sets/Saves `url`'s `headers` and `body` in the cache.
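For example, a minimal in-memory cache satisfying this interface could look like the sketch below (untested; `SimpleCache` is a made-up name, and `MemoryCache` in `main.js` follows the same shape):

<pre>
  function SimpleCache () { this.entries = {}; }

  // get(url, cb): pass {headers, body} to cb if cached, null otherwise
  SimpleCache.prototype.get = function (url, cb) {
    var e = this.entries[url];
    cb(e ? { headers: e.headers, body: e.body } : null);
  };

  // getHeaders(url, cb): pass just the cached headers, or null
  SimpleCache.prototype.getHeaders = function (url, cb) {
    var e = this.entries[url];
    cb(e ? e.headers : null);
  };

  // set(url, headers, body): store the response for later conditional requests
  SimpleCache.prototype.set = function (url, headers, body) {
    this.entries[url] = { headers: headers, body: body };
  };

  var s = spider({ cache: new SimpleCache() });
</pre>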
55 | 56 | ### Setting the verbose/log level 57 | `spider.log(level)` - Where `level` is a string that can be any of `"debug"`, `"info"`, `"error"` 58 | -------------------------------------------------------------------------------- /main.js: -------------------------------------------------------------------------------- 1 | var request = require('request') 2 | , fs = require('fs') 3 | , sys = require('sys') 4 | , path = require('path') 5 | , vm = require('vm') 6 | , jsdom = require('jsdom') 7 | , util = require('util') 8 | , urlParse = require('url').parse 9 | , urlResolve = require('url').resolve 10 | , routes = require('routes') 11 | , events = require('events') 12 | , util = require('util') 13 | , cookiejar = require('cookiejar') 14 | ; 15 | 16 | var headers = 17 | { 'accept': "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5" 18 | , 'accept-language': 'en-US,en;q=0.8' 19 | , 'accept-charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3' 20 | } 21 | 22 | var firefox = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) ' + 23 | 'AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7' 24 | 25 | 26 | var jqueryFilename = path.join(__dirname, 'jquery.js') 27 | , jquery = fs.readFileSync(jqueryFilename).toString() 28 | ; 29 | 30 | var copy = function (obj) { 31 | var n = {} 32 | for (i in obj) { 33 | n[i] = obj[i]; 34 | } 35 | return n 36 | } 37 | 38 | jsdom.defaultDocumentFeatures = 39 | { FetchExternalResources : [] 40 | , ProcessExternalResources : false 41 | , MutationEvents : false 42 | , QuerySelector : false 43 | } 44 | 45 | var debug = 1 46 | , info = 50 47 | , error = 100 48 | ; 49 | 50 | var isUrl = /^https?:/; 51 | 52 | var logLevels = {debug:debug, info:info, error:error, 1:'debug', 50:'info', 100:'error'} 53 | 54 | function MemoryCache () { 55 | this.cache = {}; 56 | } 57 | MemoryCache.prototype.get = function (url, cb) { 58 | if (!this.cache[url]) return cb(null); 59 | cb({headers:this.cache[url].headers, body:this.cache[url].body.toString()}); 60 | } 61 | MemoryCache.prototype.set = function (url, headers, body) { 62 | this.cache[url] = {headers:headers, body:new Buffer(body)}; 63 | } 64 | MemoryCache.prototype.getHeaders = function (url, cb) { 65 | if (!this.cache[url]) return cb(null); 66 | cb(this.cache[url].headers); 67 | } 68 | 69 | function NoCache () {}; 70 | NoCache.prototype.get = function (url, cb) { cb(null) }; 71 | NoCache.prototype.getHeaders = function (url, cb) {cb(null)}; 72 | NoCache.prototype.set = function (url, headers, body) {}; 73 | 74 | function Spider (options) { 75 | this.maxSockets = options.maxSockets || 4; 76 | this.userAgent = options.userAgent || firefox; 77 | this.cache = options.cache || new NoCache(); 78 | this.pool = options.pool || {maxSockets: options.maxSockets}; 79 | this.options = options; 80 | this.currentUrl = null; 81 | this.routers = {}; 82 | this.urls = []; 83 | this.jar = cookiejar.CookieJar(); 84 | } 85 | util.inherits(Spider, events.EventEmitter) 86 | Spider.prototype.get = function (url, referer) { 87 | var self = this 88 | , h = copy(headers) 89 | ; 90 | referer = referer || this.currentUrl; 91 | 92 | url = url.slice(0, (url.indexOf('#') === -1) ? url.length : url.indexOf('#')) 93 | 94 | if (this.urls.indexOf(url) !== -1) { 95 | // Already handled this request 96 | this.emit('log', debug, 'Already received one get request for '+url+'. 
skipping.') 97 | return this; 98 | } 99 | this.urls.push(url); 100 | 101 | var u = urlParse(url); 102 | if (!this.routers[u.host]) { 103 | this.emit('log', debug, 'No routes for host: '+u.host+'. skipping.') 104 | return this; 105 | } 106 | if (!this.routers[u.host].match(u.href.slice(u.href.indexOf(u.host)+u.host.length))) { 107 | this.emit('log', debug, 'No routes for path '+u.href.slice(u.href.indexOf(u.host)+u.host.length)+'. skipping.') 108 | return this; 109 | } 110 | 111 | if (referer) h.referer = referer; 112 | h['user-agent'] = this.userAgent; 113 | 114 | this.cache.getHeaders(url, function (c) { 115 | if (c) { 116 | if (c['last-modifed']) { 117 | h['if-modified-since'] = c['last-modified']; 118 | } 119 | if (c.etag) { 120 | h['if-none-match'] = c.etag; 121 | } 122 | } 123 | 124 | var cookies = self.jar.getCookies(cookiejar.CookieAccessInfo(u.host, u.pathname)); 125 | if (cookies) { 126 | h.cookie = cookies.join(";"); 127 | } 128 | 129 | request.get({url:url, headers:h, pool:self.pool}, function (e, resp, body) { 130 | self.emit('log', debug, 'Response received for '+url+'.') 131 | if (e) { 132 | self.emit('log', error, e); 133 | return; 134 | } 135 | if (resp.statusCode === 304) { 136 | self.cache.get(url, function (c_) { 137 | self._handler(url, referer, {fromCache:true, headers:c_.headers, body:c_.body}) 138 | }); 139 | return; 140 | } else if (resp.statusCode !== 200) { 141 | self.emit('log', debug, 'Request did not return 200. '+url); 142 | return; 143 | } else if (!resp.headers['content-type'] || resp.headers['content-type'].indexOf('html') === -1) { 144 | self.emit('log', debug, 'Content-Type does not match. '+url); 145 | return; 146 | } 147 | if (resp.headers['set-cookie']) { 148 | try { self.jar.setCookies(resp.headers['set-cookie']) } 149 | catch(e) {} 150 | } 151 | self.cache.set(url, resp.headers, body); 152 | self._handler(url, referer, {fromCache:false, headers:resp.headers, body:body}); 153 | }) 154 | }); 155 | return this; 156 | } 157 | Spider.prototype.route = function (hosts, pattern, cb) { 158 | var self = this; 159 | if (typeof hosts === 'string') { 160 | hosts = [hosts]; 161 | } 162 | hosts.forEach(function (host) { 163 | if (!self.routers[host]) self.routers[host] = new routes.Router(); 164 | self.routers[host].addRoute(pattern, cb); 165 | }) 166 | return self; 167 | } 168 | Spider.prototype._handler = function (url, referer, response) { 169 | var u = urlParse(url) 170 | , self = this 171 | ; 172 | if (this.routers[u.host]) { 173 | var r = this.routers[u.host].match(u.href.slice(u.href.indexOf(u.host)+u.host.length)); 174 | r.spider = this; 175 | r.response = response 176 | r.url = u; 177 | 178 | var document = jsdom.jsdom(response.body, null, {}) 179 | var window = document.parentWindow; 180 | window.run(jquery, jqueryFilename) 181 | 182 | window.$.fn.spider = function () { 183 | this.each(function () { 184 | var h = window.$(this).attr('href'); 185 | if (!isUrl.test(h)) { 186 | h = urlResolve(url, h); 187 | } 188 | self.get(h, url); 189 | }) 190 | } 191 | 192 | this.currentUrl = url; 193 | if (jsdom.defaultDocumentFeatures.ProcessExternalResources) { 194 | $(function () { r.fn.call(r, window, window.$); }) 195 | } else { 196 | r.fn.call(r, window, window.$); 197 | } 198 | this.currentUrl = null; 199 | window.close(); //fix suggested by 200 | } 201 | } 202 | Spider.prototype.log = function (level) { 203 | if (typeof level === 'string') level = logLevels[level]; 204 | this.on('log', function (l, text) { 205 | if (l >= level) { 206 | 
console.log('['+(logLevels[l] || l)+']', text) 207 | } 208 | }) 209 | return this; 210 | } 211 | 212 | function ZombieSpider (options) { 213 | var zombie = require('zombie'); 214 | this.browser = new zombie.Browser({ debug: options }); 215 | if (typeof options.runScripts !== 'undefined') { 216 | options.runScripts = false; 217 | } 218 | this.browser.runScripts = options.runScripts; 219 | 220 | this.get = function () {}; 221 | } 222 | util.inherits(ZombieSpider, Spider); 223 | 224 | module.exports = function (options) {return new Spider(options || {})} 225 | module.exports.jsdom = jsdom; 226 | 227 | 228 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { "name" : "spider" 2 | , "description" : "Programmable spidering of web sites with node.js and jQuery" 3 | , "tags" : ["dom","javascript","crawling","jquery", "spider", "spidering"] 4 | , "version" : "0.1.0" 5 | , "author" : "Mikeal Rogers " 6 | , "repository" : 7 | { "type" : "git" 8 | , "url" : "http://github.com/mikeal/spider.git" 9 | } 10 | , "bugs" : 11 | { "web" : "http://github.com/mikeal/spider/issues" } 12 | , "engines" : ["node >= 0.6.4"] 13 | , "main" : "./main" 14 | , "dependencies": 15 | { "request" : ">= 1.9.3" 16 | , "jsdom" : ">= 0.2.13" 17 | , "routes" : ">= 0.1.0" 18 | , "cookiejar": ">= 1.3.0" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/memory_leak.js: -------------------------------------------------------------------------------- 1 | /** 2 | * AS taken from this gist https://gist.github.com/gmarcus/934787 3 | * original dies after 840 fetches 4 | * @type {*} 5 | */ 6 | 7 | var sys = require('sys'); 8 | var util = require('util'); 9 | var spider = require('../main'); 10 | var counter = 0; 11 | 12 | spider() 13 | .route('itunes.apple.com', '/us/genre/*', function (window, $) { 14 | if (this.fromCache) return; 15 | 16 | console.log("Fetching page: %s, for the %s th time", this.spider.currentUrl, ++counter); 17 | 18 | // spider all genres 19 | $('div#genre-nav.main.nav a').spider(); 20 | 21 | // spider all letters per genre 22 | $('div#selectedgenre ul.list.alpha li a').spider(); 23 | 24 | // spider all numbered pages of letters per genre 25 | $('div#selectedgenre ul.list.paginate li a').spider(); 26 | 27 | 28 | // // fetch apps JSON and store in a database (not implemented yet) 29 | // $("#selectedgenre .column a").each(function(i,a) { 30 | // // extract the iTunes URL 31 | // var aHref = a.href; 32 | // console.log("Recording " + aHref); 33 | // 34 | // }); 35 | 36 | // console.log(util.inspect(this.spider, false, null)); 37 | }) 38 | .get('http://itunes.apple.com/us/genre/ios/id36?mt=8') 39 | .log('info') 40 | ; -------------------------------------------------------------------------------- /tests/test_foodie.js: -------------------------------------------------------------------------------- 1 | var spider = require('../main') 2 | , urlParse = require('url').parse 3 | ; 4 | 5 | var dishes = []; 6 | 7 | process.on('exit', function () { 8 | console.log('number,dish,restaurant,address,telephone,info,title') 9 | dishes.forEach(function (row) { 10 | var line = ''; 11 | line += JSON.stringify(row.number || '') 12 | line += ',' 13 | line += JSON.stringify(row.dish || '') 14 | line += ',' 15 | line += JSON.stringify(row.restaurant || '') 16 | line += ',' 17 | line += JSON.stringify(row.address || '') 18 | line += ',' 19 | line += 
JSON.stringify(row.telephone || '') 20 | line += ',' 21 | line += JSON.stringify(row.info || '') 22 | line += ',' 23 | line += JSON.stringify(row.title || '') 24 | console.log(line) 25 | }) 26 | }) 27 | 28 | var s = spider() 29 | s.route('blogs.sfweekly.com', '/foodie/sfoodies_92/index.php*', function (window, $) { 30 | $('div.primaryCategory:contains("SFoodie\'s 92")') 31 | .each(function () { 32 | var entry = $(this).parent() 33 | , title = entry.find('h2').text() 34 | ; 35 | 36 | number = parseInt(title.slice(4, title.indexOf(':'))) 37 | 38 | if (isNaN(number)) return; 39 | 40 | title = title.slice(title.indexOf(':')+2) 41 | 42 | var restaurant, dish; 43 | if (title.indexOf(' at ') !== -1) { 44 | dish = title.slice(0, title.indexOf(' at ')) 45 | restaurant = title.slice(title.indexOf(' at ')+' at '.length); 46 | } else if (title.indexOf(' from ') !== -1) { 47 | dish = title.slice(0, title.indexOf(' from ')) 48 | restaurant = title.slice(title.indexOf(' from ')+' from '.length); 49 | } else if (title.indexOf('\'s') !== -1) { 50 | restaurant = title.slice(0, title.indexOf('\'s')) 51 | dish = title.slice(title.indexOf('\'s')+'\'s '.length); 52 | } else if (title.indexOf('s\'') !== -1) { 53 | restaurant = title.slice(0, title.indexOf('s\'')+1) 54 | dish = title.slice(title.indexOf('s\'')+'s\' '.length); 55 | } 56 | 57 | var infourl = entry.find('a.moreLink') 58 | , csv = {number:number,title:title,dish:dish,restaurant:restaurant} 59 | , info 60 | ; 61 | if (infourl.length !== 0) { 62 | var u = urlParse(infourl.attr('href')); 63 | s.route(u.hostname, u.pathname, function (window, $) { 64 | var text = $('strong:contains("'+restaurant+'")').parent().text() 65 | text = text.slice(text.indexOf(restaurant+':')+(restaurant+':').length+1) 66 | info = text.slice(0, text.indexOf('\n')) 67 | csv.info = info 68 | if (info.lastIndexOf('-') !== -1) { 69 | var phone = info.slice(info.lastIndexOf('-')-3, info.lastIndexOf('-')+5); 70 | if (!isNaN(parseInt(phone.replace('-','')))) { 71 | csv.telephone = phone 72 | csv.address = info.slice(0, info.indexOf(csv.phone)-2) 73 | } 74 | } 75 | }) 76 | s.get(u.href) 77 | } 78 | 79 | dishes.push(csv) 80 | }) 81 | 82 | $('a').spider(); 83 | }) 84 | s.get('http://blogs.sfweekly.com/foodie/sfoodies_92/index.php?page=1'); 85 | -------------------------------------------------------------------------------- /tests/test_nytimes.js: -------------------------------------------------------------------------------- 1 | var spider = require('../main'); 2 | 3 | spider() 4 | .route('www.nytimes.com', '/pages/dining/index.html', function (window, $) { 5 | $('a').spider(); 6 | }) 7 | .route('travel.nytimes.com', '*', function (window, $) { 8 | $('a').spider(); 9 | if (this.fromCache) return; 10 | 11 | var article = { title: $('nyt_headline').text(), articleBody: '', photos: [] } 12 | article.body = '' 13 | $('div.articleBody').each(function () { 14 | article.body += this.outerHTML; 15 | }) 16 | $('div#abColumn img').each(function () { 17 | var p = $(this).attr('src'); 18 | if (p.indexOf('ADS') === -1) { 19 | article.photos.push(p); 20 | } 21 | }) 22 | // console.log(article); 23 | }) 24 | .route('dinersjournal.blogs.nytimes.com', '*', function (window, $) { 25 | var article = {title: $('h1.entry-title').text()} 26 | // console.log($('div.entry-content').html()) 27 | }) 28 | .get('http://www.nytimes.com/pages/dining/index.html') 29 | .log('info') 30 | ; --------------------------------------------------------------------------------