├── .gitattributes ├── .gitignore ├── README.md ├── index.js ├── lib ├── common.js ├── pager.js └── xpath.js └── package.json /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | proxy.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scrapejs 2 | ======== 3 | 4 | A web scraping framework for node 5 | 6 | ## Introduction 7 | 8 | A powerful, easy-to-use web scraping framework built on top of request, cheerio, and xpath. It supports both XPath and jQuery selectors. 
9 | 10 | ## Install 11 | 12 | npm install scrapejs 13 | 14 | ## Samples 15 | ```javascript 16 | var sp = require('scrapejs').init({ 17 | cc: 2, // up to 2 concurrent requests 18 | delay: 5 * 1000 // delay 5 seconds before each request 19 | }); 20 | 21 | sp.load('https://www.google.com/search?q=scraping') 22 | .then(function($){ 23 | $.q("//h3[@class='r']/a").forEach(function(node){ 24 | var res = { 25 | title: node.textContent, 26 | url: node.x("./@href") 27 | } 28 | console.log(res); 29 | }) 30 | }) 31 | .fail(function(err){ 32 | console.log(err); 33 | }) 34 | 35 | ``` 36 | 37 | 38 | ## License 39 | 40 | (The MIT License) 41 | 42 | Copyright (c) 2013 Cung Nguyen 43 | 44 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 45 | 46 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 47 | 48 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 2 | var path = require('path') 3 | , fs = require('fs') 4 | , urlUtil = require('url') 5 | , async = require('async') 6 | , Q = require('q') 7 | , _ = require('underscore') 8 | , cheerio = require('cheerio') 9 | , common = require('./lib/common') 10 | 11 | 12 | /** 13 | * init an instance of the Scraper 14 | */ 15 | module.exports.init = function(options){ 16 | return new Scraper(options); 17 | } 18 | module.exports.common = common; 19 | 20 | 21 | 22 | /** 23 | * Constructor 24 | */ 25 | 26 | function Scraper(options){ 27 | var defaultOptions = { 28 | cc:1, 29 | delay:5, 30 | timeout:60*000, 31 | proxy: false, 32 | proxies:[], 33 | proxy_file: path.join(__dirname, '../proxy.txt'), 34 | proxy_auth:'', 35 | cache: false, 36 | dir: path.join(__dirname, "../") 37 | }; 38 | this.options = options || {} 39 | if(this.options.proxy_file || this.options.proxy_auth || this.options.proxies){ 40 | //auto enabling the proxies feature when caller provides one of these options 41 | this.options.proxy = true; 42 | } 43 | //override default optiosn with custom options 44 | this.options = common.mergeObjs(this.options, defaultOptions); 45 | 46 | this.__loadProxies(); 47 | 48 | this.request = require('request'); 49 | var self = this; 50 | var worker = function(options, cb){ 51 | self.__doLoad(options.url, options.options).then(function($){ 52 | cb();//indicate to the queue that this request is done 53 | options.handler(null, $); //pass response data to handler 54 | 55 | }).fail(function(err){ 56 | cb();//indicate to the queue that this request is done 57 | options.handler(err, null); //pass response data to handler 58 | }) 59 | } 60 | this.queue = async.queue(worker, options.cc); 61 | } 62 | 63 | //*** API **** 64 | 65 | //support: load(url), load(options), or load(url, options) 66 | Scraper.prototype.load = 
function(url, options){ 67 | if(typeof url === 'object'){ 68 | //case: load(options) 69 | options = url; 70 | url = options.url; 71 | } 72 | options = options || {} 73 | url = url || ''; 74 | if(options.form) options.method = options.method || 'POST'; 75 | if(url.contains('http://localhost') || url.contains('127.0.0.1')) options.no_proxy = true; 76 | 77 | var deferred = Q.defer(); 78 | this.queue.push({url:url, options:options, handler: function(err, $){ 79 | if(err) 80 | deferred.reject(err); 81 | else 82 | deferred.resolve($); 83 | }}) 84 | 85 | return deferred.promise; 86 | } 87 | 88 | Scraper.prototype.pagin = function(options){ 89 | require('./lib/pager').create({ 90 | sp: this, 91 | init: options.init, 92 | loadedHandler: options.loaded, 93 | doneHandler: options.done || function(err){ 94 | if(err) console.log(err); 95 | } 96 | }).start(); 97 | } 98 | //***-- end of API *** 99 | 100 | /** 101 | * load given url using Mikeal's Request object 102 | */ 103 | Scraper.prototype.__doLoad = function(url, options ){ 104 | 105 | var deferred = Q.defer(); 106 | 107 | options.url = url; 108 | options.headers || (options.headers = {"User-Agent" : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2'} ); 109 | options.strictSSL = false; 110 | options.followAllRedirects = true; 111 | options.timeout = options.timeout || this.options.timeout; 112 | if(this.options.proxy && options.proxy !== false){ 113 | var proxy = this.options.proxies[Math.floor( (Math.random() * this.options.proxies.length) + 1) - 1].trim(); 114 | if(this.options.proxy_auth){ 115 | proxy = 'http://' + this.options.proxy_auth + '@' + proxy; 116 | } 117 | options.proxy = proxy; 118 | } 119 | var sp = this; 120 | setTimeout(function(){ 121 | try{ 122 | sp.request(options, function(err, res, body) { 123 | var accept_codes = options.accept_codes || []; 124 | accept_codes.push(200); 125 | 126 | if(err || accept_codes.indexOf(res.statusCode) === 
-1){ 127 | deferred.reject(err || new Error("httpcode: " +res.statusCode)); 128 | }else{ 129 | //success 130 | if(options.plain_text){ 131 | //simply return body as text file 132 | deferred.resolve(body); 133 | }else{ 134 | //var cheerio = require('cheerio'); 135 | var $ = cheerio.load(body); 136 | 137 | //make cheerio object xpath-able 138 | _.extend($, require('./lib/xpath')); 139 | 140 | //resolve all relative links to absolute 141 | $('a').each(function(i, el){ 142 | var old = $(el).attr('href'); 143 | if(old && old.length && !old.contains('javascript') && !old.contains('mailto:') && old[0] !== '#'){ 144 | $(this).attr('href', urlUtil.resolve(url, old)) 145 | } 146 | 147 | }) 148 | $('form').each(function(i, el){ 149 | var old = $(el).attr('action'); 150 | if(old && old.length && !old.contains('javascript') && old[0] !== '#'){ 151 | $(this).attr('action', urlUtil.resolve(url, old)) 152 | } 153 | 154 | }) 155 | //if(res.statusCode !== 200) {console.log(res);process.exit();} 156 | 157 | deferred.resolve($); 158 | } 159 | } 160 | }); 161 | }catch(err){ 162 | deferred.reject(err); 163 | } 164 | }, this.options.delay) 165 | 166 | return deferred.promise; 167 | 168 | } 169 | Scraper.prototype.__loadProxies = function(){ 170 | if(!this.options.proxy_file || !fs.existsSync(this.options.proxy_file)) return; 171 | //reset proxies 172 | this.options.proxies = []; 173 | var self = this; 174 | fs.readFileSync(this.options.proxy_file).toString().split("\n") 175 | .forEach(function(proxy){ 176 | self.options.proxies.push(proxy.trim()); 177 | }) 178 | 179 | } 180 | 181 | 182 | -------------------------------------------------------------------------------- /lib/common.js: -------------------------------------------------------------------------------- 1 | var crypto = require('crypto'), 2 | fs = require('fs'); 3 | 4 | String.prototype.contains = function(str){ 5 | return this.indexOf(str) !== -1; 6 | } 7 | 8 | String.prototype.subreg = function(reg){ 9 | var m = 
this.match(reg); 10 | if(m) return m[1]; else return ''; 11 | } 12 | 13 | String.prototype.sub = function(start, end){ 14 | start = start || ''; 15 | var iStart = (start)? this.indexOf(start) : 0; 16 | if(iStart === -1) return '';//not found 17 | iStart += start.length; //to get the next char 18 | 19 | var iTo = this.indexOf(end, iStart); 20 | if(iTo === -1) return '';//not found 21 | 22 | var res = this.substring(iStart, iTo); 23 | res = res || ''; 24 | return res; 25 | 26 | } 27 | 28 | 29 | var common = {}; 30 | 31 | common.md5 = function (str) { 32 | return crypto.createHash('md5').update(str).digest('hex') 33 | } 34 | common.mergeObjs = function(obj1, obj2) { 35 | obj1 = obj1 || {}; 36 | obj2 = obj2 || {}; 37 | var obj3 = {}; 38 | for (var attrname in obj2) obj3[attrname] = obj2[attrname]; 39 | for (var attrname in obj1) obj3[attrname] = obj1[attrname]; 40 | return obj3; 41 | } 42 | 43 | common.savecsv = function(obj, options){ 44 | var filePath = options.path; 45 | var s = options.s || ',', 46 | q = options.q || '"', 47 | e = options.e || '"'; 48 | 49 | var values = []; 50 | var keys = []; 51 | for(var k in obj){ 52 | var value = (obj[k] + "").trim().replace(q, e+q).replace('\r',''); 53 | var key = (k + "").trim().replace(q, e+q); 54 | values.push(q + value + q); 55 | keys.push(q+ key + q); 56 | 57 | } 58 | 59 | if(!fs.existsSync(filePath)){ 60 | //write the header 61 | fs.appendFileSync(filePath, keys.join(s) + "\r\n"); 62 | 63 | } 64 | fs.appendFileSync(filePath, values.join(s) + "\r\n"); 65 | 66 | } 67 | 68 | common.range = function (low, high, step) { 69 | // http://kevin.vanzonneveld.net 70 | // + original by: Waldo Malqui Silva 71 | // * example 1: range ( 0, 12 ); 72 | // * returns 1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 73 | // * example 2: range( 0, 100, 10 ); 74 | // * returns 2: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100] 75 | // * example 3: range( 'a', 'i' ); 76 | // * returns 3: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'] 77 | // * 
example 4: range( 'c', 'a' ); 78 | // * returns 4: ['c', 'b', 'a'] 79 | var matrix = []; 80 | var inival, endval, plus; 81 | var walker = step || 1; 82 | var chars = false; 83 | 84 | if (!isNaN(low) && !isNaN(high)) { 85 | inival = low; 86 | endval = high; 87 | } else if (isNaN(low) && isNaN(high)) { 88 | chars = true; 89 | inival = low.charCodeAt(0); 90 | endval = high.charCodeAt(0); 91 | } else { 92 | inival = (isNaN(low) ? 0 : low); 93 | endval = (isNaN(high) ? 0 : high); 94 | } 95 | 96 | plus = ((inival > endval) ? false : true); 97 | if (plus) { 98 | while (inival <= endval) { 99 | matrix.push(((chars) ? String.fromCharCode(inival) : inival)); 100 | inival += walker; 101 | } 102 | } else { 103 | while (inival >= endval) { 104 | matrix.push(((chars) ? String.fromCharCode(inival) : inival)); 105 | inival -= walker; 106 | } 107 | } 108 | 109 | return matrix; 110 | } 111 | 112 | module.exports = common; 113 | -------------------------------------------------------------------------------- /lib/pager.js: -------------------------------------------------------------------------------- 1 | module.exports.create = function(options){ 2 | return new Pager(options); 3 | } 4 | 5 | function Pager(options){ 6 | this.sp = options.sp; 7 | this.init = options.init; 8 | this.loadedHandler = options.loadedHandler; 9 | this.doneHandler = options.doneHandler; 10 | this.page = 0; 11 | 12 | } 13 | 14 | Pager.prototype.process = function(options){ 15 | var pager = this; 16 | 17 | if(!options) return pager.doneHandler(); 18 | if(typeof options === 'string') options = {url: options}; 19 | if(!options.url) return pager.doneHandler(); 20 | 21 | pager.sp.load(options.url, options) 22 | .then(function($){ 23 | pager.page++; 24 | $.stats = {page: pager.page}; 25 | 26 | //success, call the data handler and wait for next action 27 | var next = function(options){ 28 | pager.process(options); 29 | } 30 | return pager.loadedHandler($, next); 31 | }) 32 | .fail(function(err){ 33 | return 
pager.doneHandler(err); 34 | }) 35 | } 36 | 37 | Pager.prototype.start = function(){ 38 | this.process(this.init); 39 | } 40 | -------------------------------------------------------------------------------- /lib/xpath.js: -------------------------------------------------------------------------------- 1 | /* 2 | provide xpath functions for cheerio object 3 | */ 4 | 5 | var xpath = require('xpath') 6 | , dom = require('xmldom').DOMParser 7 | 8 | 9 | function x(strXpath, node){ 10 | var xpath = require('xpath'); 11 | var nodes = xpath.select(strXpath, node); 12 | if(nodes.length > 0){ 13 | return nodes[0].textContent || ""; 14 | }else{ 15 | return ""; 16 | } 17 | } 18 | var q = function(strXpath, node){ 19 | var nodes = xpath.select(strXpath, node) || []; 20 | 21 | nodes.forEach(function(node){ 22 | //adding function for each node 23 | node.x = function(strXpath){ 24 | return x(strXpath, this); 25 | } 26 | 27 | node.q = function(strXpath){ 28 | return q(strXpath, this); 29 | } 30 | }); 31 | 32 | //adding function to join nodes' values 33 | nodes.join = function(sep){ 34 | var arr = []; 35 | nodes.forEach(function(node){ 36 | arr.push(node.textContent.trim()); 37 | }); 38 | return arr.join(sep); 39 | } 40 | 41 | return nodes; 42 | } 43 | 44 | exports.q = function(xp){ 45 | var $ = this; 46 | 47 | var doc = new dom(({ errorHandler:{warning:function(err){},error:function(err){},fatalError:function(err){}}})) 48 | .parseFromString($.html()); 49 | 50 | return q(xp, doc); 51 | } 52 | 53 | exports.x = function(xp){ 54 | var $ = this; 55 | 56 | var doc = new dom(({ errorHandler:{warning:function(err){},error:function(err){},fatalError:function(err){}}})) 57 | .parseFromString($.html()); 58 | 59 | return x(xp, doc); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scrapejs", 3 | "description": "a web scraping 
framework for node.js", 4 | "author": "Cung Nguyen ", 5 | 6 | "tags": [ 7 | "web scraping", 8 | "web scraper", 9 | "spider", 10 | "crawler", 11 | "bot" 12 | ], 13 | "version": "1.0.0", 14 | "repository": { 15 | "type": "git", 16 | "url": "https://github.com/cungnv/scrapejs.git" 17 | }, 18 | "main": "./index.js", 19 | "engines": { 20 | "node": ">= 0.10" 21 | }, 22 | "dependencies": { 23 | "request": "latest", 24 | "cheerio": "latest", 25 | "async": "latest", 26 | "q": "latest", 27 | "underscore": "latest", 28 | "xpath": "latest", 29 | "xmldom": "latest" 30 | } 31 | } 32 | --------------------------------------------------------------------------------