├── .gitignore
├── LICENSE
├── README.md
├── index.js
└── package.json


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | test*
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Thibaut Séguy <thibaut.seguy@gmail.com>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | google-search-scraper
 2 | =============
 3 | ### Google search scraper with captcha solving support
 4 | 
 5 | This module allows google search results extraction in a simple yet flexible way, and handles captcha solving transparently (through external services or your own hand-made solver).
 6 | 
 7 | Out of the box you can target a specific google search host, specify a language and limit search results returned. Extending these defaults with custom URL params is supported through options.
 8 | 
  9 | A word of warning: This code is intended for educational and research use only. Use responsibly.
10 | 
11 | 
12 | Installation
13 | ------------
14 | 
15 | ``` bash
16 | $ npm install google-search-scraper
17 | ```
18 | 
19 | 
20 | Examples
21 | --------
22 | 
23 | Grab first 10 results for 'nodejs'
24 | 
25 | ``` javascript
26 | var scraper = require('google-search-scraper');
27 | 
28 | var options = {
29 |   query: 'nodejs',
30 |   limit: 10
31 | };
32 | 
33 | scraper.search(options, function(err, url, meta) {
34 |   // This is called for each result
35 |   if(err) throw err;
36 |   console.log(url);
37 |   console.log(meta.title);
38 |   console.log(meta.meta);
39 |   console.log(meta.desc)
40 | });
41 | ``` 
42 | 
43 | Various options combined
44 | 
45 | ``` javascript
46 | var scraper = require('google-search-scraper');
47 | 
48 | var options = {
49 |   query: 'grenouille',
50 |   host: 'www.google.fr',
51 |   lang: 'fr',
52 |   age: 'd1', // last 24 hours ([hdwmy]\d? as in google URL)
53 |   limit: 10,
54 |   params: {} // params will be copied as-is in the search URL query string
55 | };
56 | 
57 | scraper.search(options, function(err, url) {
58 |   // This is called for each result
59 |   if(err) throw err;
60 |   console.log(url)
61 | });
62 | ```
63 | 
64 | Extract all results on edu sites for "information theory" and solve captchas along the way
65 | 
66 | ``` javascript
67 | var scraper = require('google-search-scraper');
68 | var DeathByCaptcha = require('deathbycaptcha');
69 | 
70 | var dbc = new DeathByCaptcha('username', 'password');
71 | 
72 | var options = {
73 |   query: 'site:edu "information theory"',
74 |   age: 'y', // less than a year,
75 |   solver: dbc
76 | };
77 | 
78 | scraper.search(options, function(err, url) {
79 |   // This is called for each result
80 |   if(err) throw err;
81 |   console.log(url)
82 | });
83 | ```
84 | 
 85 | You can easily plug in your own solver by implementing a `solve` method with the following signature:
86 | 
87 | ```javascript
88 | var customSolver = {
89 |   solve: function(imageData, callback) {
90 |     // Do something with image data, like displaying it to the user
 91 |     // id is used by DBC (DeathByCaptcha) to allow reporting solving errors and can be safely ignored here
92 |     var id = null; 
93 |     callback(err, id, solutionText);
94 |   }
95 | };
96 | ``` 
97 | 
98 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
  1 | var request = require('request');
  2 | var cheerio = require('cheerio');
  3 | var url     = require('url');
  4 | 
  5 | function search(options, callback) {
  6 | 
  7 |   var session = request.defaults({ jar : true });
  8 |   var host = options.host || 'www.google.com';
  9 |   var solver = options.solver;
 10 |   var params = options.params || {};
 11 |   var results = [];
 12 | 
 13 |   params.hl = params.hl || options.lang || 'en';
 14 | 
 15 |   if(options.age) params.tbs = 'qdr:' + options.age;
 16 |   if(options.query) params.q = options.query;
 17 | 
 18 |   params.start = params.start || 0;
 19 | 
 20 |   getPage(params, function onPage(err, body) {
 21 |     if(err) {
 22 |       if(err.code !== 'ECAPTCHA' || !solver) return callback(err);
 23 | 
 24 |       solveCaptcha(err.location, function(err, page) {
 25 |         if(err) return callback(err);
 26 |         onPage(null, page);
 27 |       });
 28 | 
 29 |       return;
 30 |     }
 31 | 
 32 |     var currentResults = extractResults(body);
 33 | 
 34 |     var newResults = currentResults.filter(function(result) {
 35 |       return results.indexOf(result) === -1;
 36 |     });
 37 | 
 38 |     newResults.forEach(function(result) {
 39 |       callback(null, result['url'], result);
 40 |     });
 41 | 
 42 |     if(newResults.length === 0) {
 43 |       return;
 44 |     }
 45 | 
 46 |     results = results.concat(newResults);
 47 | 
 48 |     if(!options.limit || results.length < options.limit) {
 49 |       params.start = results.length;
 50 |       getPage(params, onPage);
 51 |     }
 52 |   });
 53 | 
 54 | 
 55 |   function getPage(params, callback) {
 56 |     session.get({
 57 |         uri: 'https://' + host + '/search',
 58 |         qs: params,
 59 |         followRedirect: false
 60 |       }, 
 61 |       function(err, res) {
 62 |         if(err) return callback(err);
 63 | 
 64 |         if(res.statusCode === 302) {
 65 |           var parsed = url.parse(res.headers.location, true);
 66 | 
 67 |           if(parsed.pathname !== '/search') {
 68 |             var err = new Error('Captcha');
 69 |             err.code = 'ECAPTCHA';
 70 |             err.location = res.headers.location;
 71 |             this.abort();
 72 |             return callback(err);
 73 |           } else {
 74 |             session.get({
 75 |               uri: res.headers.location,
 76 |               qs: params,
 77 |               followRedirect: false
 78 |             }, function(err, res) {
 79 |               if(err) return callback(err);
 80 |               callback(null, res.body);
 81 |             });
 82 |             return;
 83 |           }
 84 |         }
 85 | 
 86 |         callback(null, res.body);
 87 |       }
 88 |     );
 89 |   }
 90 | 
 91 |   function extractResults(body) {
 92 |     var results = [];
 93 |     var $ = cheerio.load(body);
 94 | 
 95 |     $('#search .g').each(function(i, elem) {
 96 |       var item = {};
 97 | 
 98 |       var elemUrl = $(this).find("h3 a");
 99 |       var elemMeta = $(this).find(".slp");
100 |       var elemDesc = $(this).find(".st");
101 |       var parsedUrl = url.parse(elemUrl.attr("href"), true);
102 |       if (parsedUrl.pathname === '/url') {
103 |         item['url'] = parsedUrl.query.q;
104 |       }
105 |       item['title'] = elemUrl.text();
106 |       item['meta'] = elemMeta.text();
107 |       item['desc'] = elemDesc.text();
108 | 
109 |       results.push(item);
110 |     });    
111 | 
112 |     return results;
113 |   }
114 | 
115 |   function solveCaptcha(captchaUrl, callback) {
116 | 
117 |     var tmp = url.parse(captchaUrl);
118 |     var baseUrl = url.format({
119 |       protocol: tmp.protocol,
120 |       hostname: tmp.host,
121 |     });
122 | 
123 |     // Fetch captcha page
124 |     session.get(captchaUrl, function(err, res) {
125 |       if(err) return callback(err);
126 | 
127 |       var $ = cheerio.load(res.body);
128 |       var captchaId = $('input[name=id]').attr('value');
129 |       var continueUrl = $('input[name=continue]').attr('value');
130 |       var formAction = $('form').attr('action');
131 |       var imgSrc = $('img').attr('src');
132 | 
133 |       // Fetch captcha image
134 |       session.get({uri: baseUrl + imgSrc, encoding: null}, function(err, res) {
135 |         if(err) return callback(err);
136 | 
137 |         // Send to solver
138 |         solver.solve(res.body, function(err, id, solution) {
139 |           if(err) return callback(err);
140 | 
141 |           // Try solution
142 |           session.get({
143 |               uri: baseUrl + '/sorry/' + formAction,
144 |               qs: {
145 |                 id: captchaId,
146 |                 captcha: solution,
147 |                 continue: continueUrl
148 |               }
149 |             }, 
150 |             function(err, res) {
151 |               if(res.statusCode !== 200) return callback(new Error('Captcha decoding failed'));
152 |               callback(null, res.body);
153 |             }
154 |           );
155 | 
156 |         });
157 | 
158 |       });
159 | 
160 |     });
161 | 
162 |   }
163 | 
164 | }
165 | 
166 | module.exports.search = search;
167 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "google-search-scraper",
 3 |   "version": "0.1.0",
 4 |   "description": "Google search scraper with captcha solving support",
 5 |   "author": "thibauts",
 6 |   "license": "MIT",
 7 |   "main": "index.js",
 8 |   "dependencies": {
 9 |     "cheerio": "~0.13.1",
10 |     "request": "~2.33.0"
11 |   },
12 |   "repository": {
13 |     "type": "git",
14 |     "url": "git://github.com/thibauts/node-google-search-scraper.git"
15 |   },
16 |   "keywords": [
17 |     "google",
18 |     "search",
19 |     "scraper",
20 |     "decaptcher",
21 |     "captcha"
22 |   ]
23 | }
24 | 


--------------------------------------------------------------------------------