├── .gitignore ├── LICENSE ├── README.md ├── index.js └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | test* 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Thibaut Séguy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | google-search-scraper 2 | ============= 3 | ### Google search scraper with captcha solving support 4 | 5 | This module allows google search results extraction in a simple yet flexible way, and handles captcha solving transparently (through external services or your own hand-made solver). 6 | 7 | Out of the box you can target a specific google search host, specify a language and limit search results returned. Extending these defaults with custom URL params is supported through options. 8 | 9 | A word of warning: This code is intended for educational and research use only. Use responsibly. 10 | 11 | 12 | Installation 13 | ------------ 14 | 15 | ``` bash 16 | $ npm install google-search-scraper 17 | ``` 18 | 19 | 20 | Examples 21 | -------- 22 | 23 | Grab first 10 results for 'nodejs' 24 | 25 | ``` javascript 26 | var scraper = require('google-search-scraper'); 27 | 28 | var options = { 29 | query: 'nodejs', 30 | limit: 10 31 | }; 32 | 33 | scraper.search(options, function(err, url, meta) { 34 | // This is called for each result 35 | if(err) throw err; 36 | console.log(url); 37 | console.log(meta.title); 38 | console.log(meta.meta); 39 | console.log(meta.desc) 40 | }); 41 | ``` 42 | 43 | Various options combined 44 | 45 | ``` javascript 46 | var scraper = require('google-search-scraper'); 47 | 48 | var options = { 49 | query: 'grenouille', 50 | host: 'www.google.fr', 51 | lang: 'fr', 52 | age: 'd1', // last 24 hours ([hdwmy]\d? 
as in google URL) 53 | limit: 10, 54 | params: {} // params will be copied as-is in the search URL query string 55 | }; 56 | 57 | scraper.search(options, function(err, url) { 58 | // This is called for each result 59 | if(err) throw err; 60 | console.log(url) 61 | }); 62 | ``` 63 | 64 | Extract all results on edu sites for "information theory" and solve captchas along the way 65 | 66 | ``` javascript 67 | var scraper = require('google-search-scraper'); 68 | var DeathByCaptcha = require('deathbycaptcha'); 69 | 70 | var dbc = new DeathByCaptcha('username', 'password'); 71 | 72 | var options = { 73 | query: 'site:edu "information theory"', 74 | age: 'y', // less than a year, 75 | solver: dbc 76 | }; 77 | 78 | scraper.search(options, function(err, url) { 79 | // This is called for each result 80 | if(err) throw err; 81 | console.log(url) 82 | }); 83 | ``` 84 | 85 | You can easily plug your own solver, implementing a solve method with the following signature: 86 | 87 | ```javascript 88 | var customSolver = { 89 | solve: function(imageData, callback) { 90 | // Do something with image data, like displaying it to the user 91 | // id is used by BDC to allow reporting solving errors and can be safely ignored here 92 | var id = null; 93 | callback(err, id, solutionText); 94 | } 95 | }; 96 | ``` 97 | 98 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var request = require('request'); 2 | var cheerio = require('cheerio'); 3 | var url = require('url'); 4 | 5 | function search(options, callback) { 6 | 7 | var session = request.defaults({ jar : true }); 8 | var host = options.host || 'www.google.com'; 9 | var solver = options.solver; 10 | var params = options.params || {}; 11 | var results = []; 12 | 13 | params.hl = params.hl || options.lang || 'en'; 14 | 15 | if(options.age) params.tbs = 'qdr:' + options.age; 16 | if(options.query) params.q = 
options.query; 17 | 18 | params.start = params.start || 0; 19 | 20 | getPage(params, function onPage(err, body) { 21 | if(err) { 22 | if(err.code !== 'ECAPTCHA' || !solver) return callback(err); 23 | 24 | solveCaptcha(err.location, function(err, page) { 25 | if(err) return callback(err); 26 | onPage(null, page); 27 | }); 28 | 29 | return; 30 | } 31 | 32 | var currentResults = extractResults(body); 33 | 34 | var newResults = currentResults.filter(function(result) { 35 | return results.indexOf(result) === -1; 36 | }); 37 | 38 | newResults.forEach(function(result) { 39 | callback(null, result['url'], result); 40 | }); 41 | 42 | if(newResults.length === 0) { 43 | return; 44 | } 45 | 46 | results = results.concat(newResults); 47 | 48 | if(!options.limit || results.length < options.limit) { 49 | params.start = results.length; 50 | getPage(params, onPage); 51 | } 52 | }); 53 | 54 | 55 | function getPage(params, callback) { 56 | session.get({ 57 | uri: 'https://' + host + '/search', 58 | qs: params, 59 | followRedirect: false 60 | }, 61 | function(err, res) { 62 | if(err) return callback(err); 63 | 64 | if(res.statusCode === 302) { 65 | var parsed = url.parse(res.headers.location, true); 66 | 67 | if(parsed.pathname !== '/search') { 68 | var err = new Error('Captcha'); 69 | err.code = 'ECAPTCHA'; 70 | err.location = res.headers.location; 71 | this.abort(); 72 | return callback(err); 73 | } else { 74 | session.get({ 75 | uri: res.headers.location, 76 | qs: params, 77 | followRedirect: false 78 | }, function(err, res) { 79 | if(err) return callback(err); 80 | callback(null, res.body); 81 | }); 82 | return; 83 | } 84 | } 85 | 86 | callback(null, res.body); 87 | } 88 | ); 89 | } 90 | 91 | function extractResults(body) { 92 | var results = []; 93 | var $ = cheerio.load(body); 94 | 95 | $('#search .g').each(function(i, elem) { 96 | var item = {}; 97 | 98 | var elemUrl = $(this).find("h3 a"); 99 | var elemMeta = $(this).find(".slp"); 100 | var elemDesc = $(this).find(".st"); 101 
| var parsedUrl = url.parse(elemUrl.attr("href"), true); 102 | if (parsedUrl.pathname === '/url') { 103 | item['url'] = parsedUrl.query.q; 104 | } 105 | item['title'] = elemUrl.text(); 106 | item['meta'] = elemMeta.text(); 107 | item['desc'] = elemDesc.text(); 108 | 109 | results.push(item); 110 | }); 111 | 112 | return results; 113 | } 114 | 115 | function solveCaptcha(captchaUrl, callback) { 116 | 117 | var tmp = url.parse(captchaUrl); 118 | var baseUrl = url.format({ 119 | protocol: tmp.protocol, 120 | hostname: tmp.host, 121 | }); 122 | 123 | // Fetch captcha page 124 | session.get(captchaUrl, function(err, res) { 125 | if(err) return callback(err); 126 | 127 | var $ = cheerio.load(res.body); 128 | var captchaId = $('input[name=id]').attr('value'); 129 | var continueUrl = $('input[name=continue]').attr('value'); 130 | var formAction = $('form').attr('action'); 131 | var imgSrc = $('img').attr('src'); 132 | 133 | // Fetch captcha image 134 | session.get({uri: baseUrl + imgSrc, encoding: null}, function(err, res) { 135 | if(err) return callback(err); 136 | 137 | // Send to solver 138 | solver.solve(res.body, function(err, id, solution) { 139 | if(err) return callback(err); 140 | 141 | // Try solution 142 | session.get({ 143 | uri: baseUrl + '/sorry/' + formAction, 144 | qs: { 145 | id: captchaId, 146 | captcha: solution, 147 | continue: continueUrl 148 | } 149 | }, 150 | function(err, res) { 151 | if(res.statusCode !== 200) return callback(new Error('Captcha decoding failed')); 152 | callback(null, res.body); 153 | } 154 | ); 155 | 156 | }); 157 | 158 | }); 159 | 160 | }); 161 | 162 | } 163 | 164 | } 165 | 166 | module.exports.search = search; 167 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "google-search-scraper", 3 | "version": "0.1.0", 4 | "description": "Google search scraper with captcha solving support", 5 
| "author": "thibauts", 6 | "license": "MIT", 7 | "main": "index.js", 8 | "dependencies": { 9 | "cheerio": "~0.13.1", 10 | "request": "~2.33.0" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "git://github.com/thibauts/node-google-search-scraper.git" 15 | }, 16 | "keywords": [ 17 | "google", 18 | "search", 19 | "scraper", 20 | "decaptcher", 21 | "captcha" 22 | ] 23 | } 24 | --------------------------------------------------------------------------------