├── .gitignore ├── test ├── mocha.opts └── google.test.js ├── .travis.yml ├── package.json ├── LICENSE ├── CHANGELOG.md ├── lib └── google.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ -------------------------------------------------------------------------------- /test/mocha.opts: -------------------------------------------------------------------------------- 1 | --reporter spec 2 | --ui bdd 3 | --growl 4 | --timeout 40000 -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - 0.10 4 | - 0.12 5 | - iojs 6 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "google", 3 | "version": "2.1.0", 4 | "description": "A module to search and scrape google. 
This is not sponsored, supported, or affiliated with Google Inc.", 5 | "homepage": "https://github.com/jprichardson/node-google", 6 | "repository": { 7 | "type": "git", 8 | "url": "https://github.com/jprichardson/node-google" 9 | }, 10 | "keywords": [ 11 | "google", 12 | "search", 13 | "scrape", 14 | "scraper", 15 | "screen" 16 | ], 17 | "author": "JP Richardson ", 18 | "license": "MIT", 19 | "dependencies": { 20 | "cheerio": "^0.19.0", 21 | "request": "^2.54.0" 22 | }, 23 | "devDependencies": { 24 | "mocha": "*", 25 | "standard": "5.x" 26 | }, 27 | "main": "./lib/google", 28 | "scripts": { 29 | "test": "standard && mocha" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | (The MIT License) 3 | 4 | Copyright (c) 2012-2013 JP Richardson 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files 7 | (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, 8 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 14 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS 15 | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 16 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2.1.0 / 2016-08-11 2 | ------------------ 3 | ### Added 4 | - `https` support 5 | 6 | 2.0.0 / 2016-03-09 7 | ------------------ 8 | - provide links, next and more attributes as a single response object, see: https://github.com/jprichardson/node-google/pull/38 9 | 10 | 1.5.0 / 2016-02-22 11 | ------------------ 12 | - use https, see: https://github.com/jprichardson/node-google/pull/35 13 | 14 | 1.4.0 / 2015-01-12 15 | ----------------- 16 | - updated for new Google HTML 17 | 18 | 1.3.0 / 2015-07-03 19 | ------------------ 20 | - allowed start parameter now, see: https://github.com/jprichardson/node-google/pull/29 21 | 22 | 1.2.0 / 2015-05-14 23 | ------------------ 24 | - added `timeSpan` option: https://github.com/jprichardson/node-google/pull/26#issuecomment-101983266 25 | 26 | 1.1.0 / 2015-04-05 27 | ------------------ 28 | - added option `nextText` for languages other than English https://github.com/jprichardson/node-google/pull/25 29 | 30 | 1.0.0 / 2015-03-24 31 | ------------------ 32 | - exposed all of `request` options https://github.com/jprichardson/node-google/pull/21 33 | - extracted out CLI interface 34 | - using [JavaScript Standard Style](https://github.com/feross/standard) 35 | - upgrade `request` from `2.12.x` to `^2.54.0` 36 | - upgrade `cheerio` from `0.10.8` to `^0.19.0` 37 | 38 | 0.6.0 / 2014-10-23 39 | ------------------ 40 | - added `proxy` field https://github.com/jprichardson/node-google/pull/16 41 | 42 | 0.5.0 / 2014-09-30 43 | ------------------ 44 | - bugfix: CSV quoting in the command line program. https://github.com/jprichardson/node-google/pull/12 45 | 46 | 0.4.0 / 2014-09-29 47 | ------------------ 48 | - add `tld` and `language` selection. 
https://github.com/jprichardson/node-google/pull/11 49 | 50 | 0.3.4 / 2014-07-28 51 | ------------------ 52 | * bugix: when `resp` is `undefined/null` calling `resp.statuCode` is another error [#9](https://github.com/jprichardson/node-google/pull/9) 53 | 54 | 0.3.3 / 2014-07-07 55 | ------------------ 56 | * bugfix: search result titles [#8](https://github.com/jprichardson/node-google/pull/8) 57 | 58 | 0.3.2 / 2013-08-14 59 | ------------------ 60 | * add debugging for errors 61 | 62 | 0.3.1 / 2013-06-21 63 | ------------------ 64 | * Fixed bug that had the descriptions showing "Cached" at the start. @cancio [#4] 65 | 66 | 0.3.0 / 2013-03-29 67 | ------------------ 68 | * upgrade to `cheerio 0.10.8` 69 | * fixed element traversal bug: see https://github.com/MatthewMueller/cheerio/issues/167 70 | 71 | 0.2.0 / 2013-01-22 72 | ------------------ 73 | * Fixed Cheerio dependency bug. Stuck at version 0.10.4 74 | * Aliased `link.link` to `link.href`. 75 | * Made command line `google` program. 76 | 77 | 0.1.0 / 2012-10-03 78 | ------------------ 79 | * Added `resultsPerPage`. 80 | * Upgraded `cheerio` dep. 81 | 82 | 0.0.1 / 2012-07-10 83 | ------------------ 84 | * Inital release. 85 | -------------------------------------------------------------------------------- /lib/google.js: -------------------------------------------------------------------------------- 1 | var request = require('request') 2 | var cheerio = require('cheerio') 3 | var querystring = require('querystring') 4 | var util = require('util') 5 | 6 | var linkSel = 'h3.r a' 7 | var descSel = 'div.s' 8 | var itemSel = 'div.g' 9 | var nextSel = 'td.b a span' 10 | 11 | var URL = '%s://www.google.%s/search?hl=%s&q=%s&start=%s&sa=N&num=%s&ie=UTF-8&oe=UTF-8&gws_rd=ssl' 12 | 13 | var nextTextErrorMsg = 'Translate `google.nextText` option to selected language to detect next results link.' 14 | var protocolErrorMsg = "Protocol `google.protocol` needs to be set to either 'http' or 'https', please use a valid protocol. 
Setting the protocol to 'https'." 15 | 16 | // start parameter is optional 17 | function google (query, start, callback) { 18 | var startIndex = 0 19 | if (typeof callback === 'undefined') { 20 | callback = start 21 | } else { 22 | startIndex = start 23 | } 24 | igoogle(query, startIndex, callback) 25 | } 26 | 27 | google.resultsPerPage = 10 28 | google.tld = 'com' 29 | google.lang = 'en' 30 | google.requestOptions = {} 31 | google.nextText = 'Next' 32 | google.protocol = 'https' 33 | 34 | var igoogle = function (query, start, callback) { 35 | if (google.resultsPerPage > 100) google.resultsPerPage = 100 // Google won't allow greater than 100 anyway 36 | if (google.lang !== 'en' && google.nextText === 'Next') console.warn(nextTextErrorMsg) 37 | if (google.protocol !== 'http' && google.protocol !== 'https') { 38 | google.protocol = 'https' 39 | console.warn(protocolErrorMsg) 40 | } 41 | 42 | // timeframe is optional. splice in if set 43 | if (google.timeSpan) { 44 | URL = URL.indexOf('tbs=qdr:') >= 0 ? 
URL.replace(/tbs=qdr:[snhdwmy]\d*/, 'tbs=qdr:' + google.timeSpan) : URL.concat('&tbs=qdr:', google.timeSpan) 45 | } 46 | var newUrl = util.format(URL, google.protocol, google.tld, google.lang, querystring.escape(query), start, google.resultsPerPage) 47 | var requestOptions = { 48 | url: newUrl, 49 | method: 'GET' 50 | } 51 | 52 | for (var k in google.requestOptions) { 53 | requestOptions[k] = google.requestOptions[k] 54 | } 55 | 56 | request(requestOptions, function (err, resp, body) { 57 | if ((err == null) && resp.statusCode === 200) { 58 | var $ = cheerio.load(body) 59 | var res = { 60 | url: newUrl, 61 | query: query, 62 | start: start, 63 | links: [], 64 | $: $, 65 | body: body 66 | } 67 | 68 | $(itemSel).each(function (i, elem) { 69 | var linkElem = $(elem).find(linkSel) 70 | var descElem = $(elem).find(descSel) 71 | var item = { 72 | title: $(linkElem).first().text(), 73 | link: null, 74 | description: null, 75 | href: null 76 | } 77 | var qsObj = querystring.parse($(linkElem).attr('href')) 78 | 79 | if (qsObj['/url?q']) { 80 | item.link = qsObj['/url?q'] 81 | item.href = item.link 82 | } 83 | 84 | $(descElem).find('div').remove() 85 | item.description = $(descElem).text() 86 | 87 | res.links.push(item) 88 | }) 89 | 90 | if ($(nextSel).last().text() === google.nextText) { 91 | res.next = function () { 92 | igoogle(query, start + google.resultsPerPage, callback) 93 | } 94 | } 95 | 96 | callback(null, res) 97 | } else { 98 | callback(new Error('Error on response' + (resp ? 
' (' + resp.statusCode + ')' : '') + ':' + err + ' : ' + body), null, null) 99 | } 100 | }) 101 | } 102 | 103 | module.exports = google 104 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Node.js - google 2 | ===================== 3 | 4 | [![build status](https://secure.travis-ci.org/jprichardson/node-google.svg)](http://travis-ci.org/jprichardson/node-google) 5 | 6 | This module allows you to search google by scraping the results. It does NOT use the Google Search API. **PLEASE DO NOT ABUSE THIS.** The intent of using this is convenience vs the cruft that exists in the Google Search API. 7 | 8 | This is not sponsored, supported, or affiliated with Google Inc. 9 | 10 | Please **do not** post an issue, email me, tweet me, or in anyway contact me about getting around Google blocking 11 | your automated search requests. These sorts of requests are outside the scope of this module. **Google has every 12 | right to block consumers of their service for any reason.** See: [#27](https://github.com/jprichardson/node-google/issues/27), 13 | [#20](https://github.com/jprichardson/node-google/issues/20#issuecomment-74289023). 14 | 15 | [![js-standard-style](https://raw.githubusercontent.com/feross/standard/master/badge.png)](https://github.com/feross/standard) 16 | 17 | 18 | Installation 19 | ------------ 20 | 21 | npm install --save google 22 | 23 | 24 | 25 | API Example 26 | ------- 27 | 28 | This prints out the first 100 search results of the query `node.js best practices`. 
29 | 30 | ```js 31 | var google = require('google') 32 | 33 | google.resultsPerPage = 25 34 | var nextCounter = 0 35 | 36 | google('node.js best practices', function (err, res){ 37 | if (err) console.error(err) 38 | 39 | for (var i = 0; i < res.links.length; ++i) { 40 | var link = res.links[i]; 41 | console.log(link.title + ' - ' + link.href) 42 | console.log(link.description + "\n") 43 | } 44 | 45 | if (nextCounter < 4) { 46 | nextCounter += 1 47 | if (res.next) res.next() 48 | } 49 | }) 50 | ``` 51 | 52 | 53 | ### Search Within a Time Span 54 | 55 | You can specify results in a specific timeframe. Working values listed below: 56 | 57 | ```js 58 | var google = require('google') 59 | 60 | // assign one of the values below. Nothing is set by default. 61 | google.timeSpan = 'h' // information indexed in the past hour 62 | google.timeSpan = 'd' // information indexed in the past day 63 | google.timeSpan = 'w' // information indexed in the past week 64 | google.timeSpan = 'm' // information indexed in the past month 65 | google.timeSpan = 'y' // information indexed in the past year 66 | ``` 67 | 68 | 69 | ### Search Within Different Languages 70 | 71 | You can also specify the TLD of the Google search page and the language. 72 | If you change the language you must translate the next page results text to detect the corresponding link. 73 | 74 | ```js 75 | var google = require('google') 76 | 77 | google.lang = 'de' 78 | google.tld = 'de' 79 | google.nextText = 'Weiter' 80 | 81 | google('node.js best practices', function (err, res){ 82 | … 83 | }) 84 | ``` 85 | 86 | 87 | ### Set Request Options 88 | 89 | You can specify the options to be passed to request, see the [request module](https://github.com/request/request) for all available options. 
90 | 91 | ```js 92 | var google = require('google') 93 | 94 | google.requestOptions = { 95 | proxy: 'http://user:password@192.168.5.4:80', 96 | timeout: 30000, 97 | localAddress: '127.0.0.1', 98 | jar: true, 99 | headers: { 100 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 101 | 'Accept-Encoding': 'gzip, deflate', 102 | 'Accept-Language': 'en;q=0.5', 103 | 'Cache-Control': 'max-age=0', 104 | 'Connection': 'keep-alive', 105 | 'DNT': 1 106 | } 107 | } 108 | 109 | google('node.js best practices', function (err, res){ 110 | … 111 | }) 112 | ``` 113 | 114 | ### Setting the protocol 115 | 116 | The protocol, either 'http' or 'https', can be set on the google object before running queries. For example, specifying 'http' will search Google using an http://www.google.com query, whereas 'https' will use an https://www.google.com query. If no protocol is specified, or any protocol other than 'http' or 'https' is set, it will be set to 'https' by default. 
117 | 118 | ```js 119 | var google = require('google') 120 | 121 | google.protocol = 'http' // searches google using http://www.google.com 122 | google.protocol = 'https' // searches google using https://www.google.com 123 | 124 | google('node.js best practices', function (err, res){ 125 | … 126 | }) 127 | ``` 128 | 129 | The response object 130 | ------- 131 | 132 | The provided callback will receive a response object as second argument, it has these properties: 133 | 134 | - `url`: The URL requested from Google for this search and page 135 | - `query`: The search provided on this call 136 | - `start`: The index of the first link across the links of all pages 137 | - `links`: An array with all the link objects 138 | - `body`: The HTML of the loaded page 139 | - `next`: A method that invokes the originally specified callback with next page results 140 | - `$`: A cheerio instance of the loaded page 141 | 142 | Updating from 1.x 143 | ------- 144 | 145 | The only backwards-incompatible change from 1.x is that the callback received 3 arguments: 146 | ```js 147 | google('...', function (err, next, links) { 148 | links.forEach(function(link) { ... }) 149 | if (next) next() 150 | }) 151 | ``` 152 | 153 | And it now receives a single `res` object. The above code should be rewritten to: 154 | ```js 155 | google('...', function (err, res) { 156 | res.links.forEach(function(link) { ... }) 157 | if (res.next) res.next() 158 | }) 159 | ``` 160 | 161 | License 162 | ------- 163 | 164 | Licensed under MIT. See `LICENSE` for more details. 
165 | 166 | Copyright (c) 2012-2016 JP Richardson 167 | -------------------------------------------------------------------------------- /test/google.test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | var google = require('../lib/google') 3 | 4 | /* global describe, it */ 5 | 6 | describe('+ google()', function () { 7 | it('should return search results', function (done) { 8 | var nextCounter = 0 9 | var allLinks = [] 10 | var query = 'Microsoft' 11 | 12 | var finished = function () { 13 | assert(allLinks.length > 20) 14 | var flags = 0x0 15 | for (var i = 0; i < allLinks.length; ++i) { 16 | var link = allLinks[i] 17 | // console.dir(link) 18 | if (link.title && link.link) { 19 | if (link.title.indexOf('Wikipedia')) { 20 | flags |= 0x1 21 | } 22 | if (link.link.indexOf('microsoft.com')) { 23 | flags |= 0x2 24 | } 25 | if (link.link.indexOf('twitter.com/Microsoft')) { 26 | flags |= 0x4 27 | } 28 | if (link.title.indexOf('Microsoft Corporation')) { 29 | flags |= 0x8 30 | } 31 | if (link.title.indexOf('Microsoft Store')) { 32 | flags |= 0x10 33 | } 34 | } 35 | 36 | assert.equal(link.description.indexOf('Cached'), -1) 37 | } 38 | 39 | // console.log(flags) 40 | assert.equal(flags, 31) // all flags above set properly 41 | 42 | done() 43 | } 44 | 45 | google(query, function (err, res) { 46 | assert.ifError(err) 47 | assert.equal(res.query, 'Microsoft') 48 | assert.equal(typeof res.$, 'function') 49 | assert.equal(typeof res.body, 'string') 50 | assert.equal(typeof res.url, 'string') 51 | assert.equal(res.start, nextCounter * google.resultsPerPage) 52 | assert.equal(typeof res.links, 'object') 53 | assert.ok(res.links.length <= google.resultsPerPage) 54 | // console.log('L: ' + links.length) 55 | allLinks = allLinks.concat(res.links) 56 | if (nextCounter < 2) { 57 | if (res.next) { 58 | nextCounter += 1 59 | res.next() 60 | } else { 61 | finished() 62 | } 63 | } else { 64 | finished() 65 | } 66 | 
}) 67 | }) 68 | 69 | describe('when resultsPerPage is set', function () { 70 | it('should return search results', function (done) { 71 | var allLinks = [] 72 | var query = 'Microsoft' 73 | 74 | var finished = function () { 75 | assert(allLinks.length > 90) 76 | done() 77 | } 78 | 79 | google.resultsPerPage = 100 80 | google(query, function (err, res) { 81 | assert.ifError(err) 82 | assert.equal(res.query, 'Microsoft') 83 | assert.equal(typeof res.$, 'function') 84 | assert.equal(typeof res.body, 'string') 85 | assert.equal(typeof res.url, 'string') 86 | assert.equal(res.start, 0) 87 | assert.equal(typeof res.links, 'object') 88 | assert.ok(res.links.length <= google.resultsPerPage) 89 | allLinks = allLinks.concat(res.links) 90 | // console.log(allLinks.length) 91 | finished() 92 | }) 93 | }) 94 | }) 95 | 96 | describe('when timeSpan is set', function () { 97 | it('each time-based query should return search results', function (done) { 98 | var allLinks = [] 99 | var query = 'Microsoft' 100 | var timeFrame = 'm' 101 | 102 | var finished = function () { 103 | // TODO: investigate why it's only 8 and not 10 here 104 | assert(allLinks.length === 8) 105 | done() 106 | } 107 | 108 | google.resultsPerPage = 10 109 | google.timeSpan = timeFrame 110 | google(query, function (err, res) { 111 | assert.ifError(err) 112 | assert.equal(res.query, 'Microsoft') 113 | assert.equal(typeof res.$, 'function') 114 | assert.equal(typeof res.body, 'string') 115 | assert.equal(typeof res.url, 'string') 116 | assert.equal(res.start, 0) 117 | assert.equal(typeof res.links, 'object') 118 | assert.ok(res.links.length <= google.resultsPerPage) 119 | allLinks = allLinks.concat(res.links) 120 | finished() 121 | }) 122 | }) 123 | }) 124 | 125 | describe('when nextText and lang are set', function () { 126 | it('should return next page search results', function (done) { 127 | var nextCounter = 0 128 | var allLinks = [] 129 | var query = 'Microsoft' 130 | 131 | var finished = function () { 132 | 
assert(allLinks.length > 25) 133 | done() 134 | } 135 | 136 | google.resultsPerPage = 25 137 | google.lang = 'it' 138 | google.nextText = 'Avanti' 139 | google(query, function (err, res) { 140 | assert.ifError(err) 141 | allLinks = allLinks.concat(res.links) 142 | if (nextCounter < 2) { 143 | if (res.next) { 144 | nextCounter += 1 145 | res.next() 146 | } else { 147 | finished() 148 | } 149 | } else { 150 | finished() 151 | } 152 | }) 153 | }) 154 | }) 155 | 156 | describe('when start is set', function () { 157 | it('optional start parameter should return search results', function (done) { 158 | var allLinks = [] 159 | var query = 'Microsoft' 160 | var timeFrame = 'm' 161 | 162 | var finished = function () { 163 | assert(allLinks.length === 10) 164 | done() 165 | } 166 | 167 | google.resultsPerPage = 10 168 | google.timeSpan = timeFrame 169 | google(query, 2, function (err, res) { 170 | assert.ifError(err) 171 | assert.equal(res.query, 'Microsoft') 172 | assert.equal(typeof res.$, 'function') 173 | assert.equal(typeof res.body, 'string') 174 | assert.equal(typeof res.url, 'string') 175 | assert.equal(res.start, 2) 176 | assert.equal(typeof res.links, 'object') 177 | assert.ok(res.links.length <= google.resultsPerPage) 178 | allLinks = allLinks.concat(res.links) 179 | finished() 180 | }) 181 | }) 182 | }) 183 | }) 184 | --------------------------------------------------------------------------------