├── .env.example
├── .gitignore
├── LICENSE
├── README.md
├── gsearch.py
├── keywords
├── requirements.txt
└── user_agents

/.env.example:
--------------------------------------------------------------------------------
# This is a sample .env file. To use it, make a copy of this file and
# rename it to '.env', remove these comment lines (starting with #) and then
# set the variables as you wish.

BASE_URL="https://www.google.com"
RESULTS_PER_PAGE=10

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv
.env

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 meibenjin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Google Search Crawler
======================

This is a simple Google search results crawler. Before using this tool, please read the tips below.

Requirements
----------------------
The requirements are listed in `requirements.txt` and can be installed with `pip install -r requirements.txt`.

**More info:**

1. Python

   Python must be installed on your computer; the official website is http://www.python.org. Note that `gsearch.py` relies on Python 2-only modules (`urllib2`, `StringIO`), so it has to be run with Python 2 (2.7), not Python 3.

2. BeautifulSoup

   An HTML parser used to extract search results from Google. BeautifulSoup version 4 (the `bs4` package) is recommended.

   For more information about BeautifulSoup, please visit: http://www.crummy.com/software/BeautifulSoup/

3. dotenv (python-dotenv)

   Used to load the configuration (`BASE_URL`, `RESULTS_PER_PAGE`) from the `.env` file.

How to Use?
----------------------
1. Rename or copy `.env.example` to `.env`. Apply your own config in this file only.

2. Single keyword

   >python gsearch.py 'your query key words'

   It returns about 10 extracted results by default. If you need more results, change the `expect_num` value in `gsearch.py`.

3. List of keywords

   First create a file named `keywords` and put your keyword list into it, one keyword per line, then run:

   >python gsearch.py

If there are any problems or bugs with this tool, please open an issue.

--------------------------------------------------------------------------------
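Beyond the command-line usage described in the README, `gsearch.py` can also be used as a module, since it exposes the `GoogleAPI` class and the `load_user_agent()` helper shown in the listing below. The following is a minimal sketch, assuming Python 2.7, a valid `.env`, and the `user_agents` file in the working directory; the query string is only an example:

    # Sketch: driving the crawler from another Python 2 script.
    # Importing gsearch runs its module-level .env loading, so a proper
    # .env file must exist in the current working directory.
    from gsearch import GoogleAPI, load_user_agent

    load_user_agent()          # fill the module-level user_agents list
    api = GoogleAPI()

    # Fetch roughly ten results for an example query and print them.
    for result in api.search('web crawler tutorial', num=10):
        result.printIt()

Each `SearchResult` also offers `writeFile(filename)` if you would rather append results to a file than print them.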
/gsearch.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Created by Meibenjin.
#
# Last updated: 2018-12-15
#
# Google search results crawler

import sys
import os
import urllib2
import socket
import time
import gzip
import StringIO
import re
import random
from dotenv import load_dotenv, find_dotenv
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')

# Load config from .env file
# TODO: Error handling
try:
    load_dotenv(find_dotenv(usecwd=True))
    base_url = os.environ.get('BASE_URL')
    results_per_page = int(os.environ.get('RESULTS_PER_PAGE'))
except:
    print "ERROR: Make sure you have a .env file with proper config"
    sys.exit(1)

user_agents = list()

# A result from the search engine,
# basically a url, a title and a content snippet


class SearchResult:
    def __init__(self):
        self.url = ''
        self.title = ''
        self.content = ''

    def getURL(self):
        return self.url

    def setURL(self, url):
        self.url = url

    def getTitle(self):
        return self.title

    def setTitle(self, title):
        self.title = title

    def getContent(self):
        return self.content

    def setContent(self, content):
        self.content = content

    def printIt(self, prefix=''):
        print prefix + 'url\t->', self.url
        print prefix + 'title\t->', self.title
        print prefix + 'content\t->', self.content

    def writeFile(self, filename):
        file = open(filename, 'a')
        try:
            file.write('url:' + self.url + '\n')
            file.write('title:' + self.title + '\n')
            file.write('content:' + self.content + '\n\n')
        except IOError, e:
            print 'file error:', e
        finally:
            file.close()


class GoogleAPI:
    def __init__(self):
        timeout = 40
        socket.setdefaulttimeout(timeout)

    def randomSleep(self):
        sleeptime = random.randint(60, 120)
        time.sleep(sleeptime)

    def extractDomain(self, url):
        """Return a string

        extract the domain of a url
        """
        domain = ''
        pattern = re.compile(r'http[s]?://([^/]+)/', re.U | re.M)
        url_match = pattern.search(url)
        if(url_match and url_match.lastindex > 0):
            domain = url_match.group(1)

        return domain

    def extractUrl(self, href):
        """Return a string

        extract a url from a link
        """
        url = ''
        pattern = re.compile(r'(http[s]?://[^&]+)&', re.U | re.M)
        url_match = pattern.search(href)
        if(url_match and url_match.lastindex > 0):
            url = url_match.group(1)

        return url

    def extractSearchResults(self, html):
        """Return a list

        extract search results list from downloaded html file
        """
        results = list()
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find('div', id='main')
        if div is None:
            div = soup.find('div', id='center_col')
        if div is None:
            div = soup.find('body')
        if div is not None:
            lis = div.findAll('a')
            if(len(lis) > 0):
                for link in lis:
                    if link is None:
                        continue

                    url = link['href']
                    # skip links that point back to Google itself
                    if url.find(".google") > 6:
                        continue

                    url = self.extractUrl(url)
                    if url == '':
                        continue
                    title = link.renderContents()
                    title = re.sub(r'<.+?>', '', title)
                    result = SearchResult()
                    result.setURL(url)
                    result.setTitle(title)
                    span = link.find('div')
                    if span is not None:
                        content = span.renderContents()
                        content = re.sub(r'<.+?>', '', content)
                        result.setContent(content)
                    results.append(result)
        return results

    def search(self, query, lang='en', num=results_per_page):
        """Return a list of SearchResult objects

        search the web
        @param query -> query key words
        @param lang -> language of search results
        @param num -> number of search results to return
        """
        search_results = list()
        query = urllib2.quote(query)
        if(num % results_per_page == 0):
            pages = num / results_per_page
        else:
            pages = num / results_per_page + 1

        for p in range(0, pages):
            start = p * results_per_page
            url = '%s/search?hl=%s&num=%d&start=%s&q=%s' % (
                base_url, lang, results_per_page, start, query)
            retry = 3
            while(retry > 0):
                try:
                    request = urllib2.Request(url)
                    # pick a random user agent for every request
                    length = len(user_agents)
                    index = random.randint(0, length - 1)
                    user_agent = user_agents[index]
                    request.add_header('User-agent', user_agent)
                    request.add_header('connection', 'keep-alive')
                    request.add_header('Accept-Encoding', 'gzip')
                    request.add_header('referer', base_url)
                    response = urllib2.urlopen(request)
                    html = response.read()
                    if(response.headers.get('content-encoding', None) == 'gzip'):
                        html = gzip.GzipFile(
                            fileobj=StringIO.StringIO(html)).read()

                    results = self.extractSearchResults(html)
                    search_results.extend(results)
                    break
                except urllib2.URLError, e:
                    print 'url error:', e
                    self.randomSleep()
                    retry = retry - 1
                    continue

                except Exception, e:
                    print 'error:', e
                    retry = retry - 1
                    self.randomSleep()
                    continue
        return search_results


def load_user_agent():
    fp = open('./user_agents', 'r')

    line = fp.readline().strip('\n')
    while(line):
        user_agents.append(line)
        line = fp.readline().strip('\n')
    fp.close()


def crawler():
    # Load user agent strings from file
    load_user_agent()

    # Create a GoogleAPI instance
    api = GoogleAPI()

    # set the expected number of search results to be crawled
    expect_num = 10
    # if no parameters, read query keywords from file
    if(len(sys.argv) < 2):
        keywords = open('./keywords', 'r')
        keyword = keywords.readline()
        while(keyword):
            results = api.search(keyword, num=expect_num)
            for r in results:
                r.printIt()
            keyword = keywords.readline()
        keywords.close()
    else:
        keyword = sys.argv[1]
        results = api.search(keyword, num=expect_num)
        for r in results:
            r.printIt()


if __name__ == '__main__':
    crawler()

--------------------------------------------------------------------------------
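To make the two regex helpers in `GoogleAPI` above concrete, here is a small sketch of what they return for a redirect-style result link. The href value and the example.com URLs are illustrative, not captured from a live results page, and importing `gsearch` again assumes a valid `.env` is present:

    # Sketch: exercising the regex helpers from gsearch.py under Python 2.
    # The href below is an illustrative "/url?q=..." style link.
    from gsearch import GoogleAPI

    api = GoogleAPI()
    href = '/url?q=https://example.com/page&sa=U&ved=0ahUKEwi'
    print api.extractUrl(href)                            # https://example.com/page
    print api.extractDomain('https://example.com/page/')  # example.com

Note that `extractUrl()` keeps everything from `http(s)://` up to the first `&` (and returns an empty string if there is no `&`), while `extractDomain()` expects a slash after the host name, which is why the second call includes a trailing one.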
url.find(".google") > 6: 139 | continue 140 | 141 | url = self.extractUrl(url) 142 | if(cmp(url, '') == 0): 143 | continue 144 | title = link.renderContents() 145 | title = re.sub(r'<.+?>', '', title) 146 | result = SearchResult() 147 | result.setURL(url) 148 | result.setTitle(title) 149 | span = link.find('div') 150 | if (type(span) != types.NoneType): 151 | content = span.renderContents() 152 | content = re.sub(r'<.+?>', '', content) 153 | result.setContent(content) 154 | results.append(result) 155 | return results 156 | 157 | def search(self, query, lang='en', num=results_per_page): 158 | """Return a list of lists 159 | 160 | search web 161 | @param query -> query key words 162 | @param lang -> language of search results 163 | @param num -> number of search results to return 164 | """ 165 | search_results = list() 166 | query = urllib2.quote(query) 167 | if(num % results_per_page == 0): 168 | pages = num / results_per_page 169 | else: 170 | pages = num / results_per_page + 1 171 | 172 | for p in range(0, pages): 173 | start = p * results_per_page 174 | url = '%s/search?hl=%s&num=%d&start=%s&q=%s' % ( 175 | base_url, lang, results_per_page, start, query) 176 | retry = 3 177 | while(retry > 0): 178 | try: 179 | request = urllib2.Request(url) 180 | length = len(user_agents) 181 | index = random.randint(0, length-1) 182 | user_agent = user_agents[index] 183 | request.add_header('User-agent', user_agent) 184 | request.add_header('connection', 'keep-alive') 185 | request.add_header('Accept-Encoding', 'gzip') 186 | request.add_header('referer', base_url) 187 | response = urllib2.urlopen(request) 188 | html = response.read() 189 | if(response.headers.get('content-encoding', None) == 'gzip'): 190 | html = gzip.GzipFile( 191 | fileobj=StringIO.StringIO(html)).read() 192 | 193 | results = self.extractSearchResults(html) 194 | search_results.extend(results) 195 | break 196 | except urllib2.URLError, e: 197 | print ('url error:', e) 198 | self.randomSleep() 199 | retry = retry - 1 200 | continue 201 | 202 | except Exception, e: 203 | print ('error:', e) 204 | retry = retry - 1 205 | self.randomSleep() 206 | continue 207 | return search_results 208 | 209 | 210 | def load_user_agent(): 211 | fp = open('./user_agents', 'r') 212 | 213 | line = fp.readline().strip('\n') 214 | while(line): 215 | user_agents.append(line) 216 | line = fp.readline().strip('\n') 217 | fp.close() 218 | 219 | 220 | def crawler(): 221 | # Load use agent string from file 222 | load_user_agent() 223 | 224 | # Create a GoogleAPI instance 225 | api = GoogleAPI() 226 | 227 | # set expect search results to be crawled 228 | expect_num = 10 229 | # if no parameters, read query keywords from file 230 | if(len(sys.argv) < 2): 231 | keywords = open('./keywords', 'r') 232 | keyword = keywords.readline() 233 | while(keyword): 234 | results = api.search(keyword, num=expect_num) 235 | for r in results: 236 | r.printIt() 237 | keyword = keywords.readline() 238 | keywords.close() 239 | else: 240 | keyword = sys.argv[1] 241 | results = api.search(keyword, num=expect_num) 242 | for r in results: 243 | r.printIt() 244 | 245 | 246 | if __name__ == '__main__': 247 | crawler() 248 | -------------------------------------------------------------------------------- /keywords: -------------------------------------------------------------------------------- 1 | test 2 | China 3 | USA 4 | Gmail 5 | Google 6 | Search 7 | IP 8 | TCP 9 | UDP 10 | Socket 11 | software 12 | solution 13 | create 14 | online 15 | privide 16 | manage 17 | Computer 18 | Binary 19 | 
Language
certification
program
ABC
BBC
CBC
Login
Facebook
QQ
Tencent
Alipay
Alibaba
Internet
bandwidth
locations
around
world
github
gitlab
extract
crawler
html
BeautifulSoup
visit
parser
install
simple
tool
meibenjin
issues
request
network
graph
settings
clone
https
desktop
repository
explorer
blog
Email
problem
contact
code
python
php
c
c++
Java
Javascript
CSS
jquery
nodejs
perl
shell
linux
mac
thinkpad
windows
ipv4
ipv6
insert sort
bubble sort
quick sort
information
release
Chemical
Weapons
Boost
Environment
Robots
Music
Christmas
Rebalance
hadoop
mysql
postgres
IBM
Amazon
Twitter
Baidu
Hulu

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
python-dotenv
bs4

--------------------------------------------------------------------------------