├── .env.example
├── .gitignore
├── LICENSE
├── README.md
├── gsearch.py
├── keywords
├── requirements.txt
└── user_agents

/.env.example:
--------------------------------------------------------------------------------
# This is a sample .env file. To use it, make a copy of this file and
# rename it to '.env', remove these comment lines (starting with #) and then
# set the variables as you wish.

BASE_URL="https://www.google.com"
RESULTS_PER_PAGE=10

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv
.env

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 meibenjin

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Google Search Crawler
======================

This is a simple Google search results crawler. Before using this tool, please read the tips below.

Requirements
----------------------
The requirements are listed in `requirements.txt` and can be installed with `pip install -r requirements.txt`.

**More info:**

1. Python

   Python must be installed on your computer; the official website is http://www.python.org. Note that `gsearch.py` relies on Python 2-only modules (`urllib2`, `StringIO`), so it has to be run with Python 2 (2.7), not Python 3.

2. BeautifulSoup

   An HTML parser used to extract search results from Google. BeautifulSoup version 4 (the `bs4` package) is recommended.

   For more information about BeautifulSoup, please visit: http://www.crummy.com/software/BeautifulSoup/

3. dotenv (python-dotenv)

   Used to load the configuration (`BASE_URL`, `RESULTS_PER_PAGE`) from the `.env` file.

How to Use?
----------------------
1. Rename or copy `.env.example` to `.env`. Apply your own config in this file only.

2. Single keyword

   >python gsearch.py 'your query key words'

   It returns about 10 extracted results by default. If you need more results, change the `expect_num` value in `gsearch.py`.

3. List of keywords

   First create a file named `keywords` and put your keyword list into it, one keyword per line, then run:

   >python gsearch.py

If there are any problems or bugs with this tool, please open an issue.

--------------------------------------------------------------------------------
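Beyond the command-line usage described in the README, `gsearch.py` can also be used as a module, since it exposes the `GoogleAPI` class and the `load_user_agent()` helper shown in the listing below. The following is a minimal sketch, assuming Python 2.7, a valid `.env`, and the `user_agents` file in the working directory; the query string is only an example:

    # Sketch: driving the crawler from another Python 2 script.
    # Importing gsearch runs its module-level .env loading, so a proper
    # .env file must exist in the current working directory.
    from gsearch import GoogleAPI, load_user_agent

    load_user_agent()          # fill the module-level user_agents list
    api = GoogleAPI()

    # Fetch roughly ten results for an example query and print them.
    for result in api.search('web crawler tutorial', num=10):
        result.printIt()

Each `SearchResult` also offers `writeFile(filename)` if you would rather append results to a file than print them.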
/gsearch.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Created by Meibenjin.
#
# Last updated: 2018-12-15
#
# Google search results crawler

import sys
import os
import urllib2
import socket
import time
import gzip
import StringIO
import re
import random
from dotenv import load_dotenv, find_dotenv
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')

# Load config from .env file
# TODO: Error handling
try:
    load_dotenv(find_dotenv(usecwd=True))
    base_url = os.environ.get('BASE_URL')
    results_per_page = int(os.environ.get('RESULTS_PER_PAGE'))
except:
    print "ERROR: Make sure you have a .env file with proper config"
    sys.exit(1)

user_agents = list()

# A result from the search engine,
# basically a url, a title and a content snippet


class SearchResult:
    def __init__(self):
        self.url = ''
        self.title = ''
        self.content = ''

    def getURL(self):
        return self.url

    def setURL(self, url):
        self.url = url

    def getTitle(self):
        return self.title

    def setTitle(self, title):
        self.title = title

    def getContent(self):
        return self.content

    def setContent(self, content):
        self.content = content

    def printIt(self, prefix=''):
        print prefix + 'url\t->', self.url
        print prefix + 'title\t->', self.title
        print prefix + 'content\t->', self.content

    def writeFile(self, filename):
        file = open(filename, 'a')
        try:
            file.write('url:' + self.url + '\n')
            file.write('title:' + self.title + '\n')
            file.write('content:' + self.content + '\n\n')
        except IOError, e:
            print 'file error:', e
        finally:
            file.close()


class GoogleAPI:
    def __init__(self):
        timeout = 40
        socket.setdefaulttimeout(timeout)

    def randomSleep(self):
        sleeptime = random.randint(60, 120)
        time.sleep(sleeptime)

    def extractDomain(self, url):
        """Return a string

        extract the domain of a url
        """
        domain = ''
        pattern = re.compile(r'http[s]?://([^/]+)/', re.U | re.M)
        url_match = pattern.search(url)
        if(url_match and url_match.lastindex > 0):
            domain = url_match.group(1)

        return domain

    def extractUrl(self, href):
        """Return a string

        extract a url from a link
        """
        url = ''
        pattern = re.compile(r'(http[s]?://[^&]+)&', re.U | re.M)
        url_match = pattern.search(href)
        if(url_match and url_match.lastindex > 0):
            url = url_match.group(1)

        return url

    def extractSearchResults(self, html):
        """Return a list

        extract search results list from downloaded html file
        """
        results = list()
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find('div', id='main')
        if div is None:
            div = soup.find('div', id='center_col')
        if div is None:
            div = soup.find('body')
        if div is not None:
            lis = div.findAll('a')
            if(len(lis) > 0):
                for link in lis:
                    if link is None:
                        continue

                    url = link['href']
                    # skip links that point back to Google itself
                    if url.find(".google") > 6:
                        continue

                    url = self.extractUrl(url)
                    if url == '':
                        continue
                    title = link.renderContents()
                    title = re.sub(r'<.+?>', '', title)
                    result = SearchResult()
                    result.setURL(url)
                    result.setTitle(title)
                    span = link.find('div')
                    if span is not None:
                        content = span.renderContents()
                        content = re.sub(r'<.+?>', '', content)
                        result.setContent(content)
                    results.append(result)
        return results

    def search(self, query, lang='en', num=results_per_page):
        """Return a list of SearchResult objects

        search the web
        @param query -> query key words
        @param lang -> language of search results
        @param num -> number of search results to return
        """
        search_results = list()
        query = urllib2.quote(query)
        if(num % results_per_page == 0):
            pages = num / results_per_page
        else:
            pages = num / results_per_page + 1

        for p in range(0, pages):
            start = p * results_per_page
            url = '%s/search?hl=%s&num=%d&start=%s&q=%s' % (
                base_url, lang, results_per_page, start, query)
            retry = 3
            while(retry > 0):
                try:
                    request = urllib2.Request(url)
                    # pick a random user agent for every request
                    length = len(user_agents)
                    index = random.randint(0, length - 1)
                    user_agent = user_agents[index]
                    request.add_header('User-agent', user_agent)
                    request.add_header('connection', 'keep-alive')
                    request.add_header('Accept-Encoding', 'gzip')
                    request.add_header('referer', base_url)
                    response = urllib2.urlopen(request)
                    html = response.read()
                    if(response.headers.get('content-encoding', None) == 'gzip'):
                        html = gzip.GzipFile(
                            fileobj=StringIO.StringIO(html)).read()

                    results = self.extractSearchResults(html)
                    search_results.extend(results)
                    break
                except urllib2.URLError, e:
                    print 'url error:', e
                    self.randomSleep()
                    retry = retry - 1
                    continue

                except Exception, e:
                    print 'error:', e
                    retry = retry - 1
                    self.randomSleep()
                    continue
        return search_results


def load_user_agent():
    fp = open('./user_agents', 'r')

    line = fp.readline().strip('\n')
    while(line):
        user_agents.append(line)
        line = fp.readline().strip('\n')
    fp.close()


def crawler():
    # Load user agent strings from file
    load_user_agent()

    # Create a GoogleAPI instance
    api = GoogleAPI()

    # set the expected number of search results to be crawled
    expect_num = 10
    # if no parameters, read query keywords from file
    if(len(sys.argv) < 2):
        keywords = open('./keywords', 'r')
        keyword = keywords.readline()
        while(keyword):
            results = api.search(keyword, num=expect_num)
            for r in results:
                r.printIt()
            keyword = keywords.readline()
        keywords.close()
    else:
        keyword = sys.argv[1]
        results = api.search(keyword, num=expect_num)
        for r in results:
            r.printIt()


if __name__ == '__main__':
    crawler()

--------------------------------------------------------------------------------
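To make the two regex helpers in `GoogleAPI` above concrete, here is a small sketch of what they return for a redirect-style result link. The href value and the example.com URLs are illustrative, not captured from a live results page, and importing `gsearch` again assumes a valid `.env` is present:

    # Sketch: exercising the regex helpers from gsearch.py under Python 2.
    # The href below is an illustrative "/url?q=..." style link.
    from gsearch import GoogleAPI

    api = GoogleAPI()
    href = '/url?q=https://example.com/page&sa=U&ved=0ahUKEwi'
    print api.extractUrl(href)                            # https://example.com/page
    print api.extractDomain('https://example.com/page/')  # example.com

Note that `extractUrl()` keeps everything from `http(s)://` up to the first `&` (and returns an empty string if there is no `&`), while `extractDomain()` expects a slash after the host name, which is why the second call includes a trailing one.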
url.find(".google") > 6: 139 | continue 140 | 141 | url = self.extractUrl(url) 142 | if(cmp(url, '') == 0): 143 | continue 144 | title = link.renderContents() 145 | title = re.sub(r'<.+?>', '', title) 146 | result = SearchResult() 147 | result.setURL(url) 148 | result.setTitle(title) 149 | span = link.find('div') 150 | if (type(span) != types.NoneType): 151 | content = span.renderContents() 152 | content = re.sub(r'<.+?>', '', content) 153 | result.setContent(content) 154 | results.append(result) 155 | return results 156 | 157 | def search(self, query, lang='en', num=results_per_page): 158 | """Return a list of lists 159 | 160 | search web 161 | @param query -> query key words 162 | @param lang -> language of search results 163 | @param num -> number of search results to return 164 | """ 165 | search_results = list() 166 | query = urllib2.quote(query) 167 | if(num % results_per_page == 0): 168 | pages = num / results_per_page 169 | else: 170 | pages = num / results_per_page + 1 171 | 172 | for p in range(0, pages): 173 | start = p * results_per_page 174 | url = '%s/search?hl=%s&num=%d&start=%s&q=%s' % ( 175 | base_url, lang, results_per_page, start, query) 176 | retry = 3 177 | while(retry > 0): 178 | try: 179 | request = urllib2.Request(url) 180 | length = len(user_agents) 181 | index = random.randint(0, length-1) 182 | user_agent = user_agents[index] 183 | request.add_header('User-agent', user_agent) 184 | request.add_header('connection', 'keep-alive') 185 | request.add_header('Accept-Encoding', 'gzip') 186 | request.add_header('referer', base_url) 187 | response = urllib2.urlopen(request) 188 | html = response.read() 189 | if(response.headers.get('content-encoding', None) == 'gzip'): 190 | html = gzip.GzipFile( 191 | fileobj=StringIO.StringIO(html)).read() 192 | 193 | results = self.extractSearchResults(html) 194 | search_results.extend(results) 195 | break 196 | except urllib2.URLError, e: 197 | print ('url error:', e) 198 | self.randomSleep() 199 | retry = retry - 1 200 | continue 201 | 202 | except Exception, e: 203 | print ('error:', e) 204 | retry = retry - 1 205 | self.randomSleep() 206 | continue 207 | return search_results 208 | 209 | 210 | def load_user_agent(): 211 | fp = open('./user_agents', 'r') 212 | 213 | line = fp.readline().strip('\n') 214 | while(line): 215 | user_agents.append(line) 216 | line = fp.readline().strip('\n') 217 | fp.close() 218 | 219 | 220 | def crawler(): 221 | # Load use agent string from file 222 | load_user_agent() 223 | 224 | # Create a GoogleAPI instance 225 | api = GoogleAPI() 226 | 227 | # set expect search results to be crawled 228 | expect_num = 10 229 | # if no parameters, read query keywords from file 230 | if(len(sys.argv) < 2): 231 | keywords = open('./keywords', 'r') 232 | keyword = keywords.readline() 233 | while(keyword): 234 | results = api.search(keyword, num=expect_num) 235 | for r in results: 236 | r.printIt() 237 | keyword = keywords.readline() 238 | keywords.close() 239 | else: 240 | keyword = sys.argv[1] 241 | results = api.search(keyword, num=expect_num) 242 | for r in results: 243 | r.printIt() 244 | 245 | 246 | if __name__ == '__main__': 247 | crawler() 248 | -------------------------------------------------------------------------------- /keywords: -------------------------------------------------------------------------------- 1 | test 2 | China 3 | USA 4 | Gmail 5 | Google 6 | Search 7 | IP 8 | TCP 9 | UDP 10 | Socket 11 | software 12 | solution 13 | create 14 | online 15 | privide 16 | manage 17 | Computer 18 | Binary 19 | 
Language
certification
program
ABC
BBC
CBC
Login
Facebook
QQ
Tencent
Alipay
Alibaba
Internet
bandwidth
locations
around
world
github
gitlab
extract
crawler
html
BeautifulSoup
visit
parser
install
simple
tool
meibenjin
issues
request
network
graph
settings
clone
https
desktop
repository
explorer
blog
Email
problem
contact
code
python
php
c
c++
Java
Javascript
CSS
jquery
nodejs
perl
shell
linux
mac
thinkpad
windows
ipv4
ipv6
insert sort
bubble sort
quick sort
information
release
Chemical
Weapons
Boost
Environment
Robots
Music
Christmas
Rebalance
hadoop
mysql
postgres
IBM
Amazon
Twitter
Baidu
Hulu

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
python-dotenv
bs4

--------------------------------------------------------------------------------