├── README.md
├── .DS_Store
├── test.py
├── myStringIO.py
└── crawler.py

/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ghost123gg/tools/HEAD/.DS_Store
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

--------------------------------------------------------------------------------
/myStringIO.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
A StringIO-like class implemented by Alex.
"""

__author__ = "Alex"
__all__ = ["MyStringIO"]


def _test_ifclosed(closed):
    if closed:
        raise ValueError("I/O operation on closed file")


class MyStringIO:
    def __init__(self, buf=""):
        if not isinstance(buf, basestring):
            buf = str(buf)
        self.buf = buf
        self.length = len(buf)
        self.pos = 0
        self.closed = False

    def __iter__(self):
        return self

    def next(self):
        _test_ifclosed(self.closed)
        r = self.readline()
        if not r:
            raise StopIteration
        return r

    def close(self):
        """
        Set the closed attribute to True. A closed buffer cannot be used for
        further I/O operations. close() can be called more than once without
        error.
        """
        if not self.closed:
            self.closed = True
            del self.pos, self.buf

    def tell(self):
        """
        Return the current file position.
        """
        _test_ifclosed(self.closed)
        return self.pos

    def seek(self, pos, whence=0):
        """
        Set the file's current position. whence follows the usual convention:
        0 = absolute, 1 = relative to the current position, 2 = relative to EOF.
        """
        _test_ifclosed(self.closed)
        if whence == 1:
            pos += self.pos
        elif whence == 2:
            pos += self.length
        self.pos = max(0, pos)

    def read(self, size=-1):
        """
        Read at most size bytes and return them as a string.
        If the size argument is negative or omitted, read until EOF is reached.
        """
        _test_ifclosed(self.closed)
        if size < 0:
            newpos = self.length
        else:
            newpos = min(self.length, self.pos + size)
        r = self.buf[self.pos:newpos]
        self.pos = newpos
        return r

    def readline(self):
        """
        Read the next line from the file and return it as a string.
        """
        _test_ifclosed(self.closed)
        sep = self.buf.find("\n", self.pos)
        newpos = self.length if sep == -1 else sep + 1
        r = self.buf[self.pos:newpos]
        self.pos = newpos
        return r

    def readlines(self, size=-1):
        """
        Call readline() repeatedly and return a list of the lines so read.
        The optional size argument, if given, is an approximate bound on the
        total number of bytes in the lines returned.
        """
        _test_ifclosed(self.closed)
        lines = []
        total = 0

        line = self.readline()
        while line:
            lines.append(line)
            total += len(line)
            if 0 < size <= total:
                break
            line = self.readline()
        return lines

    def write(self, s):
        """
        Write a string to the buffer at the current position.
        """
        _test_ifclosed(self.closed)
        if not s:
            return
        if not isinstance(s, basestring):
            s = str(s)
        curpos = self.pos
        curlen = self.length
        slen = len(s)

        # Writing past EOF pads the gap with null bytes, like a real file.
        if curpos > curlen:
            self.buf += '\0' * (curpos - curlen)
            curlen = self.length = curpos

        newpos = curpos + slen
        if newpos < curlen:
            self.buf = self.buf[:curpos] + s + self.buf[newpos:]
        elif newpos == curlen:
            self.buf = self.buf[:curpos] + s
        else:
            self.length = newpos
            self.buf = self.buf[:curpos] + s
        self.pos = newpos

    def writelines(self, lines):
        """
        Write a sequence of strings to the file.
        """
        _test_ifclosed(self.closed)
        for line in lines:
            self.write(line)

    def truncate(self, size=None):
        """
        Truncate the file to at most size bytes.
        size defaults to the current file position, as returned by tell().
        """
        _test_ifclosed(self.closed)
        if size is None:
            size = self.pos
        elif size < 0:
            raise IOError("Invalid argument")
        elif size < self.pos:
            self.pos = size
        self.buf = self.buf[:size]
        self.length = size

    def getvalue(self):
        """
        Return the entire contents of the buffer.
        """
        _test_ifclosed(self.closed)
        return self.buf


def test():
    import sys
    if sys.argv[1:]:
        file = sys.argv[1]
    else:
        file = '/etc/passwd'
    lines = open(file, 'r').readlines()
    text = open(file, 'r').read()
    f = MyStringIO()
    for line in lines[:-2]:
        f.write(line)
    f.writelines(lines[-2:])
    if f.getvalue() != text:
        raise RuntimeError('write failed')
    length = f.tell()
    print 'File length =', length
    print 'Text length =', len(text)
    f.seek(len(lines[0]))
    print "Position =", f.tell()
    f.write(lines[1])
    print f.getvalue()
    f.seek(0)
    print 'First line =', repr(f.readline())
    print 'Position =', f.tell()
    line = f.readline()
    print 'Second line =', repr(line)
    f.seek(-len(line), 1)
    line2 = f.read(len(line))
    if line != line2:
        raise RuntimeError('bad result after seek back')
    f.seek(len(line2), 1)
    l = f.readlines()
    line = l[-1]
    f.seek(f.tell() - len(line))
    line2 = f.read()
    if line != line2:
        raise RuntimeError('bad result after seek back from EOF')
    print 'Read', len(l), 'more lines'
    print 'File length =', f.tell()
    if f.tell() != length:
        raise RuntimeError('bad length')
    f.truncate(length / 2)
    f.seek(0, 2)
    print 'Truncated length =', f.tell()
    if f.tell() != length / 2:
        raise RuntimeError('truncate did not adjust length')
    f.close()


if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
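
A minimal usage sketch for MyStringIO (not taken from the repository; it assumes myStringIO.py is on the import path and follows the Python 2 style used throughout):

    from myStringIO import MyStringIO

    buf = MyStringIO()
    buf.write("hello\n")
    buf.writelines(["foo\n", "bar\n"])
    buf.seek(0)
    print repr(buf.readline())   # 'hello\n'
    print repr(buf.read())       # 'foo\nbar\n'
    print buf.tell()             # 14, the same as len(buf.getvalue())
    buf.close()
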
/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Alex'

import requests
import re
import codecs
import urllib
import time
import random
from bs4 import BeautifulSoup
from url_extract import UrlExtract

http_header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive'
}

extract = UrlExtract()
p_list = []


def init_list_from_file(filename):
    l = []
    f = open(filename)
    for line in f:
        line = line.strip()
        if line:
            l.append(line)
    f.close()
    return l


def sleep_function(lower, upper):
    print "Sleeping..."
    time.sleep(random.uniform(lower, upper))
    print "Finished...\n"


def get_https_proxy_list():
    global http_header

    print "--------------------\nGetting proxies..."
    proxy_list = []
    entry_url = "http://proxy.moo.jp"
    target_url = "http://proxy.moo.jp/zh/"

    crawler = requests.Session()
    # Visit the entry page first so the session picks up any cookies it needs.
    html = crawler.get(entry_url, headers=http_header).content
    sleep_function(1, 3)

    for i in range(1, 2):
        payload = {'pr': 'HTTPS', 'page': str(i)}
        r = crawler.get(target_url, headers=http_header, params=payload)
        html = r.text

        with codecs.open("proxy.html", "w", encoding=r.encoding) as f:
            f.write(html)

        soup = BeautifulSoup(html)
        tr_list = []
        odd_list = soup.find_all("tr", class_="Odd")
        even_list = soup.find_all("tr", class_="Even")

        for ele in odd_list:
            tr_list.append(ele)
        for ele in even_list:
            tr_list.append(ele)

        for ele in tr_list:
            if ele.find("ins", class_="adsbygoogle"):
                continue
            td_list = ele.find_all("td")
            country = td_list[4].text
            # The IP cell is URL-encoded and wrapped in parentheses.
            ipdecode = td_list[0].text
            st = ipdecode.find("(")
            ed = ipdecode.find(")")
            proxy_ip = urllib.unquote(ipdecode[st + 2:ed - 1]) + ":" + td_list[1].text
            proxy_list.append(proxy_ip)
    print "Found %d proxies\n--------------------" % len(proxy_list)
    return proxy_list


def crawl_google(crawler, keyword, num, start, proxies={}):
    global http_header

    google_url = "https://www.google.co.jp/search"
    payload = {'q': keyword, 'num': num, 'start': start}

    if proxies:
        r = crawler.get(google_url, params=payload, proxies=proxies, timeout=10)
    else:
        r = crawler.get(google_url, params=payload, timeout=10)
    html = r.text

    with codecs.open("google.html", "w", encoding='UTF-8') as f:
        f.write(html)

    return html


def contain_chinese(s):
    for ch in s:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False


def extract_domain(fqdn):
    global extract
    extracted = extract.extract(fqdn)
    return str(extracted.getDomain()) + "." + str(extracted.getTld()) if extracted.valid() else ""


def extract_cite_list_from_html(soup):
    return soup.find_all('cite')


def extract_h3_list_from_html(soup):
    return soup.find_all('h3', class_='r')


def has_scheme(url):
    return url.startswith("http://") or url.startswith("https://")


def extract_domain_list_from_cite_list(cite_list):
    domain_list = set()
    for cite in cite_list:
        url = cite.text
        if not contain_chinese(url):
            ed = len(url)
            if not has_scheme(url):
                # For scheme-less URLs, cut off the path / query string so that
                # only the host part is handed to extract_domain().
                for i in range(ed):
                    if url[i] == '/' or url[i] == '?':
                        ed = i
                        break
            link = url[:ed]
            domain = extract_domain(link)
            if domain:
                print "Found domain : %s" % domain
                domain_list.add(domain)
    return domain_list


def is_spider_pool_url(url):
    if contain_chinese(url):
        return False
    if url.count('/') > 1:
        return False
    if url.count('?') > 0:
        return False
    pos = url.find('/')
    if pos != -1:
        url = url[:pos]
    if url.count('.') >= 3:
        return True
    if url.count('.') <= 1:
        return False

    pos = url.find('.')

    if url[:pos].isalpha() and len(url[:pos]) >= 5:
        return True

    has_digit = False
    has_ch = False
    for i in range(pos):
        if url[i].isalpha():
            has_ch = True
        if url[i].isdigit():
            has_digit = True

    if has_digit and has_ch:
        return True
    return False
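
# Worked examples of the heuristic above (traced by hand through the checks;
# comments only, nothing here is executed):
#   is_spider_pool_url("a.b.c.example.com")     -> True   (three or more dots in the host)
#   is_spider_pool_url("news123.example.com")   -> True   (left-most label mixes letters and digits)
#   is_spider_pool_url("www.example.com")       -> False  (short, purely alphabetic label)
#   is_spider_pool_url("example.com/page?id=1") -> False  (URLs with a query string are rejected)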


def select_random_number_from_list(l):
    return random.randint(0, len(l) - 1)


def detect(keyword):
    global p_list

    html = ""
    p_list = get_https_proxy_list()
    num = select_random_number_from_list(p_list)
    crawler = requests.Session()

    print "====================\nDetect keyword:",
    print keyword
    while True:
        try:
            print "Select proxy : %s" % p_list[num]
            html = crawl_google(crawler, keyword, 10, 0, {"https": p_list[num]})
            # html = crawl_google(crawler, keyword, 10, 0)
            soup = BeautifulSoup(html)
            cite_list = extract_cite_list_from_html(soup)
            if len(cite_list) == 0:
                print "cite list is empty",
                raise Exception
        except KeyboardInterrupt:
            break
        except Exception, e:
            print e
            print "Error! Deleting proxy %s" % p_list[num]
            crawler = requests.Session()
            del p_list[num]
            print "%d remained\n" % len(p_list)
            if len(p_list) == 0:
                p_list = get_https_proxy_list()
            num = select_random_number_from_list(p_list)
            continue
        break

    soup = BeautifulSoup(html)
    cite_list = extract_cite_list_from_html(soup)
    print "cite list length %d" % len(cite_list)
    domain_list = extract_domain_list_from_cite_list(cite_list)

    sleep_function(5, 10)

    for ele in domain_list:
        while True:
            try:
                print "Select proxy : %s" % p_list[num]
                html = crawl_google(crawler, "site:" + ele, 15, 0, {"https": p_list[num]})
                # html = crawl_google(crawler, "site:" + ele, 10, 0)
                soup = BeautifulSoup(html)
                cite_list = extract_cite_list_from_html(soup)
                if len(cite_list) == 0:
                    print "cite list is empty",
                    raise Exception
            except KeyboardInterrupt:
                break
            except Exception, e:
                print e
                print "Error! Deleting proxy %s" % p_list[num]
                del p_list[num]
                print "%d remained\n" % len(p_list)
                crawler = requests.Session()
                if len(p_list) == 0:
                    p_list = get_https_proxy_list()
                num = select_random_number_from_list(p_list)
                continue
            break

        soup = BeautifulSoup(html)
        cite_list = extract_cite_list_from_html(soup)
        cnt = 0
        for cite in cite_list:
            url = cite.text
            if is_spider_pool_url(url):
                print "check url : %s : yes" % url
                cnt += 1
            else:
                print "check url : %s : no" % url
        if cnt >= 4:
            print keyword + " " + ele
            with codecs.open("result2.txt", "a") as f:
                f.write(keyword + " " + ele + "\n")
        else:
            print ele + " is not a spider pool"
        sleep_function(7, 13)

    with open("visited2.txt", "a") as f:
        f.write(keyword + "\n")
    print "Finished\n===================="


def run():
    keyword_list = []
    f = open("medicine.txt")
    for line in f:
        keyword_list.append(line.strip())
    f.close()

    for keyword in keyword_list:
        detect(keyword)


if __name__ == '__main__':
    run()
--------------------------------------------------------------------------------
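
crawler.py imports UrlExtract from a url_extract module that is not part of this listing. Judging only from the calls in extract_domain(), the expected interface looks roughly like the sketch below; the names UrlExtract, extract(), getDomain(), getTld() and valid() come from crawler.py, while everything else (the tiny suffix table, the scheme stripping) is a placeholder assumption rather than the real implementation:

    import re

    class _ExtractResult(object):
        def __init__(self, domain, tld):
            self._domain = domain
            self._tld = tld

        def valid(self):
            return bool(self._domain and self._tld)

        def getDomain(self):
            return self._domain

        def getTld(self):
            return self._tld

    class UrlExtract(object):
        # Placeholder suffix list; a real implementation would consult the
        # full public-suffix data.
        TLDS = ("com", "net", "org", "cn", "jp", "info")

        def extract(self, fqdn):
            # Drop any scheme and path, then split the host into labels.
            host = re.sub(r"^https?://", "", fqdn.lower()).split("/")[0]
            parts = host.strip(".").split(".")
            if len(parts) >= 2 and parts[-1] in self.TLDS:
                return _ExtractResult(parts[-2], parts[-1])
            return _ExtractResult("", "")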