├── README.md
├── .DS_Store
├── test.py
├── myStringIO.py
└── crawler.py

/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ghost123gg/tools/HEAD/.DS_Store
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

--------------------------------------------------------------------------------
/myStringIO.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
A StringIO-like class implemented by Alex.
"""

__author__ = "Alex"
__all__ = ["MyStringIO"]


def _test_ifclosed(closed):
    if closed:
        raise ValueError("I/O operation on closed file")


class MyStringIO:
    def __init__(self, buf=""):
        if not isinstance(buf, basestring):
            buf = str(buf)
        self.buf = buf
        self.length = len(buf)
        self.pos = 0
        self.closed = False

    def __iter__(self):
        return self

    def next(self):
        _test_ifclosed(self.closed)
        r = self.readline()
        if not r:
            raise StopIteration
        return r

    def close(self):
        """
        Set the closed attribute to True. A closed buffer cannot be used for
        further I/O operations. close() can be called more than once without
        error.
        """
        if not self.closed:
            self.closed = True
            del self.pos, self.buf

    def tell(self):
        """
        Return the current file position.
        """
        _test_ifclosed(self.closed)
        return self.pos

    def seek(self, pos, whence=0):
        """
        Set the file's current position. whence follows the usual convention:
        0 = absolute, 1 = relative to the current position, 2 = relative to EOF.
        """
        _test_ifclosed(self.closed)
        if whence == 1:
            pos += self.pos
        elif whence == 2:
            pos += self.length
        self.pos = max(0, pos)

    def read(self, size=-1):
        """
        Read at most size bytes and return them as a string.
        If the size argument is negative or omitted, read until EOF is reached.
        """
        _test_ifclosed(self.closed)
        if size < 0:
            newpos = self.length
        else:
            newpos = min(self.length, self.pos + size)
        r = self.buf[self.pos:newpos]
        self.pos = newpos
        return r

    def readline(self):
        """
        Read the next line from the file and return it as a string.
        """
        _test_ifclosed(self.closed)
        sep = self.buf.find("\n", self.pos)
        newpos = self.length if sep == -1 else sep + 1
        r = self.buf[self.pos:newpos]
        self.pos = newpos
        return r

    def readlines(self, size=-1):
        """
        Call readline() repeatedly and return a list of the lines so read.
        The optional size argument, if given, is an approximate bound on the
        total number of bytes in the lines returned.
        """
        _test_ifclosed(self.closed)
        lines = []
        total = 0

        line = self.readline()
        while line:
            lines.append(line)
            total += len(line)
            if 0 < size <= total:
                break
            line = self.readline()
        return lines

    def write(self, s):
        """
        Write a string to the buffer at the current position.
        """
        _test_ifclosed(self.closed)
        if not s:
            return
        if not isinstance(s, basestring):
            s = str(s)
        curpos = self.pos
        curlen = self.length
        slen = len(s)

        # Writing past EOF pads the gap with null bytes, like a real file.
        if curpos > curlen:
            self.buf += '\0' * (curpos - curlen)
            curlen = self.length = curpos

        newpos = curpos + slen
        if newpos < curlen:
            self.buf = self.buf[:curpos] + s + self.buf[newpos:]
        elif newpos == curlen:
            self.buf = self.buf[:curpos] + s
        else:
            self.length = newpos
            self.buf = self.buf[:curpos] + s
        self.pos = newpos

    def writelines(self, lines):
        """
        Write a sequence of strings to the file.
        """
        _test_ifclosed(self.closed)
        for line in lines:
            self.write(line)

    def truncate(self, size=None):
        """
        Truncate the file to at most size bytes.
        size defaults to the current file position, as returned by tell().
        """
        _test_ifclosed(self.closed)
        if size is None:
            size = self.pos
        elif size < 0:
            raise IOError("Invalid argument")
        elif size < self.pos:
            self.pos = size
        self.buf = self.buf[:size]
        self.length = size

    def getvalue(self):
        """
        Return the entire contents of the buffer.
        """
        _test_ifclosed(self.closed)
        return self.buf


def test():
    import sys
    if sys.argv[1:]:
        file = sys.argv[1]
    else:
        file = '/etc/passwd'
    lines = open(file, 'r').readlines()
    text = open(file, 'r').read()
    f = MyStringIO()
    for line in lines[:-2]:
        f.write(line)
    f.writelines(lines[-2:])
    if f.getvalue() != text:
        raise RuntimeError('write failed')
    length = f.tell()
    print 'File length =', length
    print 'Text length =', len(text)
    f.seek(len(lines[0]))
    print "Position =", f.tell()
    f.write(lines[1])
    print f.getvalue()
    f.seek(0)
    print 'First line =', repr(f.readline())
    print 'Position =', f.tell()
    line = f.readline()
    print 'Second line =', repr(line)
    f.seek(-len(line), 1)
    line2 = f.read(len(line))
    if line != line2:
        raise RuntimeError('bad result after seek back')
    f.seek(len(line2), 1)
    l = f.readlines()
    line = l[-1]
    f.seek(f.tell() - len(line))
    line2 = f.read()
    if line != line2:
        raise RuntimeError('bad result after seek back from EOF')
    print 'Read', len(l), 'more lines'
    print 'File length =', f.tell()
    if f.tell() != length:
        raise RuntimeError('bad length')
    f.truncate(length / 2)
    f.seek(0, 2)
    print 'Truncated length =', f.tell()
    if f.tell() != length / 2:
        raise RuntimeError('truncate did not adjust length')
    f.close()


if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
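
A minimal usage sketch for MyStringIO (not taken from the repository; it assumes myStringIO.py is on the import path and follows the Python 2 style used throughout):

    from myStringIO import MyStringIO

    buf = MyStringIO()
    buf.write("hello\n")
    buf.writelines(["foo\n", "bar\n"])
    buf.seek(0)
    print repr(buf.readline())   # 'hello\n'
    print repr(buf.read())       # 'foo\nbar\n'
    print buf.tell()             # 14, the same as len(buf.getvalue())
    buf.close()
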
/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Alex'

import requests
import re
import codecs
import urllib
import time
import random
from bs4 import BeautifulSoup
from url_extract import UrlExtract

http_header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive'
}

extract = UrlExtract()
p_list = []


def init_list_from_file(filename):
    l = []
    f = open(filename)
    for line in f:
        line = line.strip()
        if line:
            l.append(line)
    f.close()
    return l


def sleep_function(lower, upper):
    print "Sleeping..."
    time.sleep(random.uniform(lower, upper))
    print "Finished...\n"


def get_https_proxy_list():
    global http_header

    print "--------------------\nGetting proxies..."
    proxy_list = []
    entry_url = "http://proxy.moo.jp"
    target_url = "http://proxy.moo.jp/zh/"

    crawler = requests.Session()
    # Visit the entry page first so the session picks up any cookies it needs.
    html = crawler.get(entry_url, headers=http_header).content
    sleep_function(1, 3)

    for i in range(1, 2):
        payload = {'pr': 'HTTPS', 'page': str(i)}
        r = crawler.get(target_url, headers=http_header, params=payload)
        html = r.text

        with codecs.open("proxy.html", "w", encoding=r.encoding) as f:
            f.write(html)

        soup = BeautifulSoup(html)
        tr_list = []
        odd_list = soup.find_all("tr", class_="Odd")
        even_list = soup.find_all("tr", class_="Even")

        for ele in odd_list:
            tr_list.append(ele)
        for ele in even_list:
            tr_list.append(ele)

        for ele in tr_list:
            if ele.find("ins", class_="adsbygoogle"):
                continue
            td_list = ele.find_all("td")
            country = td_list[4].text
            # The IP cell is URL-encoded and wrapped in parentheses.
            ipdecode = td_list[0].text
            st = ipdecode.find("(")
            ed = ipdecode.find(")")
            proxy_ip = urllib.unquote(ipdecode[st + 2:ed - 1]) + ":" + td_list[1].text
            proxy_list.append(proxy_ip)
    print "Found %d proxies\n--------------------" % len(proxy_list)
    return proxy_list


def crawl_google(crawler, keyword, num, start, proxies={}):
    global http_header

    google_url = "https://www.google.co.jp/search"
    payload = {'q': keyword, 'num': num, 'start': start}

    if proxies:
        r = crawler.get(google_url, params=payload, proxies=proxies, timeout=10)
    else:
        r = crawler.get(google_url, params=payload, timeout=10)
    html = r.text

    with codecs.open("google.html", "w", encoding='UTF-8') as f:
        f.write(html)

    return html


def contain_chinese(s):
    for ch in s:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False


def extract_domain(fqdn):
    global extract
    extracted = extract.extract(fqdn)
    return str(extracted.getDomain()) + "." + str(extracted.getTld()) if extracted.valid() else ""


def extract_cite_list_from_html(soup):
    return soup.find_all('cite')


def extract_h3_list_from_html(soup):
    return soup.find_all('h3', class_='r')


def has_scheme(url):
    return url.startswith("http://") or url.startswith("https://")


def extract_domain_list_from_cite_list(cite_list):
    domain_list = set()
    for cite in cite_list:
        url = cite.text
        if not contain_chinese(url):
            ed = len(url)
            if not has_scheme(url):
                # For scheme-less URLs, cut off the path / query string so that
                # only the host part is handed to extract_domain().
                for i in range(ed):
                    if url[i] == '/' or url[i] == '?':
                        ed = i
                        break
            link = url[:ed]
            domain = extract_domain(link)
            if domain:
                print "Found domain : %s" % domain
                domain_list.add(domain)
    return domain_list


def is_spider_pool_url(url):
    if contain_chinese(url):
        return False
    if url.count('/') > 1:
        return False
    if url.count('?') > 0:
        return False
    pos = url.find('/')
    if pos != -1:
        url = url[:pos]
    if url.count('.') >= 3:
        return True
    if url.count('.') <= 1:
        return False

    pos = url.find('.')

    if url[:pos].isalpha() and len(url[:pos]) >= 5:
        return True

    has_digit = False
    has_ch = False
    for i in range(pos):
        if url[i].isalpha():
            has_ch = True
        if url[i].isdigit():
            has_digit = True

    if has_digit and has_ch:
        return True
    return False
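
# Worked examples of the heuristic above (traced by hand through the checks;
# comments only, nothing here is executed):
#   is_spider_pool_url("a.b.c.example.com")     -> True   (three or more dots in the host)
#   is_spider_pool_url("news123.example.com")   -> True   (left-most label mixes letters and digits)
#   is_spider_pool_url("www.example.com")       -> False  (short, purely alphabetic label)
#   is_spider_pool_url("example.com/page?id=1") -> False  (URLs with a query string are rejected)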


def select_random_number_from_list(l):
    return random.randint(0, len(l) - 1)


def detect(keyword):
    global p_list

    html = ""
    p_list = get_https_proxy_list()
    num = select_random_number_from_list(p_list)
    crawler = requests.Session()

    print "====================\nDetect keyword:",
    print keyword
    while True:
        try:
            print "Select proxy : %s" % p_list[num]
            html = crawl_google(crawler, keyword, 10, 0, {"https": p_list[num]})
            # html = crawl_google(crawler, keyword, 10, 0)
            soup = BeautifulSoup(html)
            cite_list = extract_cite_list_from_html(soup)
            if len(cite_list) == 0:
                print "cite list is empty",
                raise Exception
        except KeyboardInterrupt:
            break
        except Exception, e:
            print e
            print "Error! Deleting proxy %s" % p_list[num]
            crawler = requests.Session()
            del p_list[num]
            print "%d remained\n" % len(p_list)
            if len(p_list) == 0:
                p_list = get_https_proxy_list()
            num = select_random_number_from_list(p_list)
            continue
        break

    soup = BeautifulSoup(html)
    cite_list = extract_cite_list_from_html(soup)
    print "cite list length %d" % len(cite_list)
    domain_list = extract_domain_list_from_cite_list(cite_list)

    sleep_function(5, 10)

    for ele in domain_list:
        while True:
            try:
                print "Select proxy : %s" % p_list[num]
                html = crawl_google(crawler, "site:" + ele, 15, 0, {"https": p_list[num]})
                # html = crawl_google(crawler, "site:" + ele, 10, 0)
                soup = BeautifulSoup(html)
                cite_list = extract_cite_list_from_html(soup)
                if len(cite_list) == 0:
                    print "cite list is empty",
                    raise Exception
            except KeyboardInterrupt:
                break
            except Exception, e:
                print e
                print "Error! Deleting proxy %s" % p_list[num]
                del p_list[num]
                print "%d remained\n" % len(p_list)
                crawler = requests.Session()
                if len(p_list) == 0:
                    p_list = get_https_proxy_list()
                num = select_random_number_from_list(p_list)
                continue
            break

        soup = BeautifulSoup(html)
        cite_list = extract_cite_list_from_html(soup)
        cnt = 0
        for cite in cite_list:
            url = cite.text
            if is_spider_pool_url(url):
                print "check url : %s : yes" % url
                cnt += 1
            else:
                print "check url : %s : no" % url
        if cnt >= 4:
            print keyword + " " + ele
            with codecs.open("result2.txt", "a") as f:
                f.write(keyword + " " + ele + "\n")
        else:
            print ele + " is not a spider pool"
        sleep_function(7, 13)

    with open("visited2.txt", "a") as f:
        f.write(keyword + "\n")
    print "Finished\n===================="


def run():
    keyword_list = []
    f = open("medicine.txt")
    for line in f:
        keyword_list.append(line.strip())
    f.close()

    for keyword in keyword_list:
        detect(keyword)


if __name__ == '__main__':
    run()
--------------------------------------------------------------------------------
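
crawler.py imports UrlExtract from a url_extract module that is not part of this listing. Judging only from the calls in extract_domain(), the expected interface looks roughly like the sketch below; the names UrlExtract, extract(), getDomain(), getTld() and valid() come from crawler.py, while everything else (the tiny suffix table, the scheme stripping) is a placeholder assumption rather than the real implementation:

    import re

    class _ExtractResult(object):
        def __init__(self, domain, tld):
            self._domain = domain
            self._tld = tld

        def valid(self):
            return bool(self._domain and self._tld)

        def getDomain(self):
            return self._domain

        def getTld(self):
            return self._tld

    class UrlExtract(object):
        # Placeholder suffix list; a real implementation would consult the
        # full public-suffix data.
        TLDS = ("com", "net", "org", "cn", "jp", "info")

        def extract(self, fqdn):
            # Drop any scheme and path, then split the host into labels.
            host = re.sub(r"^https?://", "", fqdn.lower()).split("/")[0]
            parts = host.strip(".").split(".")
            if len(parts) >= 2 and parts[-1] in self.TLDS:
                return _ExtractResult(parts[-2], parts[-1])
            return _ExtractResult("", "")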