├── .gitignore ├── README ├── __init__.py ├── requirements └── base.txt └── web.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.DS_Store -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining a copy 2 | of this software and associated documentation files (the "Software"), to deal 3 | in the Software without restriction, including without limitation the rights 4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 5 | copies of the Software, and to permit persons to whom the Software is 6 | furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in 9 | all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from web import * 2 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | lxml 2 | pybloom -------------------------------------------------------------------------------- /web.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | import time 4 | import cookielib 5 | import urllib2 6 | import urllib 7 | import mimetypes 8 | import gzip 9 | import StringIO 10 | import urlparse 11 | import collections 12 | import json 13 | import csv 14 | import os 15 | import multiprocessing 16 | import httplib 17 | import copy 18 | import inspect 19 | import Queue 20 | import tempfile 21 | import subprocess 22 | import sys 23 | import functools 24 | 25 | import greenlet 26 | import gevent 27 | from gevent import monkey 28 | from gevent import queue 29 | from gevent import select 30 | from gevent import pool 31 | monkey.patch_all(thread=False) 32 | 33 | from lxml import etree 34 | import pybloom 35 | 36 | from urllib import quote_plus 37 | 38 | DBC_USERNAME = None 39 | DBC_PASSWORD = None 40 | 41 | EXCLUDED_LINK_EXTENSIONS = ('jpg', 'gif', 'jpeg','pdf', 'doc', 'docx', 'ppt', 'txt', 'png', 'zip', 'rar', 'mp3') 42 | 43 | def unique_domains_filter(iterable): 44 | domains = set() 45 | for i in iterable: 46 | parsed = urlparse.urlparse(i.strip()) 47 | if parsed.netloc not in domains: 48 | domains.add(parsed.netloc) 49 | yield i.strip() 50 | 51 | class BloomFilter(object): 52 | def __init__(self, name=None): 53 | if name and not name.endswith('.bloom'): 54 | name += '.bloom' 55 | self.name = name 56 | self.add_counter = 0 57 | try: 58 | self.bloom = pybloom.ScalableBloomFilter.fromfile(open(self.name, 'rb')) 59 | except: 60 | self.bloom = pybloom.ScalableBloomFilter(initial_capacity=100, 
error_rate=0.001, mode=pybloom.ScalableBloomFilter.SMALL_SET_GROWTH) 61 | 62 | def save(self): 63 | if self.name: 64 | self.bloom.tofile(open(self.name, 'wb')) 65 | 66 | def __del__(self): 67 | self.save() 68 | 69 | def add(self, key): 70 | self.bloom.add(key) 71 | self.add_counter += 1 72 | if len(self) / self.add_counter > 10 and self.add_counter > 100: 73 | self.save() 74 | self.add_counter = 0 75 | 76 | def __contains__(self, key, autoadd=True): 77 | result = key in self.bloom 78 | if autoadd: 79 | self.add(key) 80 | return result 81 | 82 | @property 83 | def count(self): 84 | return len(self.bloom) 85 | 86 | def __len__(self): 87 | return len(self.bloom) 88 | 89 | class RandomLines(object): 90 | def __init__(self, input_file, cache_index=True, repetitions=1): 91 | if isinstance(input_file, basestring): 92 | self.source_file = open(input_file,'rb') 93 | self.filename = input_file 94 | else: 95 | self.source_file = input_file 96 | self.filename = input_file.name 97 | self.index = [] 98 | self.cache_index = cache_index 99 | 100 | if not os.path.isfile(self.filename+'.lineindex'): 101 | self.index_file() 102 | else: 103 | for line_counter, line in enumerate(open(self.filename+'.lineindex')): 104 | line = line.strip() 105 | if line_counter == 0: 106 | if int(line) != os.path.getsize(self.filename): 107 | self.index_file() 108 | break 109 | elif len(line): 110 | self.index.append(int(line)) 111 | self.index *= repetitions 112 | self.start_index_len = len(self.index) 113 | 114 | def __iter__(self): 115 | return self 116 | 117 | def __len__(self): 118 | return len(self.index) 119 | 120 | def index_file(self): 121 | bytes_counter = 0 122 | for line in self.source_file: 123 | bytes_counter += len(line) 124 | if len(line.strip()): 125 | self.index.append(bytes_counter-len(line)) 126 | if self.cache_index: 127 | open(self.filename+'.lineindex','w').write('\n'.join(str(i) for i in [os.path.getsize(self.filename)] + self.index)) 128 | 129 | def next(self): 130 | while len(self.index): 131 | offset = self.index.pop(random.randrange(0, len(self.index))) 132 | self.source_file.seek(offset, 0) 133 | return self.source_file.readline().strip() 134 | raise StopIteration 135 | 136 | def percentage(self): 137 | if len(self.index) == 0: 138 | return 100 139 | else: 140 | return 100 - int((float(len(self.index)) / self.start_index_len) * 100) #this is buggy 141 | 142 | def spin(text_input, unique_choices=False): 143 | seen_fields = {} 144 | if text_input.count('{') - text_input.count('}') == 1: 145 | text_input += '}' 146 | for _ in range(text_input.count('{')): 147 | field = re.search('{([^{}]*)}', text_input).group(0) 148 | 149 | if unique_choices: 150 | if field not in seen_fields: 151 | seen_fields[field] = field[1:-1].split('|') 152 | if len(seen_fields[field]): 153 | replacement = seen_fields[field].pop(random.randrange(0, len(seen_fields[field]))) 154 | else: 155 | replacement = '' 156 | else: 157 | replacement = random.choice(field[1:-1].split('|')) 158 | text_input = text_input.replace(field, replacement, 1) 159 | return text_input 160 | 161 | class HTTPResponse(object): 162 | def __init__(self, response=None, url=None, fake=False, http=None): 163 | self._xpath = None 164 | self._json = None 165 | #self._encoded_data = None #might cache encoded data again in future, for now don't see the point 166 | if fake: 167 | self.original_domain = urlparse.urlparse(url).netloc.lower() 168 | self.original_url = url 169 | self.final_url = url 170 | self.final_domain = self.original_domain 171 | self._data = '
<html><body>Hello!</body></html>
' 172 | else: 173 | self.headers = response.info() 174 | compressed_data = response.read() 175 | if filter(lambda (k,v): k.lower() == 'content-encoding' and v.lower() == 'gzip', self.headers.items()): 176 | self.headers['Content-type'] = 'text/html; charset=utf-8' 177 | self._data = gzip.GzipFile(fileobj=StringIO.StringIO(compressed_data)).read() 178 | else: 179 | self._data = compressed_data 180 | 181 | self.original_domain = urlparse.urlparse(url).netloc.lower() 182 | self.original_url = url 183 | self.final_url = response.geturl() 184 | self.final_domain = urlparse.urlparse(self.final_url).netloc.lower() 185 | 186 | if http: 187 | self.http = http 188 | 189 | def encoded_data(self): 190 | return unicode(self._data,'ISO-8859-1').encode('ISO-8859-1') 191 | 192 | def __str__(self): 193 | return self._data 194 | 195 | def __len__(self): 196 | return len(str(self)) 197 | 198 | def __contains__(self,x): 199 | return x.lower() in str(self).lower() 200 | 201 | def save(self, handle): 202 | if isinstance(handle, basestring): 203 | handle = open(handle, 'w') 204 | handle.write(str(self)) 205 | handle.close() 206 | 207 | def json(self): 208 | if not self._json: 209 | self._json = json.loads(self._data) 210 | return self._json 211 | 212 | def xpath(self,expression, xml=False): 213 | if self._xpath is None: 214 | if xml: 215 | self._xpath = etree.XML(self.encoded_data()) 216 | else: 217 | self._xpath = etree.HTML(self.encoded_data()) 218 | if self._xpath is None: 219 | return [] 220 | 221 | if not isinstance(expression,basestring): 222 | expression = '||'.join(expression) 223 | if '||' in expression: 224 | results = [] 225 | for part in expression.split('||'): 226 | results.append(self.xpath(part)) 227 | return zip(*results) 228 | 229 | results = [] 230 | original_expression = expression 231 | if expression.endswith('/string()'): 232 | expression = expression.split('/string()')[0] 233 | with gevent.Timeout(30, False): 234 | xpath_result = self._xpath.xpath(expression) 235 | if isinstance(xpath_result, basestring) or not isinstance(xpath_result, collections.Iterable): 236 | return xpath_result 237 | for result in xpath_result: 238 | if expression.endswith('@href') or expression.endswith('@src') or expression.endswith('@action'): 239 | if not result.startswith('http'): 240 | result = urlparse.urljoin(self.final_url,result) 241 | result = result.split('#')[0] 242 | if original_expression.endswith('/string()'): 243 | result = result.xpath('string()') 244 | if isinstance(result,basestring) and len(result.strip()): 245 | results.append(result.strip()) 246 | else: 247 | results.append(result) 248 | return list(results) 249 | 250 | 251 | def single_xpath(self,expression): 252 | results = self.xpath(expression) 253 | if isinstance(results,basestring) or not isinstance(results,collections.Iterable): 254 | return results 255 | if results: 256 | return results[0] 257 | else: 258 | return '' 259 | 260 | def links(self): 261 | return {link.split('#')[0] for link in self.xpath('//a/@href')} 262 | 263 | def internal_links(self): 264 | return {link for link in self.links() if urlparse.urlparse(link).netloc.lower() == self.final_domain if not link.split('.')[-1].lower() in EXCLUDED_LINK_EXTENSIONS} 265 | 266 | def external_links(self, exclude_subdomains=True): 267 | if exclude_subdomains: 268 | return {link for link in self.links() if max(self.final_domain.split('.'), key=len) not in urlparse.urlparse(link).netloc and link.lower().startswith('http') and link.lower().split('.')[-1] not in EXCLUDED_LINK_EXTENSIONS} 269 
| else: 270 | return {link for link in self.links() if urlparse.urlparse(link).netloc != self.final_domain and link.lower().startswith('http') and link.lower().split('.')[-1] not in EXCLUDED_LINK_EXTENSIONS} 271 | 272 | def dofollow_links(self): 273 | return set(self.xpath('//a[@rel!="nofollow" or not(@rel)]/@href')) 274 | 275 | def nofollow_links(self): 276 | return set(self.xpath('//a[@rel="nofollow"]/@href')) 277 | 278 | def external_images(self): 279 | return set([image for image in self.xpath('//img/@src') if urlparse.urlparse(image).netloc != self.final_domain]) 280 | 281 | def csv(self): 282 | return csv.reader(StringIO.StringIO(self.encoded_data())) 283 | 284 | def regex(self,expression): 285 | if not isinstance(expression,basestring): 286 | expression = '||'.join(expression) 287 | if '||' in expression: 288 | results = [] 289 | for part in expression.split('||'): 290 | results.append(self.regex(part)) 291 | return zip(*results) 292 | return re.compile(expression,re.S|re.I).findall(self.encoded_data()) 293 | 294 | def url_regex(self,expression): 295 | if not isinstance(expression,basestring): 296 | expression = '||'.join(expression) 297 | if '||' in expression: 298 | results = [] 299 | for part in expression.split('||'): 300 | results.append(self.url_regex(part)) 301 | return zip(*results) 302 | return re.compile(expression).findall(self.final_url) 303 | 304 | def __repr__(self): 305 | return '<HTTPResponse: %s>' % self.final_url 306 | 307 | def link_with_url(self, link, domain=False): 308 | if not isinstance(link, basestring): 309 | for l in link: 310 | result = self.link_with_url(l, domain=domain) 311 | if result is not False: 312 | return result 313 | if domain: 314 | link = urlparse.urlparse(link).netloc 315 | for l, l_obj in self.xpath('//a/@href||//a[@href]'): 316 | if domain: 317 | if urlparse.urlparse(l).netloc == link: 318 | return l_obj 319 | else: 320 | if link in (l,l+'/',l.rstrip('/')): 321 | return l_obj 322 | return False 323 | 324 | def link_with_anchor(self, anchor): 325 | if not isinstance(anchor, basestring): 326 | for a in anchor: 327 | result = self.link_with_anchor(a) 328 | if result is not False: 329 | return result 330 | results = self.xpath('//a[text()="%s"]' % anchor) 331 | if len(results): 332 | return results[0] 333 | return False 334 | 335 | def image_captcha(self,xpath): 336 | try: 337 | from captcha import DBC_USERNAME, DBC_PASSWORD 338 | except: 339 | pass 340 | image_source = self.single_xpath(xpath) 341 | if image_source: 342 | image = grab(image_source, http_obj=self.http) 343 | import deathbycaptcha 344 | result = deathbycaptcha.HttpClient(DBC_USERNAME, DBC_PASSWORD).decode(StringIO.StringIO(str(image))) 345 | if result: 346 | return result['text'] 347 | 348 | def recaptcha(self): 349 | iframe_source = self.single_xpath('//iframe[contains(@src,"recaptcha")]/@src') 350 | if iframe_source: 351 | iframe = grab(iframe_source,http_obj=self.http,ref=self.final_url) 352 | return (iframe.single_xpath('//input[@id="recaptcha_challenge_field"]/@value'),iframe.image_captcha('//center/img/@src')) 353 | 354 | def solvemedia(self): 355 | iframe_source = self.single_xpath('//iframe[contains(@src, "api.solvemedia.com")]/@src') 356 | if iframe_source: 357 | iframe = grab(iframe_source,http_obj=self.http,ref=self.final_url) 358 | response = iframe.image_captcha('//img[@id="adcopy-puzzle-image"]/@src') 359 | 360 | post = iframe.hidden_fields() 361 | post['adcopy_response'] = response 362 | 363 | submit_iframe = grab('http://api.solvemedia.com/papi/verify.noscript', http_obj=self.http, 
ref=iframe_source, post=post) 364 | 365 | if submit_iframe: 366 | if len(submit_iframe.regex('c=(.+?)"')): 367 | return (response, submit_iframe.regex('c=(.+?)"')[0]) 368 | else: 369 | return ('', '') 370 | else: 371 | return ('', '') 372 | 373 | def hidden_fields(self): 374 | fields = {} 375 | for name, value in self.xpath('//input[@type="hidden"]/@name||//input[@type="hidden"]/@value'): 376 | fields[name] = value 377 | return fields 378 | 379 | def view(self): 380 | p = tempfile.mktemp() + '.html' 381 | self.save(p) 382 | if sys.platform == 'darwin': subprocess.call(('open', p)) 383 | elif sys.platform == 'nt': os.startfile(p) #duno lol 384 | elif sys.platform.startswith('linux'): subprocess.call(('xdg-open', p)) 385 | 386 | class ProxyManager(object): 387 | def __init__(self, proxy=True, min_delay=20, max_delay=None): 388 | if isinstance(proxy,list): 389 | proxies = proxy 390 | elif proxy == True: 391 | try: 392 | proxies = open('proxies.txt').read().strip().split('\n') 393 | except: 394 | proxies = [None] 395 | elif isinstance(proxy, basestring): 396 | if proxy.startswith('http'): 397 | proxies = [p.strip() for p in str(grab(proxy)).strip().split('\n') if len(p.strip())] 398 | elif os.path.isfile(proxy): 399 | proxies = [p.strip() for p in open(proxy) if len(p.strip())] 400 | elif ':' in proxy: 401 | proxies = proxy.strip().split('\n') 402 | new_proxies = [] 403 | for proxy in proxies: 404 | if proxy.count(':') == 3: 405 | ip, port, username, password = proxy.split(':') 406 | proxy = username+':'+password+'@'+ip+':'+port 407 | new_proxies.append(proxy) 408 | proxies = new_proxies 409 | elif isinstance(proxy, ProxyManager): 410 | proxies = proxy.records.keys() 411 | else: 412 | proxies = [None] 413 | 414 | self.records = dict(zip(proxies,[0 for p in proxies])) 415 | self.min_delay = min_delay 416 | self.max_delay = max_delay or min_delay 417 | 418 | def get(self): 419 | while True: 420 | proxies = [proxy for proxy, proxy_time in self.records.items() if proxy_time + random.randint(self.min_delay, self.max_delay) < time.time()] 421 | if not proxies: 422 | gevent.sleep(time.time() - min(self.records.values())) 423 | else: 424 | proxy = random.sample(proxies, 1)[0] 425 | self.records[proxy] = int(time.time()) 426 | return proxy 427 | 428 | def __len__(self): 429 | return len(self.records) 430 | 431 | def split(self, number): 432 | chunk_size = len(self) / number 433 | managers = [] 434 | for i in range(number): 435 | if len(self) % chunk_size >= number - i: 436 | managers.append(ProxyManager(self.records.keys()[chunk_size*i:chunk_size*(i+1)+1], min_delay=self.min_delay, max_delay=self.max_delay)) 437 | else: 438 | managers.append(ProxyManager(self.records.keys()[chunk_size*i:chunk_size*(i+1)], min_delay=self.min_delay, max_delay=self.max_delay)) 439 | return managers 440 | 441 | class RedisProxyManager(ProxyManager): 442 | def __init__(self, name, proxy=True, min_delay=20, max_delay=None, host='localhost', port=6379): 443 | ProxyManager.__init__(self, proxy=proxy) 444 | import redis 445 | self.r = redis.Redis(host=host, port=port) 446 | self.name = name 447 | for record in self.records: 448 | if self.r.zrank(self.name, record) == None: 449 | self.r.zadd(self.name, record, 0) 450 | self.cache = [] 451 | self.caching = False 452 | self.fill_cache() 453 | 454 | def fill_cache(self): 455 | while True: 456 | if len(self.cache) != 0: 457 | return 458 | if self.caching: 459 | gevent.sleep(1) 460 | else: 461 | self.caching = True 462 | while True: 463 | proxies = self.r.zrangebyscore(self.name, 0, 
int(time.time()), start=1, num=max(10, len(self.records) / 5)) 464 | if proxies != 0 and len(proxies) != 0: 465 | print 'filling the proxy cache', len(proxies) 466 | for proxy in proxies: 467 | self.r.zadd(self.name, proxy, int(time.time()) + 300) #Yours for 5 minutes 468 | self.cache = proxies 469 | self.caching = False 470 | return 471 | else: 472 | gevent.sleep(1) 473 | 474 | def get(self): 475 | if not len(self.cache): 476 | self.fill_cache() 477 | proxy = self.cache.pop() 478 | self.last_proxy = proxy 479 | self.r.zadd(self.name, proxy, int(time.time()) + random.randint(self.min_delay, self.max_delay)) #about to be used, so set a normal time on it 480 | return proxy 481 | 482 | def available(self): 483 | return len(self.r.zrangebyscore(self.name, 0, int(time.time()))) + len(self.cache) 484 | 485 | def __len__(self): 486 | return self.r.zcard(self.name) 487 | 488 | 489 | class HeadRequest(urllib2.Request): 490 | def get_method(self): 491 | return 'HEAD' 492 | 493 | def useragent(): 494 | agents = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)','Mozilla/5.0 (X11; Arch Linux i686; rv:2.0) Gecko/20110321 Firefox/4.0','Mozilla/5.0 (Windows; U; Windows NT 6.1; ru; rv:1.9.2.3) Gecko/20100401 Firefox/4.0 (.NET CLR 3.5.30729)','Mozilla/5.0 (Windows NT 6.1; rv:2.0) Gecko/20110319 Firefox/4.0','Mozilla/5.0 (Windows NT 6.1; rv:1.9) Gecko/20100101 Firefox/4.0','Opera/9.20 (Windows NT 6.0; U; en)','Opera/9.00 (Windows NT 5.1; U; en)','Opera/9.64(Windows NT 5.1; U; en) Presto/2.1.1') 495 | return random.choice(agents) 496 | 497 | def encode_multipart_formdata(fields, files): 498 | ''' 499 | fields is a sequence of (name, value) elements for regular form fields. 
500 | files is a sequence of (name, filename, value) elements for data to be uploaded as files 501 | Return (content_type, body) ready for httplib.HTTP instance 502 | ''' 503 | BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' 504 | CRLF = '\r\n' 505 | L = [] 506 | for (key, value) in fields: 507 | L.append('--' + BOUNDARY) 508 | L.append('Content-Disposition: form-data; name="%s"' % key) 509 | L.append('') 510 | L.append(value) 511 | for (key, filename, value) in files: 512 | L.append('--' + BOUNDARY) 513 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) 514 | L.append('Content-Type: %s' % get_content_type(filename)) 515 | L.append('') 516 | L.append(value) 517 | L.append('--' + BOUNDARY + '--') 518 | L.append('') 519 | body = CRLF.join(L) 520 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 521 | return content_type, body 522 | 523 | def get_content_type(filename): 524 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 525 | 526 | class DisabledHTTPRedirectHandler(urllib2.HTTPRedirectHandler): 527 | def redirect_request(self, req, fp, code, msg, headers, newurl): 528 | req.get_full_url() 529 | raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 530 | 531 | class http(object): 532 | def __init__(self, proxy=None, cookie_filename=None, cookies=True, redirects=True): 533 | self.handlers = set() 534 | try: 535 | useragents = [ua.strip() for ua in open('useragents.txt') if len(ua.strip())] 536 | self.useragent = random.choice(useragents).strip() 537 | except: 538 | self.useragent = useragent() 539 | 540 | self.opener = urllib2.OpenerDirector() 541 | 542 | if cookies: 543 | self.cookie_jar = cookielib.LWPCookieJar() 544 | if cookie_filename: 545 | self.cookie_jar = cookielib.MozillaCookieJar(cookie_filename) 546 | self.cookie_jar.load() 547 | cookie_support = urllib2.HTTPCookieProcessor(self.cookie_jar) 548 | else: 549 | cookie_support = None 550 | 551 | self.proxy = False 552 | proxy_auth = None 553 | 554 | if proxy: 555 | if isinstance(proxy, ProxyManager): 556 | self.proxy = proxy.get() 557 | else: 558 | self.proxy = ProxyManager(proxy).get() 559 | #print 'proxy in http = ', self.proxy 560 | 561 | if self.proxy: 562 | self.proxy = self.proxy.strip() 563 | proxy_support = urllib2.ProxyHandler({'http' : self.proxy,'https':self.proxy}) 564 | if '@' in self.proxy: 565 | proxy_auth = urllib2.HTTPBasicAuthHandler() 566 | else: 567 | proxy_auth = None 568 | else: 569 | proxy_support = None 570 | 571 | if not redirects: 572 | self.build_opener(DisabledHTTPRedirectHandler()) 573 | 574 | self.build_opener(proxy_support,cookie_support,proxy_auth) 575 | 576 | def build_opener(self,*handlers): 577 | self.handlers |= set([handler for handler in handlers if handler is not None]) 578 | self.opener = urllib2.build_opener(*self.handlers) 579 | 580 | def urlopen(self, url, post=None, ref=None, files=None, username=None, password=None, compress=True, head=False, timeout=30): 581 | assert url.lower().startswith('http') 582 | if isinstance(post, basestring): 583 | post = dict([part.split('=') for part in post.strip().split('&')]) 584 | if post: 585 | for k, v in post.items(): 586 | post[k] = spin(unicode(v).encode('utf-8')) 587 | if username and password: 588 | password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 589 | password_manager.add_password(None, url, username, password) 590 | password_auth = urllib2.HTTPBasicAuthHandler(password_manager) 591 | self.build_opener(password_auth) 592 | 
urllib2.install_opener(self.opener) 593 | if compress: 594 | headers = {'User-Agent' : self.useragent, 'Accept-encoding' : 'gzip'} 595 | else: 596 | headers = {'User-Agent' : self.useragent} 597 | if ref: 598 | headers['Referer'] = ref 599 | if files: 600 | content_type,post = encode_multipart_formdata(post.items(), files) 601 | headers['content-type'] = content_type 602 | headers['content-length'] = str(len(post)) 603 | elif post: 604 | post = urllib.urlencode(post) 605 | if head: 606 | req = HeadRequest(url, post, headers) 607 | else: 608 | req = urllib2.Request(url, post, headers) 609 | with gevent.Timeout(timeout): 610 | response = urllib2.urlopen(req) 611 | return HTTPResponse(response, url, http=self) 612 | 613 | def grab(url, proxy=None, post=None, ref=None, compress=True, retries=1, http_obj=None, cookies=False, redirects=True, timeout=30): 614 | data = None 615 | if retries < 1: 616 | retries = 1 617 | for i in range(retries): 618 | if not http_obj: 619 | http_obj = http(proxy, cookies=cookies, redirects=redirects) 620 | try: 621 | data = http_obj.urlopen(url=url, post=post, ref=ref, compress=compress, timeout=timeout) 622 | break 623 | except urllib2.HTTPError, e: 624 | if str(e.code).startswith('3') and not redirects: 625 | data = HTTPResponse(url=url, fake=True) 626 | break 627 | except: 628 | pass 629 | if data: 630 | return data 631 | return False 632 | 633 | class RedisQueue(object): 634 | def __init__(self, name, host='localhost', port=6379): 635 | import redis 636 | self.r = redis.Redis(host=host, port=port) 637 | self.name = name 638 | 639 | def put(self, item): 640 | self.r.sadd(self.name, item) 641 | 642 | def get(self, timeout=60): 643 | timeout_counter = 0 644 | while True: 645 | result = self.r.spop(self.name) 646 | if result == None: 647 | if timeout == timeout_counter: 648 | return None 649 | gevent.sleep(1) 650 | timeout_counter += 1 651 | else: 652 | return result 653 | 654 | def get_nowait(self): 655 | return self.get() 656 | 657 | def __len__(self): 658 | return self.r.scard(self.name) 659 | 660 | def empty(self): 661 | return len(self) == 0 662 | 663 | def WebQueue(iterator=None): 664 | queue = Queue.Queue() 665 | if iterator: 666 | [queue.put(item) for item in iterator] 667 | return queue 668 | 669 | def generic_iterator(iterator): 670 | if isinstance(iterator, basestring): 671 | if '\n' in iterator: 672 | for i in iterator.split('\n'): 673 | if len(i.strip()): 674 | yield i.strip() 675 | else: 676 | yield iterator.strip() 677 | else: 678 | for i in iterator: 679 | yield i 680 | 681 | class DomainQueue(object): 682 | def __init__(self, urls): 683 | self.domains = collections.defaultdict(list) 684 | for url in urls: 685 | if isinstance(url, basestring): 686 | url = urlparse.urlparse(url) 687 | self.domains[url.netloc].append(url.geturl()) 688 | self.counter = {domain:0 for domain in self.domains.keys()} 689 | 690 | def empty(self): 691 | return len(self.domains) == 0 692 | 693 | def get_nowait(self): 694 | domain = min(self.counter, key=self.counter.get) 695 | url = self.domains[domain].pop() 696 | if len(self.domains[domain]) == 0: 697 | del(self.domains[domain]) 698 | del(self.counter[domain]) 699 | else: 700 | self.counter[domain] += 1 701 | return url 702 | 703 | def get(self): 704 | return self.get_nowait() 705 | 706 | def put(self, url): 707 | if isinstance(url, basestring): 708 | url = urlparse.urlparse(url) 709 | self.domains[url.netloc].append(url.geturl()) 710 | if url.netloc not in self.counter: 711 | self.counter[url.netloc] = 0 712 | 713 | def 
__len__(self): 714 | return sum((len(d) for d in self.domains.values())) 715 | 716 | def multi_grab(urls, pool_size=100, timeout=30, max_pages=-1, queuify=True, proxy=None): 717 | if queuify: 718 | in_q = WebQueue(generic_iterator(urls)) 719 | else: 720 | in_q = urls 721 | for result_counter, result in enumerate(pooler(grab, in_q, pool_size=pool_size, timeout=timeout, proxy=proxy)): 722 | yield result 723 | if result_counter == max_pages and max_pages > 0: 724 | break 725 | 726 | def domain_crawl(urls, pool_size=100, timeout=30, max_pages=-1, link_filter=None, max_domain_pages=-1): 727 | urls = {url for url in generic_iterator(urls)} 728 | domains = {urlparse.urlparse(url).netloc for url in urls} 729 | domain_counter = collections.Counter() 730 | seen_urls = set(urls) 731 | while True: 732 | if not len(urls): 733 | break 734 | urls_queue = Queue.Queue() 735 | [urls_queue.put(url) for url in urls] 736 | urls = set() 737 | for page_counter, page in enumerate(multi_grab(urls_queue, pool_size, timeout, queuify=False)): 738 | if page.final_domain in domains: 739 | domain_counter[page.final_domain] += 1 740 | if not (domain_counter[page.final_domain] > max_domain_pages and max_domain_pages > 0): 741 | try: 742 | new_urls = {url for url in page.internal_links() if url not in seen_urls} 743 | if callable(link_filter): 744 | new_urls = {url for url in new_urls if link_filter(url)} 745 | urls |= new_urls 746 | seen_urls |= new_urls 747 | except: 748 | pass 749 | if max_pages > 0 and page_counter > max_pages: 750 | break 751 | yield page 752 | 753 | 754 | def redirecturl(url, proxy=None): 755 | return http(proxy).urlopen(url, head=True).geturl() 756 | 757 | def cloud_pooler(func, in_q, chunk_size=1000, _env='python-web', _type='c2', _max_runtime=60, get_results=True, **kwargs): 758 | import cloud 759 | if chunk_size > 1: 760 | if isinstance(in_q, collections.Iterable): 761 | in_q = WebQueue(in_q) 762 | chunks = [] 763 | chunk = [] 764 | while not in_q.empty(): 765 | chunk.append(in_q.get()) 766 | if len(chunk) == chunk_size: 767 | chunks.append(chunk) 768 | chunk = [] 769 | if len(chunk): 770 | chunks.append(chunk) 771 | else: 772 | chunks = in_q 773 | 774 | partial_func = functools.partial(func, **kwargs) 775 | jids = cloud.map(partial_func, chunks, _env=_env, _type=_type, _max_runtime=_max_runtime) 776 | 777 | if get_results: 778 | print jids 779 | for result in cloud.iresult(jids, ignore_errors=True): 780 | if result: 781 | yield result 782 | else: 783 | for jid in jids: 784 | yield jid 785 | 786 | def pooler(func, in_q, pool_size=100, proxy=False, max_results=0, **kwargs): 787 | if isinstance(in_q, collections.Iterable): 788 | in_q = WebQueue(in_q) 789 | out_q = multiprocessing.Queue() 790 | if proxy and not isinstance(proxy, ProxyManager): 791 | proxy = ProxyManager(proxy) 792 | 793 | p = pool.Pool(pool_size) 794 | greenlets = set() 795 | if proxy: 796 | kwargs['proxy'] = proxy 797 | result_counter = 0 798 | while True: 799 | #print len(greenlets), 'greenlets' 800 | finished_greenlets = {g for g in greenlets if g.value != None} 801 | greenlets -= finished_greenlets 802 | for g in finished_greenlets: 803 | if g.value != False: 804 | yield g.value 805 | result_counter += 1 806 | if max_results > 0 and result_counter >= max_results: 807 | break 808 | if len(greenlets) > pool_size: 809 | print 'uhoh, greenlets are getting stuck', len(greenlets) 810 | if len(greenlets) < pool_size: 811 | try: 812 | i = in_q.get_nowait() 813 | except: 814 | break 815 | if not isinstance(i, dict): 816 | i = 
{inspect.getargspec(func).args[0]: i} 817 | kwargs = dict(kwargs.items() + i.items()) 818 | greenlets.add(p.spawn(func, **kwargs)) 819 | else: 820 | time.sleep(1) 821 | 822 | 823 | p.join() 824 | for g in greenlets: 825 | if g.value: 826 | yield g.value 827 | --------------------------------------------------------------------------------
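
A minimal usage sketch (not part of the repository): it assumes web.py is importable from the working directory and that the example URLs are reachable. grab, multi_grab, pooler and HTTPResponse are the helpers defined in web.py above; the script name, the URLs and the page_title helper are illustrative only.

# example_usage.py -- hypothetical demo script, not shipped with the repo
import web

# Fetch a single page; grab() returns an HTTPResponse, or False once retries are exhausted.
page = web.grab('http://example.com/', retries=2)
if page:
    print page.single_xpath('//title/string()')        # text of the first <title> element
    print len(page.internal_links()), 'internal links'

# Fetch several URLs concurrently on a gevent pool; pass proxy=True to rotate through proxies.txt.
urls = ['http://example.com/', 'http://example.org/']
for response in web.multi_grab(urls, pool_size=10, timeout=15):
    print response.final_url, len(response)

# pooler() maps a single-argument function over an iterable of inputs.
def page_title(url):
    result = web.grab(url)
    # return False so pooler() silently drops failed fetches
    return result.single_xpath('//title/string()') if result else False

for title in web.pooler(page_title, urls, pool_size=5):
    print title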