├── .gitignore
├── README
├── __init__.py
├── requirements
│   └── base.txt
└── web.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.DS_Store
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Permission is hereby granted, free of charge, to any person obtaining a copy
2 | of this software and associated documentation files (the "Software"), to deal
3 | in the Software without restriction, including without limitation the rights
4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
5 | copies of the Software, and to permit persons to whom the Software is
6 | furnished to do so, subject to the following conditions:
7 |
8 | The above copyright notice and this permission notice shall be included in
9 | all copies or substantial portions of the Software.
10 |
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
17 | THE SOFTWARE.
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from web import *
2 |
--------------------------------------------------------------------------------
/requirements/base.txt:
--------------------------------------------------------------------------------
1 | lxml
2 | pybloom
--------------------------------------------------------------------------------
/web.py:
--------------------------------------------------------------------------------
1 | import re
2 | import random
3 | import time
4 | import cookielib
5 | import urllib2
6 | import urllib
7 | import mimetypes
8 | import gzip
9 | import StringIO
10 | import urlparse
11 | import collections
12 | import json
13 | import csv
14 | import os
15 | import multiprocessing
16 | import httplib
17 | import copy
18 | import inspect
19 | import Queue
20 | import tempfile
21 | import subprocess
22 | import sys
23 | import functools
24 |
25 | import greenlet
26 | import gevent
27 | from gevent import monkey
28 | from gevent import queue
29 | from gevent import select
30 | from gevent import pool
31 | monkey.patch_all(thread=False)
32 |
33 | from lxml import etree
34 | import pybloom
35 |
36 | from urllib import quote_plus
37 |
38 | DBC_USERNAME = None
39 | DBC_PASSWORD = None
40 |
41 | EXCLUDED_LINK_EXTENSIONS = ('jpg', 'gif', 'jpeg','pdf', 'doc', 'docx', 'ppt', 'txt', 'png', 'zip', 'rar', 'mp3')
42 |
43 | def unique_domains_filter(iterable):
44 | domains = set()
45 | for i in iterable:
46 | parsed = urlparse.urlparse(i.strip())
47 | if parsed.netloc not in domains:
48 | domains.add(parsed.netloc)
49 | yield i.strip()
50 |
51 | class BloomFilter(object):
52 | def __init__(self, name=None):
53 | if name and not name.endswith('.bloom'):
54 | name += '.bloom'
55 | self.name = name
56 | self.add_counter = 0
57 | try:
58 | self.bloom = pybloom.ScalableBloomFilter.fromfile(open(self.name, 'rb'))
59 | except:
60 | self.bloom = pybloom.ScalableBloomFilter(initial_capacity=100, error_rate=0.001, mode=pybloom.ScalableBloomFilter.SMALL_SET_GROWTH)
61 |
62 | def save(self):
63 | if self.name:
64 |             self.bloom.tofile(open(self.name, 'wb'))  # self.name already carries the '.bloom' suffix (added in __init__)
65 |
66 | def __del__(self):
67 | self.save()
68 |
69 | def add(self, key):
70 | self.bloom.add(key)
71 | self.add_counter += 1
72 | if len(self) / self.add_counter > 10 and self.add_counter > 100:
73 | self.save()
74 | self.add_counter = 0
75 |
76 | def __contains__(self, key, autoadd=True):
77 | result = key in self.bloom
78 | if autoadd:
79 | self.add(key)
80 | return result
81 |
82 | @property
83 | def count(self):
84 | return len(self.bloom)
85 |
86 | def __len__(self):
87 | return len(self.bloom)
88 |
89 | class RandomLines(object):
90 | def __init__(self, input_file, cache_index=True, repetitions=1):
91 | if isinstance(input_file, basestring):
92 | self.source_file = open(input_file,'rb')
93 | self.filename = input_file
94 | else:
95 | self.source_file = input_file
96 | self.filename = input_file.name
97 | self.index = []
98 | self.cache_index = cache_index
99 |
100 | if not os.path.isfile(self.filename+'.lineindex'):
101 | self.index_file()
102 | else:
103 | for line_counter, line in enumerate(open(self.filename+'.lineindex')):
104 | line = line.strip()
105 | if line_counter == 0:
106 | if int(line) != os.path.getsize(self.filename):
107 | self.index_file()
108 | break
109 | elif len(line):
110 | self.index.append(int(line))
111 | self.index *= repetitions
112 | self.start_index_len = len(self.index)
113 |
114 | def __iter__(self):
115 | return self
116 |
117 | def __len__(self):
118 | return len(self.index)
119 |
120 | def index_file(self):
121 | bytes_counter = 0
122 | for line in self.source_file:
123 | bytes_counter += len(line)
124 | if len(line.strip()):
125 | self.index.append(bytes_counter-len(line))
126 | if self.cache_index:
127 | open(self.filename+'.lineindex','w').write('\n'.join(str(i) for i in [os.path.getsize(self.filename)] + self.index))
128 |
129 | def next(self):
130 | while len(self.index):
131 | offset = self.index.pop(random.randrange(0, len(self.index)))
132 | self.source_file.seek(offset, 0)
133 | return self.source_file.readline().strip()
134 | raise StopIteration
135 |
136 | def percentage(self):
137 | if len(self.index) == 0:
138 | return 100
139 | else:
140 | return 100 - int((float(len(self.index)) / self.start_index_len) * 100) #this is buggy
141 |
142 | def spin(text_input, unique_choices=False):
143 | seen_fields = {}
144 | if text_input.count('{') - text_input.count('}') == 1:
145 | text_input += '}'
146 | for _ in range(text_input.count('{')):
147 | field = re.search('{([^{}]*)}', text_input).group(0)
148 |
149 | if unique_choices:
150 | if field not in seen_fields:
151 |                 seen_fields[field] = field[1:-1].split('|')  # strip the surrounding braces before splitting
152 |             if len(seen_fields[field]):
153 |                 replacement = seen_fields[field].pop(random.randrange(0, len(seen_fields[field])))
154 | else:
155 | replacement = ''
156 | else:
157 | replacement = random.choice(field[1:-1].split('|'))
158 | text_input = text_input.replace(field, replacement, 1)
159 | return text_input
160 |
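# Illustrative sketch (not part of the original file): spin() expands brace/pipe
# "spintax", picking one option per {a|b|c} group, e.g.
#   spin('{Hello|Hi} {world|there}')              ->  one of 'Hello world', 'Hi there', ...
#   spin('{a|b|c} {a|b|c}', unique_choices=True)  ->  avoids reusing an option within a group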
161 | class HTTPResponse(object):
162 | def __init__(self, response=None, url=None, fake=False, http=None):
163 | self._xpath = None
164 | self._json = None
165 | #self._encoded_data = None #might cache encoded data again in future, for now don't see the point
166 | if fake:
167 | self.original_domain = urlparse.urlparse(url).netloc.lower()
168 | self.original_url = url
169 | self.final_url = url
170 | self.final_domain = self.original_domain
171 |             self._data = '<html><body>Hello!</body></html>'  # minimal placeholder body for a fake (unfetched) response
172 | else:
173 | self.headers = response.info()
174 | compressed_data = response.read()
175 | if filter(lambda (k,v): k.lower() == 'content-encoding' and v.lower() == 'gzip', self.headers.items()):
176 | self.headers['Content-type'] = 'text/html; charset=utf-8'
177 | self._data = gzip.GzipFile(fileobj=StringIO.StringIO(compressed_data)).read()
178 | else:
179 | self._data = compressed_data
180 |
181 | self.original_domain = urlparse.urlparse(url).netloc.lower()
182 | self.original_url = url
183 | self.final_url = response.geturl()
184 | self.final_domain = urlparse.urlparse(self.final_url).netloc.lower()
185 |
186 | if http:
187 | self.http = http
188 |
189 | def encoded_data(self):
190 | return unicode(self._data,'ISO-8859-1').encode('ISO-8859-1')
191 |
192 | def __str__(self):
193 | return self._data
194 |
195 | def __len__(self):
196 | return len(str(self))
197 |
198 | def __contains__(self,x):
199 | return x.lower() in str(self).lower()
200 |
201 | def save(self, handle):
202 | if isinstance(handle, basestring):
203 | handle = open(handle, 'w')
204 | handle.write(str(self))
205 | handle.close()
206 |
207 | def json(self):
208 | if not self._json:
209 | self._json = json.loads(self._data)
210 | return self._json
211 |
212 | def xpath(self,expression, xml=False):
213 | if self._xpath is None:
214 | if xml:
215 | self._xpath = etree.XML(self.encoded_data())
216 | else:
217 | self._xpath = etree.HTML(self.encoded_data())
218 | if self._xpath is None:
219 | return []
220 |
221 | if not isinstance(expression,basestring):
222 | expression = '||'.join(expression)
223 | if '||' in expression:
224 | results = []
225 | for part in expression.split('||'):
226 | results.append(self.xpath(part))
227 | return zip(*results)
228 |
229 | results = []
230 | original_expression = expression
231 | if expression.endswith('/string()'):
232 | expression = expression.split('/string()')[0]
233 |         xpath_result = []  # fall back to no results if the evaluation below times out
234 |         with gevent.Timeout(30, False): xpath_result = self._xpath.xpath(expression)
235 | if isinstance(xpath_result, basestring) or not isinstance(xpath_result, collections.Iterable):
236 | return xpath_result
237 | for result in xpath_result:
238 | if expression.endswith('@href') or expression.endswith('@src') or expression.endswith('@action'):
239 | if not result.startswith('http'):
240 | result = urlparse.urljoin(self.final_url,result)
241 | result = result.split('#')[0]
242 | if original_expression.endswith('/string()'):
243 | result = result.xpath('string()')
244 | if isinstance(result,basestring) and len(result.strip()):
245 | results.append(result.strip())
246 | else:
247 | results.append(result)
248 | return list(results)
249 |
250 |
251 | def single_xpath(self,expression):
252 | results = self.xpath(expression)
253 | if isinstance(results,basestring) or not isinstance(results,collections.Iterable):
254 | return results
255 | if results:
256 | return results[0]
257 | else:
258 | return ''
259 |
260 | def links(self):
261 | return {link.split('#')[0] for link in self.xpath('//a/@href')}
262 |
263 | def internal_links(self):
264 | return {link for link in self.links() if urlparse.urlparse(link).netloc.lower() == self.final_domain if not link.split('.')[-1].lower() in EXCLUDED_LINK_EXTENSIONS}
265 |
266 | def external_links(self, exclude_subdomains=True):
267 | if exclude_subdomains:
268 | return {link for link in self.links() if max(self.final_domain.split('.'), key=len) not in urlparse.urlparse(link).netloc and link.lower().startswith('http') and link.lower().split('.')[-1] not in EXCLUDED_LINK_EXTENSIONS}
269 | else:
270 | return {link for link in self.links() if urlparse.urlparse(link).netloc != self.final_domain and link.lower().startswith('http') and link.lower().split('.')[-1] not in EXCLUDED_LINK_EXTENSIONS}
271 |
272 | def dofollow_links(self):
273 | return set(self.xpath('//a[@rel!="nofollow" or not(@rel)]/@href'))
274 |
275 | def nofollow_links(self):
276 | return set(self.xpath('//a[@rel="nofollow"]/@href'))
277 |
278 | def external_images(self):
279 |         return set([image for image in self.xpath('//img/@src') if urlparse.urlparse(image).netloc != self.final_domain])
280 |
281 | def csv(self):
282 | return csv.reader(self.encoded_data())
283 |
284 | def regex(self,expression):
285 | if not isinstance(expression,basestring):
286 | expression = '||'.join(expression)
287 | if '||' in expression:
288 | results = []
289 | for part in expression.split('||'):
290 | results.append(self.regex(part))
291 | return zip(*results)
292 | return re.compile(expression,re.S|re.I).findall(self.encoded_data())
293 |
294 | def url_regex(self,expression):
295 | if not isinstance(expression,basestring):
296 | expression = '||'.join(expression)
297 | if '||' in expression:
298 | results = []
299 | for part in expression.split('||'):
300 | results.append(self.xpath(part))
301 | return zip(*results)
302 | return re.compile(expression).findall(self.final_url)
303 |
304 | def __repr__(self):
305 |         return '<HTTPResponse %s>' % self.final_url
306 |
307 | def link_with_url(self, link, domain=False):
308 | if not isinstance(link, basestring):
309 |             for l in link:
310 | result = self.link_with_url(l, domain=domain)
311 | if result is not False:
312 | return result
313 | if domain:
314 | link = urlparse.urlparse(link).netloc
315 | for l, l_obj in self.xpath('//a/@href||//a[@href]'):
316 | if domain:
317 | if urlparse.urlparse(l).netloc == link:
318 | return l_obj
319 | else:
320 | if link in (l,l+'/',l.rstrip('/')):
321 | return l_obj
322 | return False
323 |
324 | def link_with_anchor(self, anchor):
325 | if not isinstance(anchor, basestring):
326 | for a in anchor:
327 |                 result = self.link_with_anchor(a)
328 | if result is not False:
329 | return result
330 | results = self.xpath('//a[text()="%s"]' % anchor)
331 | if len(results):
332 | return results[0]
333 | return False
334 |
335 | def image_captcha(self,xpath):
336 | try:
337 | from captcha import DBC_USERNAME, DBC_PASSWORD
338 | except:
339 | pass
340 | image_source = self.single_xpath(xpath)
341 | if image_source:
342 | image = grab(image_source, http_obj=self.http)
343 | import deathbycaptcha
344 | result = deathbycaptcha.HttpClient(DBC_USERNAME, DBC_PASSWORD).decode(StringIO.StringIO(str(image)))
345 | if result:
346 | return result['text']
347 |
348 | def recaptcha(self):
349 | iframe_source = self.single_xpath('//iframe[contains(@src,"recaptcha")]/@src')
350 | if iframe_source:
351 | iframe = grab(iframe_source,http_obj=self.http,ref=self.final_url)
352 | return (iframe.single_xpath('//input[@id="recaptcha_challenge_field"]/@value'),iframe.image_captcha('//center/img/@src'))
353 |
354 | def solvemedia(self):
355 | iframe_source = self.single_xpath('//iframe[contains(@src, "api.solvemedia.com")]/@src')
356 | if iframe_source:
357 | iframe = grab(iframe_source,http_obj=self.http,ref=self.final_url)
358 | response = iframe.image_captcha('//img[@id="adcopy-puzzle-image"]/@src')
359 |
360 | post = iframe.hidden_fields()
361 | post['adcopy_response'] = response
362 |
363 | submit_iframe = grab('http://api.solvemedia.com/papi/verify.noscript', http_obj=self.http, ref=iframe_source, post=post)
364 |
365 | if submit_iframe:
366 | if len(submit_iframe.regex('c=(.+?)"')):
367 | return (response, submit_iframe.regex('c=(.+?)"')[0])
368 | else:
369 | return ('', '')
370 | else:
371 | return ('', '')
372 |
373 | def hidden_fields(self):
374 | fields = {}
375 | for name, value in self.xpath('//input[@type="hidden"]/@name||//input[@type="hidden"]/@value'):
376 | fields[name] = value
377 | return fields
378 |
379 | def view(self):
380 | p = tempfile.mktemp() + '.html'
381 | self.save(p)
382 | if sys.platform == 'darwin': subprocess.call(('open', p))
383 |         elif sys.platform == 'win32': os.startfile(p)
384 | elif sys.platform.startswith('linux'): subprocess.call(('xdg-open', p))
385 |
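# Illustrative sketch (not part of the original file): HTTPResponse.xpath accepts
# a '||'-joined pair of expressions and zips their results pairwise, e.g.
#   page = grab('http://example.com/')
#   for href, text in page.xpath('//a/@href||//a/text()'):
#       print href, text
#   title = page.single_xpath('//title/text()')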
386 | class ProxyManager(object):
387 | def __init__(self, proxy=True, min_delay=20, max_delay=None):
388 | if isinstance(proxy,list):
389 | proxies = proxy
390 | elif proxy == True:
391 | try:
392 | proxies = open('proxies.txt').read().strip().split('\n')
393 | except:
394 | proxies = [None]
395 | elif isinstance(proxy, basestring):
396 | if proxy.startswith('http'):
397 | proxies = [p.strip() for p in str(grab(proxy)).strip().split('\n') if len(p.strip())]
398 | elif os.path.isfile(proxy):
399 | proxies = [p.strip() for p in open(proxy) if len(p.strip())]
400 | elif ':' in proxy:
401 | proxies = proxy.strip().split('\n')
402 | new_proxies = []
403 | for proxy in proxies:
404 | if proxy.count(':') == 3:
405 | ip, port, username, password = proxy.split(':')
406 | proxy = username+':'+password+'@'+ip+':'+port
407 | new_proxies.append(proxy)
408 | proxies = new_proxies
409 | elif isinstance(proxy, ProxyManager):
410 | proxies = proxy.records.keys()
411 | else:
412 | proxies = [None]
413 |
414 | self.records = dict(zip(proxies,[0 for p in proxies]))
415 | self.min_delay = min_delay
416 | self.max_delay = max_delay or min_delay
417 |
418 | def get(self):
419 | while True:
420 | proxies = [proxy for proxy, proxy_time in self.records.items() if proxy_time + random.randint(self.min_delay, self.max_delay) < time.time()]
421 | if not proxies:
422 |                 gevent.sleep(max(1, min(self.records.values()) + self.min_delay - time.time()))  # wait until the least recently used proxy frees up
423 | else:
424 | proxy = random.sample(proxies, 1)[0]
425 | self.records[proxy] = int(time.time())
426 | return proxy
427 |
428 | def __len__(self):
429 | return len(self.records)
430 |
431 | def split(self, number):
432 | chunk_size = len(self) / number
433 | managers = []
434 | for i in range(number):
435 | if len(self) % chunk_size >= number - i:
436 | managers.append(ProxyManager(self.records.keys()[chunk_size*i:chunk_size*(i+1)+1], min_delay=self.min_delay, max_delay=self.max_delay))
437 | else:
438 | managers.append(ProxyManager(self.records.keys()[chunk_size*i:chunk_size*(i+1)], min_delay=self.min_delay, max_delay=self.max_delay))
439 | return managers
440 |
441 | class RedisProxyManager(ProxyManager):
442 | def __init__(self, name, proxy=True, min_delay=20, max_delay=None, host='localhost', port=6379):
443 | ProxyManager.__init__(self, proxy=proxy)
444 | import redis
445 | self.r = redis.Redis(host=host, port=port)
446 | self.name = name
447 | for record in self.records:
448 | if self.r.zrank(self.name, record) == None:
449 | self.r.zadd(self.name, record, 0)
450 | self.cache = []
451 | self.caching = False
452 | self.fill_cache()
453 |
454 | def fill_cache(self):
455 | while True:
456 | if len(self.cache) != 0:
457 | return
458 | if self.caching:
459 | gevent.sleep(1)
460 | else:
461 | self.caching = True
462 | while True:
463 | proxies = self.r.zrangebyscore(self.name, 0, int(time.time()), start=1, num=max(10, len(self.records) / 5))
464 | if proxies != 0 and len(proxies) != 0:
465 | print 'filling the proxy cache', len(proxies)
466 | for proxy in proxies:
467 | self.r.zadd(self.name, proxy, int(time.time()) + 300) #Yours for 5 minutes
468 | self.cache = proxies
469 | self.caching = False
470 | return
471 | else:
472 | gevent.sleep(1)
473 |
474 | def get(self):
475 | if not len(self.cache):
476 | self.fill_cache()
477 | proxy = self.cache.pop()
478 | self.last_proxy = proxy
479 | self.r.zadd(self.name, proxy, int(time.time()) + random.randint(self.min_delay, self.max_delay)) #about to be used, so set a normal time on it
480 | return proxy
481 |
482 | def available(self):
483 | return len(self.r.zrangebyscore(self.name, 0, int(time.time()))) + len(self.cache)
484 |
485 | def __len__(self):
486 | return self.r.zcard(self.name)
487 |
488 |
489 | class HeadRequest(urllib2.Request):
490 | def get_method(self):
491 | return 'HEAD'
492 |
493 | def useragent():
494 | agents = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)','Mozilla/5.0 (X11; Arch Linux i686; rv:2.0) Gecko/20110321 Firefox/4.0','Mozilla/5.0 (Windows; U; Windows NT 6.1; ru; rv:1.9.2.3) Gecko/20100401 Firefox/4.0 (.NET CLR 3.5.30729)','Mozilla/5.0 (Windows NT 6.1; rv:2.0) Gecko/20110319 Firefox/4.0','Mozilla/5.0 (Windows NT 6.1; rv:1.9) Gecko/20100101 Firefox/4.0','Opera/9.20 (Windows NT 6.0; U; en)','Opera/9.00 (Windows NT 5.1; U; en)','Opera/9.64(Windows NT 5.1; U; en) Presto/2.1.1')
495 | return random.choice(agents)
496 |
497 | def encode_multipart_formdata(fields, files):
498 | '''
499 | fields is a sequence of (name, value) elements for regular form fields.
500 | files is a sequence of (name, filename, value) elements for data to be uploaded as files
501 | Return (content_type, body) ready for httplib.HTTP instance
502 | '''
503 | BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
504 | CRLF = '\r\n'
505 | L = []
506 | for (key, value) in fields:
507 | L.append('--' + BOUNDARY)
508 | L.append('Content-Disposition: form-data; name="%s"' % key)
509 | L.append('')
510 | L.append(value)
511 | for (key, filename, value) in files:
512 | L.append('--' + BOUNDARY)
513 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
514 | L.append('Content-Type: %s' % get_content_type(filename))
515 | L.append('')
516 | L.append(value)
517 | L.append('--' + BOUNDARY + '--')
518 | L.append('')
519 | body = CRLF.join(L)
520 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
521 | return content_type, body
522 |
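# Illustrative sketch (not part of the original file): the tuple shapes expected by
# encode_multipart_formdata, which http.urlopen() below uses when files= is given.
#   fields = [('title', 'My upload')]                                         # (name, value)
#   files = [('attachment', 'report.pdf', open('report.pdf', 'rb').read())]  # (name, filename, data)
#   content_type, body = encode_multipart_formdata(fields, files)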
523 | def get_content_type(filename):
524 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
525 |
526 | class DisabledHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
527 | def redirect_request(self, req, fp, code, msg, headers, newurl):
528 | req.get_full_url()
529 | raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
530 |
531 | class http(object):
532 | def __init__(self, proxy=None, cookie_filename=None, cookies=True, redirects=True):
533 | self.handlers = set()
534 | try:
535 | useragents = [ua.strip() for ua in open('useragents.txt') if len(ua.strip())]
536 | self.useragent = random.choice(useragents).strip()
537 | except:
538 | self.useragent = useragent()
539 |
540 | self.opener = urllib2.OpenerDirector()
541 |
542 | if cookies:
543 | self.cookie_jar = cookielib.LWPCookieJar()
544 | if cookie_filename:
545 | self.cookie_jar = cookielib.MozillaCookieJar(cookie_filename)
546 | self.cookie_jar.load()
547 | cookie_support = urllib2.HTTPCookieProcessor(self.cookie_jar)
548 | else:
549 | cookie_support = None
550 |
551 | self.proxy = False
552 | proxy_auth = None
553 |
554 | if proxy:
555 | if isinstance(proxy, ProxyManager):
556 | self.proxy = proxy.get()
557 | else:
558 | self.proxy = ProxyManager(proxy).get()
559 | #print 'proxy in http = ', self.proxy
560 |
561 | if self.proxy:
562 | self.proxy = self.proxy.strip()
563 | proxy_support = urllib2.ProxyHandler({'http' : self.proxy,'https':self.proxy})
564 | if '@' in self.proxy:
565 | proxy_auth = urllib2.HTTPBasicAuthHandler()
566 | else:
567 | proxy_auth = None
568 | else:
569 | proxy_support = None
570 |
571 | if not redirects:
572 | self.build_opener(DisabledHTTPRedirectHandler())
573 |
574 | self.build_opener(proxy_support,cookie_support,proxy_auth)
575 |
576 | def build_opener(self,*handlers):
577 | self.handlers |= set([handler for handler in handlers if handler is not None])
578 | self.opener = urllib2.build_opener(*self.handlers)
579 |
580 | def urlopen(self, url, post=None, ref=None, files=None, username=None, password=None, compress=True, head=False, timeout=30):
581 | assert url.lower().startswith('http')
582 | if isinstance(post, basestring):
583 | post = dict([part.split('=') for part in post.strip().split('&')])
584 | if post:
585 | for k, v in post.items():
586 | post[k] = spin(unicode(v).encode('utf-8'))
587 | if username and password:
588 | password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
589 | password_manager.add_password(None, url, username, password)
590 | password_auth = urllib2.HTTPBasicAuthHandler(password_manager)
591 | self.build_opener(password_auth)
592 | urllib2.install_opener(self.opener)
593 | if compress:
594 | headers = {'User-Agent' : self.useragent, 'Accept-encoding' : 'gzip'}
595 | else:
596 | headers = {'User-Agent' : self.useragent}
597 | if ref:
598 | headers['Referer'] = ref
599 | if files:
600 | content_type,post = encode_multipart_formdata(post.items(), files)
601 | headers['content-type'] = content_type
602 | headers['content-length'] = str(len(post))
603 | elif post:
604 | post = urllib.urlencode(post)
605 | if head:
606 | req = HeadRequest(url, post, headers)
607 | else:
608 | req = urllib2.Request(url, post, headers)
609 | with gevent.Timeout(timeout):
610 | response = urllib2.urlopen(req)
611 | return HTTPResponse(response, url, http=self)
612 |
613 | def grab(url, proxy=None, post=None, ref=None, compress=True, retries=1, http_obj=None, cookies=False, redirects=True, timeout=30):
614 | data = None
615 | if retries < 1:
616 | retries = 1
617 | for i in range(retries):
618 | if not http_obj:
619 | http_obj = http(proxy, cookies=cookies, redirects=redirects)
620 | try:
621 | data = http_obj.urlopen(url=url, post=post, ref=ref, compress=compress, timeout=timeout)
622 | break
623 | except urllib2.HTTPError, e:
624 | if str(e.code).startswith('3') and not redirects:
625 | data = HTTPResponse(url=url, fake=True)
626 | break
627 | except:
628 | pass
629 | if data:
630 | return data
631 | return False
632 |
633 | class RedisQueue(object):
634 | def __init__(self, name, host='localhost', port=6379):
635 | import redis
636 | self.r = redis.Redis(host=host, port=port)
637 | self.name = name
638 |
639 | def put(self, item):
640 | self.r.sadd(self.name, item)
641 |
642 | def get(self, timeout=60):
643 | timeout_counter = 0
644 | while True:
645 | result = self.r.spop(self.name)
646 | if result == None:
647 | if timeout == timeout_counter:
648 | return None
649 | gevent.sleep(1)
650 | timeout_counter += 1
651 | else:
652 | return result
653 |
654 | def get_nowait(self):
655 | return self.get()
656 |
657 | def __len__(self):
658 | return self.r.scard(self.name)
659 |
660 | def empty(self):
661 | return len(self) == 0
662 |
663 | def WebQueue(iterator=None):
664 | queue = Queue.Queue()
665 | if iterator:
666 | [queue.put(item) for item in iterator]
667 | return queue
668 |
669 | def generic_iterator(iterator):
670 | if isinstance(iterator, basestring):
671 | if '\n' in iterator:
672 | for i in iterator.split('\n'):
673 | if len(i.strip()):
674 | yield i.strip()
675 | else:
676 | yield iterator.strip()
677 | else:
678 | for i in iterator:
679 | yield i
680 |
681 | class DomainQueue(object):
682 | def __init__(self, urls):
683 | self.domains = collections.defaultdict(list)
684 | for url in urls:
685 | if isinstance(url, basestring):
686 | url = urlparse.urlparse(url)
687 | self.domains[url.netloc].append(url.geturl())
688 | self.counter = {domain:0 for domain in self.domains.keys()}
689 |
690 | def empty(self):
691 | return len(self.domains) == 0
692 |
693 | def get_nowait(self):
694 | domain = min(self.counter, key=self.counter.get)
695 | url = self.domains[domain].pop()
696 | if len(self.domains[domain]) == 0:
697 | del(self.domains[domain])
698 | del(self.counter[domain])
699 | else:
700 | self.counter[domain] += 1
701 | return url
702 |
703 | def get(self):
704 | return self.get_nowait()
705 |
706 | def put(self, url):
707 | if isinstance(url, basestring):
708 | url = urlparse.urlparse(url)
709 | self.domains[url.netloc].append(url.geturl())
710 | if url.netloc not in self.counter:
711 | self.counter[url.netloc] = 0
712 |
713 | def __len__(self):
714 | return sum((len(d) for d in self.domains.values()))
715 |
716 | def multi_grab(urls, pool_size=100, timeout=30, max_pages=-1, queuify=True, proxy=None):
717 | if queuify:
718 | in_q = WebQueue(generic_iterator(urls))
719 | else:
720 | in_q = urls
721 | for result_counter, result in enumerate(pooler(grab, in_q, pool_size=pool_size, timeout=timeout, proxy=proxy)):
722 | yield result
723 | if result_counter == max_pages and max_pages > 0:
724 | break
725 |
726 | def domain_crawl(urls, pool_size=100, timeout=30, max_pages=-1, link_filter=None, max_domain_pages=-1):
727 | urls = {url for url in generic_iterator(urls)}
728 | domains = {urlparse.urlparse(url).netloc for url in urls}
729 | domain_counter = collections.Counter()
730 | seen_urls = set(urls)
731 | while True:
732 | if not len(urls):
733 | break
734 | urls_queue = Queue.Queue()
735 | [urls_queue.put(url) for url in urls]
736 | urls = set()
737 | for page_counter, page in enumerate(multi_grab(urls_queue, pool_size, timeout, queuify=False)):
738 | if page.final_domain in domains:
739 | domain_counter[page.final_domain] += 1
740 | if not (domain_counter[page.final_domain] > max_domain_pages and max_domain_pages > 0):
741 | try:
742 | new_urls = {url for url in page.internal_links() if url not in seen_urls}
743 | if callable(link_filter):
744 | new_urls = {url for url in new_urls if link_filter(url)}
745 | urls |= new_urls
746 | seen_urls |= new_urls
747 | except:
748 | pass
749 | if max_pages > 0 and page_counter > max_pages:
750 | break
751 | yield page
752 |
753 |
754 | def redirecturl(url, proxy=None):
755 |     return http(proxy).urlopen(url, head=True).final_url
756 |
757 | def cloud_pooler(func, in_q, chunk_size=1000, _env='python-web', _type='c2', _max_runtime=60, get_results=True, **kwargs):
758 | import cloud
759 | if chunk_size > 1:
760 | if isinstance(in_q, collections.Iterable):
761 | in_q = WebQueue(in_q)
762 | chunks = []
763 | chunk = []
764 | while not in_q.empty():
765 | chunk.append(in_q.get())
766 | if len(chunk) == chunk_size:
767 | chunks.append(chunk)
768 | chunk = []
769 | if len(chunk):
770 | chunks.append(chunk)
771 | else:
772 | chunks = in_q
773 |
774 | partial_func = functools.partial(func, **kwargs)
775 | jids = cloud.map(partial_func, chunks, _env=_env, _type=_type, _max_runtime=_max_runtime)
776 |
777 | if get_results:
778 | print jids
779 | for result in cloud.iresult(jids, ignore_errors=True):
780 | if result:
781 | yield result
782 | else:
783 | for jid in jids:
784 | yield jid
785 |
786 | def pooler(func, in_q, pool_size=100, proxy=False, max_results=0, **kwargs):
787 | if isinstance(in_q, collections.Iterable):
788 | in_q = WebQueue(in_q)
789 | out_q = multiprocessing.Queue()
790 | if proxy and not isinstance(proxy, ProxyManager):
791 | proxy = ProxyManager(proxy)
792 |
793 | p = pool.Pool(pool_size)
794 | greenlets = set()
795 | if proxy:
796 | kwargs['proxy'] = proxy
797 | result_counter = 0
798 | while True:
799 | #print len(greenlets), 'greenlets'
800 |         finished_greenlets = {g for g in greenlets if g.ready()}  # reap all completed greenlets, including those that returned None or raised
801 | greenlets -= finished_greenlets
802 | for g in finished_greenlets:
803 |             if g.value not in (None, False):
804 | yield g.value
805 | result_counter += 1
806 | if max_results > 0 and result_counter >= max_results:
807 | break
808 | if len(greenlets) > pool_size:
809 | print 'uhoh, greenlets are getting stuck', len(greenlets)
810 | if len(greenlets) < pool_size:
811 | try:
812 | i = in_q.get_nowait()
813 | except:
814 | break
815 | if not isinstance(i, dict):
816 | i = {inspect.getargspec(func).args[0]: i}
817 | kwargs = dict(kwargs.items() + i.items())
818 | greenlets.add(p.spawn(func, **kwargs))
819 | else:
820 | time.sleep(1)
821 |
822 |
823 | p.join()
824 | for g in greenlets:
825 | if g.value:
826 | yield g.value
827 |
--------------------------------------------------------------------------------
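A minimal usage sketch, assuming the repository is importable as `web` and the Python 2 dependencies (gevent, lxml, pybloom) are installed; the URLs and proxy addresses below are placeholders:

    import web

    # single fetch; grab() returns an HTTPResponse, or False once retries are exhausted
    page = web.grab('http://example.com/', retries=2)
    if page:
        print page.single_xpath('//title/text()')
        print len(page.internal_links()), 'internal links'

    # concurrent fetching through a shared, rate-limited proxy pool
    proxies = web.ProxyManager(['1.2.3.4:8080', '5.6.7.8:3128'], min_delay=10)
    for page in web.multi_grab(['http://example.com/', 'http://example.org/'], pool_size=50, proxy=proxies):
        print page.final_url, len(page)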