├── .gitignore ├── README ├── __init__.py ├── requirements └── base.txt └── web.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.DS_Store -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining a copy 2 | of this software and associated documentation files (the "Software"), to deal 3 | in the Software without restriction, including without limitation the rights 4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 5 | copies of the Software, and to permit persons to whom the Software is 6 | furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in 9 | all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from web import * 2 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | lxml 2 | pybloom -------------------------------------------------------------------------------- /web.py: -------------------------------------------------------------------------------- 1 | import re 2 | import random 3 | import time 4 | import cookielib 5 | import urllib2 6 | import urllib 7 | import mimetypes 8 | import gzip 9 | import StringIO 10 | import urlparse 11 | import collections 12 | import json 13 | import csv 14 | import os 15 | import multiprocessing 16 | import httplib 17 | import copy 18 | import inspect 19 | import Queue 20 | import tempfile 21 | import subprocess 22 | import sys 23 | import functools 24 | 25 | import greenlet 26 | import gevent 27 | from gevent import monkey 28 | from gevent import queue 29 | from gevent import select 30 | from gevent import pool 31 | monkey.patch_all(thread=False) 32 | 33 | from lxml import etree 34 | import pybloom 35 | 36 | from urllib import quote_plus 37 | 38 | DBC_USERNAME = None 39 | DBC_PASSWORD = None 40 | 41 | EXCLUDED_LINK_EXTENSIONS = ('jpg', 'gif', 'jpeg','pdf', 'doc', 'docx', 'ppt', 'txt', 'png', 'zip', 'rar', 'mp3') 42 | 43 | def unique_domains_filter(iterable): 44 | domains = set() 45 | for i in iterable: 46 | parsed = urlparse.urlparse(i.strip()) 47 | if parsed.netloc not in domains: 48 | domains.add(parsed.netloc) 49 | yield i.strip() 50 | 51 | class BloomFilter(object): 52 | def __init__(self, name=None): 53 | if name and not name.endswith('.bloom'): 54 | name += '.bloom' 55 | self.name = name 56 | self.add_counter = 0 57 | try: 58 | self.bloom = pybloom.ScalableBloomFilter.fromfile(open(self.name, 'rb')) 59 | except: 60 | self.bloom = pybloom.ScalableBloomFilter(initial_capacity=100, 
error_rate=0.001, mode=pybloom.ScalableBloomFilter.SMALL_SET_GROWTH) 61 | 62 | def save(self): 63 | if self.name: 64 | self.bloom.tofile(open(self.name, 'wb')) 65 | 66 | def __del__(self): 67 | self.save() 68 | 69 | def add(self, key): 70 | self.bloom.add(key) 71 | self.add_counter += 1 72 | if len(self) / self.add_counter > 10 and self.add_counter > 100: 73 | self.save() 74 | self.add_counter = 0 75 | 76 | def __contains__(self, key, autoadd=True): 77 | result = key in self.bloom 78 | if autoadd: 79 | self.add(key) 80 | return result 81 | 82 | @property 83 | def count(self): 84 | return len(self.bloom) 85 | 86 | def __len__(self): 87 | return len(self.bloom) 88 | 89 | class RandomLines(object): 90 | def __init__(self, input_file, cache_index=True, repetitions=1): 91 | if isinstance(input_file, basestring): 92 | self.source_file = open(input_file,'rb') 93 | self.filename = input_file 94 | else: 95 | self.source_file = input_file 96 | self.filename = input_file.name 97 | self.index = [] 98 | self.cache_index = cache_index 99 | 100 | if not os.path.isfile(self.filename+'.lineindex'): 101 | self.index_file() 102 | else: 103 | for line_counter, line in enumerate(open(self.filename+'.lineindex')): 104 | line = line.strip() 105 | if line_counter == 0: 106 | if int(line) != os.path.getsize(self.filename): 107 | self.index_file() 108 | break 109 | elif len(line): 110 | self.index.append(int(line)) 111 | self.index *= repetitions 112 | self.start_index_len = len(self.index) 113 | 114 | def __iter__(self): 115 | return self 116 | 117 | def __len__(self): 118 | return len(self.index) 119 | 120 | def index_file(self): 121 | bytes_counter = 0 122 | for line in self.source_file: 123 | bytes_counter += len(line) 124 | if len(line.strip()): 125 | self.index.append(bytes_counter-len(line)) 126 | if self.cache_index: 127 | open(self.filename+'.lineindex','w').write('\n'.join(str(i) for i in [os.path.getsize(self.filename)] + self.index)) 128 | 129 | def next(self): 130 | while len(self.index): 131 | offset = self.index.pop(random.randrange(0, len(self.index))) 132 | self.source_file.seek(offset, 0) 133 | return self.source_file.readline().strip() 134 | raise StopIteration 135 | 136 | def percentage(self): 137 | if len(self.index) == 0: 138 | return 100 139 | else: 140 | return 100 - int((float(len(self.index)) / self.start_index_len) * 100) #this is buggy 141 | 142 | def spin(text_input, unique_choices=False): 143 | seen_fields = {} 144 | if text_input.count('{') - text_input.count('}') == 1: 145 | text_input += '}' 146 | for _ in range(text_input.count('{')): 147 | field = re.search('{([^{}]*)}', text_input).group(0) 148 | 149 | if unique_choices: 150 | if field not in seen_fields: 151 | seen_fields[field] = field[1:-1].split('|') 152 | if len(seen_fields[field]): 153 | replacement = seen_fields[field].pop(random.randrange(0, len(seen_fields[field]))) 154 | else: 155 | replacement = '' 156 | else: 157 | replacement = random.choice(field[1:-1].split('|')) 158 | text_input = text_input.replace(field, replacement, 1) 159 | return text_input 160 | 161 | class HTTPResponse(object): 162 | def __init__(self, response=None, url=None, fake=False, http=None): 163 | self._xpath = None 164 | self._json = None 165 | #self._encoded_data = None #might cache encoded data again in future, for now don't see the point 166 | if fake: 167 | self.original_domain = urlparse.urlparse(url).netloc.lower() 168 | self.original_url = url 169 | self.final_url = url 170 | self.final_domain = self.original_domain 171 | self._data = '
<html><body>Hello!</body></html>
' 172 | else: 173 | self.headers = response.info() 174 | compressed_data = response.read() 175 | if filter(lambda (k,v): k.lower() == 'content-encoding' and v.lower() == 'gzip', self.headers.items()): 176 | self.headers['Content-type'] = 'text/html; charset=utf-8' 177 | self._data = gzip.GzipFile(fileobj=StringIO.StringIO(compressed_data)).read() 178 | else: 179 | self._data = compressed_data 180 | 181 | self.original_domain = urlparse.urlparse(url).netloc.lower() 182 | self.original_url = url 183 | self.final_url = response.geturl() 184 | self.final_domain = urlparse.urlparse(self.final_url).netloc.lower() 185 | 186 | if http: 187 | self.http = http 188 | 189 | def encoded_data(self): 190 | return unicode(self._data,'ISO-8859-1').encode('ISO-8859-1') 191 | 192 | def __str__(self): 193 | return self._data 194 | 195 | def __len__(self): 196 | return len(str(self)) 197 | 198 | def __contains__(self,x): 199 | return x.lower() in str(self).lower() 200 | 201 | def save(self, handle): 202 | if isinstance(handle, basestring): 203 | handle = open(handle, 'w') 204 | handle.write(str(self)) 205 | handle.close() 206 | 207 | def json(self): 208 | if not self._json: 209 | self._json = json.loads(self._data) 210 | return self._json 211 | 212 | def xpath(self,expression, xml=False): 213 | if self._xpath is None: 214 | if xml: 215 | self._xpath = etree.XML(self.encoded_data()) 216 | else: 217 | self._xpath = etree.HTML(self.encoded_data()) 218 | if self._xpath is None: 219 | return [] 220 | 221 | if not isinstance(expression,basestring): 222 | expression = '||'.join(expression) 223 | if '||' in expression: 224 | results = [] 225 | for part in expression.split('||'): 226 | results.append(self.xpath(part)) 227 | return zip(*results) 228 | 229 | results = [] 230 | original_expression = expression 231 | if expression.endswith('/string()'): 232 | expression = expression.split('/string()')[0] 233 | with gevent.Timeout(30, False): 234 | xpath_result = self._xpath.xpath(expression) 235 | if isinstance(xpath_result, basestring) or not isinstance(xpath_result, collections.Iterable): 236 | return xpath_result 237 | for result in xpath_result: 238 | if expression.endswith('@href') or expression.endswith('@src') or expression.endswith('@action'): 239 | if not result.startswith('http'): 240 | result = urlparse.urljoin(self.final_url,result) 241 | result = result.split('#')[0] 242 | if original_expression.endswith('/string()'): 243 | result = result.xpath('string()') 244 | if isinstance(result,basestring) and len(result.strip()): 245 | results.append(result.strip()) 246 | else: 247 | results.append(result) 248 | return list(results) 249 | 250 | 251 | def single_xpath(self,expression): 252 | results = self.xpath(expression) 253 | if isinstance(results,basestring) or not isinstance(results,collections.Iterable): 254 | return results 255 | if results: 256 | return results[0] 257 | else: 258 | return '' 259 | 260 | def links(self): 261 | return {link.split('#')[0] for link in self.xpath('//a/@href')} 262 | 263 | def internal_links(self): 264 | return {link for link in self.links() if urlparse.urlparse(link).netloc.lower() == self.final_domain if not link.split('.')[-1].lower() in EXCLUDED_LINK_EXTENSIONS} 265 | 266 | def external_links(self, exclude_subdomains=True): 267 | if exclude_subdomains: 268 | return {link for link in self.links() if max(self.final_domain.split('.'), key=len) not in urlparse.urlparse(link).netloc and link.lower().startswith('http') and link.lower().split('.')[-1] not in EXCLUDED_LINK_EXTENSIONS} 269 
| else: 270 | return {link for link in self.links() if urlparse.urlparse(link).netloc != self.final_domain and link.lower().startswith('http') and link.lower().split('.')[-1] not in EXCLUDED_LINK_EXTENSIONS} 271 | 272 | def dofollow_links(self): 273 | return set(self.xpath('//a[@rel!="nofollow" or not(@rel)]/@href')) 274 | 275 | def nofollow_links(self): 276 | return set(self.xpath('//a[@rel="nofollow"]/@href')) 277 | 278 | def external_images(self): 279 | return set([image for image in self.xpath('//img/@src') if urlparse.urlparse(image).netloc != self.final_domain]) 280 | 281 | def csv(self): 282 | return csv.reader(StringIO.StringIO(self.encoded_data())) 283 | 284 | def regex(self,expression): 285 | if not isinstance(expression,basestring): 286 | expression = '||'.join(expression) 287 | if '||' in expression: 288 | results = [] 289 | for part in expression.split('||'): 290 | results.append(self.regex(part)) 291 | return zip(*results) 292 | return re.compile(expression,re.S|re.I).findall(self.encoded_data()) 293 | 294 | def url_regex(self,expression): 295 | if not isinstance(expression,basestring): 296 | expression = '||'.join(expression) 297 | if '||' in expression: 298 | results = [] 299 | for part in expression.split('||'): 300 | results.append(self.url_regex(part)) 301 | return zip(*results) 302 | return re.compile(expression).findall(self.final_url) 303 | 304 | def __repr__(self): 305 | return '<HTTPResponse: %s>' % self.final_url 306 | 307 | def link_with_url(self, link, domain=False): 308 | if not isinstance(link, basestring): 309 | for l in link: 310 | result = self.link_with_url(l, domain=domain) 311 | if result is not False: 312 | return result 313 | if domain: 314 | link = urlparse.urlparse(link).netloc 315 | for l, l_obj in self.xpath('//a/@href||//a[@href]'): 316 | if domain: 317 | if urlparse.urlparse(l).netloc == link: 318 | return l_obj 319 | else: 320 | if link in (l,l+'/',l.rstrip('/')): 321 | return l_obj 322 | return False 323 | 324 | def link_with_anchor(self, anchor): 325 | if not isinstance(anchor, basestring): 326 | for a in anchor: 327 | result = self.link_with_anchor(a) 328 | if result is not False: 329 | return result 330 | results = self.xpath('//a[text()="%s"]' % anchor) 331 | if len(results): 332 | return results[0] 333 | return False 334 | 335 | def image_captcha(self,xpath): 336 | try: 337 | from captcha import DBC_USERNAME, DBC_PASSWORD 338 | except: 339 | pass 340 | image_source = self.single_xpath(xpath) 341 | if image_source: 342 | image = grab(image_source, http_obj=self.http) 343 | import deathbycaptcha 344 | result = deathbycaptcha.HttpClient(DBC_USERNAME, DBC_PASSWORD).decode(StringIO.StringIO(str(image))) 345 | if result: 346 | return result['text'] 347 | 348 | def recaptcha(self): 349 | iframe_source = self.single_xpath('//iframe[contains(@src,"recaptcha")]/@src') 350 | if iframe_source: 351 | iframe = grab(iframe_source,http_obj=self.http,ref=self.final_url) 352 | return (iframe.single_xpath('//input[@id="recaptcha_challenge_field"]/@value'),iframe.image_captcha('//center/img/@src')) 353 | 354 | def solvemedia(self): 355 | iframe_source = self.single_xpath('//iframe[contains(@src, "api.solvemedia.com")]/@src') 356 | if iframe_source: 357 | iframe = grab(iframe_source,http_obj=self.http,ref=self.final_url) 358 | response = iframe.image_captcha('//img[@id="adcopy-puzzle-image"]/@src') 359 | 360 | post = iframe.hidden_fields() 361 | post['adcopy_response'] = response 362 | 363 | submit_iframe = grab('http://api.solvemedia.com/papi/verify.noscript', http_obj=self.http, 
ref=iframe_source, post=post) 364 | 365 | if submit_iframe: 366 | if len(submit_iframe.regex('c=(.+?)"')): 367 | return (response, submit_iframe.regex('c=(.+?)"')[0]) 368 | else: 369 | return ('', '') 370 | else: 371 | return ('', '') 372 | 373 | def hidden_fields(self): 374 | fields = {} 375 | for name, value in self.xpath('//input[@type="hidden"]/@name||//input[@type="hidden"]/@value'): 376 | fields[name] = value 377 | return fields 378 | 379 | def view(self): 380 | p = tempfile.mktemp() + '.html' 381 | self.save(p) 382 | if sys.platform == 'darwin': subprocess.call(('open', p)) 383 | elif sys.platform == 'nt': os.startfile(p) #duno lol 384 | elif sys.platform.startswith('linux'): subprocess.call(('xdg-open', p)) 385 | 386 | class ProxyManager(object): 387 | def __init__(self, proxy=True, min_delay=20, max_delay=None): 388 | if isinstance(proxy,list): 389 | proxies = proxy 390 | elif proxy == True: 391 | try: 392 | proxies = open('proxies.txt').read().strip().split('\n') 393 | except: 394 | proxies = [None] 395 | elif isinstance(proxy, basestring): 396 | if proxy.startswith('http'): 397 | proxies = [p.strip() for p in str(grab(proxy)).strip().split('\n') if len(p.strip())] 398 | elif os.path.isfile(proxy): 399 | proxies = [p.strip() for p in open(proxy) if len(p.strip())] 400 | elif ':' in proxy: 401 | proxies = proxy.strip().split('\n') 402 | new_proxies = [] 403 | for proxy in proxies: 404 | if proxy.count(':') == 3: 405 | ip, port, username, password = proxy.split(':') 406 | proxy = username+':'+password+'@'+ip+':'+port 407 | new_proxies.append(proxy) 408 | proxies = new_proxies 409 | elif isinstance(proxy, ProxyManager): 410 | proxies = proxy.records.keys() 411 | else: 412 | proxies = [None] 413 | 414 | self.records = dict(zip(proxies,[0 for p in proxies])) 415 | self.min_delay = min_delay 416 | self.max_delay = max_delay or min_delay 417 | 418 | def get(self): 419 | while True: 420 | proxies = [proxy for proxy, proxy_time in self.records.items() if proxy_time + random.randint(self.min_delay, self.max_delay) < time.time()] 421 | if not proxies: 422 | gevent.sleep(time.time() - min(self.records.values())) 423 | else: 424 | proxy = random.sample(proxies, 1)[0] 425 | self.records[proxy] = int(time.time()) 426 | return proxy 427 | 428 | def __len__(self): 429 | return len(self.records) 430 | 431 | def split(self, number): 432 | chunk_size = len(self) / number 433 | managers = [] 434 | for i in range(number): 435 | if len(self) % chunk_size >= number - i: 436 | managers.append(ProxyManager(self.records.keys()[chunk_size*i:chunk_size*(i+1)+1], min_delay=self.min_delay, max_delay=self.max_delay)) 437 | else: 438 | managers.append(ProxyManager(self.records.keys()[chunk_size*i:chunk_size*(i+1)], min_delay=self.min_delay, max_delay=self.max_delay)) 439 | return managers 440 | 441 | class RedisProxyManager(ProxyManager): 442 | def __init__(self, name, proxy=True, min_delay=20, max_delay=None, host='localhost', port=6379): 443 | ProxyManager.__init__(self, proxy=proxy) 444 | import redis 445 | self.r = redis.Redis(host=host, port=port) 446 | self.name = name 447 | for record in self.records: 448 | if self.r.zrank(self.name, record) == None: 449 | self.r.zadd(self.name, record, 0) 450 | self.cache = [] 451 | self.caching = False 452 | self.fill_cache() 453 | 454 | def fill_cache(self): 455 | while True: 456 | if len(self.cache) != 0: 457 | return 458 | if self.caching: 459 | gevent.sleep(1) 460 | else: 461 | self.caching = True 462 | while True: 463 | proxies = self.r.zrangebyscore(self.name, 0, 
int(time.time()), start=1, num=max(10, len(self.records) / 5)) 464 | if proxies != 0 and len(proxies) != 0: 465 | print 'filling the proxy cache', len(proxies) 466 | for proxy in proxies: 467 | self.r.zadd(self.name, proxy, int(time.time()) + 300) #Yours for 5 minutes 468 | self.cache = proxies 469 | self.caching = False 470 | return 471 | else: 472 | gevent.sleep(1) 473 | 474 | def get(self): 475 | if not len(self.cache): 476 | self.fill_cache() 477 | proxy = self.cache.pop() 478 | self.last_proxy = proxy 479 | self.r.zadd(self.name, proxy, int(time.time()) + random.randint(self.min_delay, self.max_delay)) #about to be used, so set a normal time on it 480 | return proxy 481 | 482 | def available(self): 483 | return len(self.r.zrangebyscore(self.name, 0, int(time.time()))) + len(self.cache) 484 | 485 | def __len__(self): 486 | return self.r.zcard(self.name) 487 | 488 | 489 | class HeadRequest(urllib2.Request): 490 | def get_method(self): 491 | return 'HEAD' 492 | 493 | def useragent(): 494 | agents = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)','Mozilla/5.0 (X11; Arch Linux i686; rv:2.0) Gecko/20110321 Firefox/4.0','Mozilla/5.0 (Windows; U; Windows NT 6.1; ru; rv:1.9.2.3) Gecko/20100401 Firefox/4.0 (.NET CLR 3.5.30729)','Mozilla/5.0 (Windows NT 6.1; rv:2.0) Gecko/20110319 Firefox/4.0','Mozilla/5.0 (Windows NT 6.1; rv:1.9) Gecko/20100101 Firefox/4.0','Opera/9.20 (Windows NT 6.0; U; en)','Opera/9.00 (Windows NT 5.1; U; en)','Opera/9.64(Windows NT 5.1; U; en) Presto/2.1.1') 495 | return random.choice(agents) 496 | 497 | def encode_multipart_formdata(fields, files): 498 | ''' 499 | fields is a sequence of (name, value) elements for regular form fields. 
500 | files is a sequence of (name, filename, value) elements for data to be uploaded as files 501 | Return (content_type, body) ready for httplib.HTTP instance 502 | ''' 503 | BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' 504 | CRLF = '\r\n' 505 | L = [] 506 | for (key, value) in fields: 507 | L.append('--' + BOUNDARY) 508 | L.append('Content-Disposition: form-data; name="%s"' % key) 509 | L.append('') 510 | L.append(value) 511 | for (key, filename, value) in files: 512 | L.append('--' + BOUNDARY) 513 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) 514 | L.append('Content-Type: %s' % get_content_type(filename)) 515 | L.append('') 516 | L.append(value) 517 | L.append('--' + BOUNDARY + '--') 518 | L.append('') 519 | body = CRLF.join(L) 520 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 521 | return content_type, body 522 | 523 | def get_content_type(filename): 524 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 525 | 526 | class DisabledHTTPRedirectHandler(urllib2.HTTPRedirectHandler): 527 | def redirect_request(self, req, fp, code, msg, headers, newurl): 528 | req.get_full_url() 529 | raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 530 | 531 | class http(object): 532 | def __init__(self, proxy=None, cookie_filename=None, cookies=True, redirects=True): 533 | self.handlers = set() 534 | try: 535 | useragents = [ua.strip() for ua in open('useragents.txt') if len(ua.strip())] 536 | self.useragent = random.choice(useragents).strip() 537 | except: 538 | self.useragent = useragent() 539 | 540 | self.opener = urllib2.OpenerDirector() 541 | 542 | if cookies: 543 | self.cookie_jar = cookielib.LWPCookieJar() 544 | if cookie_filename: 545 | self.cookie_jar = cookielib.MozillaCookieJar(cookie_filename) 546 | self.cookie_jar.load() 547 | cookie_support = urllib2.HTTPCookieProcessor(self.cookie_jar) 548 | else: 549 | cookie_support = None 550 | 551 | self.proxy = False 552 | proxy_auth = None 553 | 554 | if proxy: 555 | if isinstance(proxy, ProxyManager): 556 | self.proxy = proxy.get() 557 | else: 558 | self.proxy = ProxyManager(proxy).get() 559 | #print 'proxy in http = ', self.proxy 560 | 561 | if self.proxy: 562 | self.proxy = self.proxy.strip() 563 | proxy_support = urllib2.ProxyHandler({'http' : self.proxy,'https':self.proxy}) 564 | if '@' in self.proxy: 565 | proxy_auth = urllib2.HTTPBasicAuthHandler() 566 | else: 567 | proxy_auth = None 568 | else: 569 | proxy_support = None 570 | 571 | if not redirects: 572 | self.build_opener(DisabledHTTPRedirectHandler()) 573 | 574 | self.build_opener(proxy_support,cookie_support,proxy_auth) 575 | 576 | def build_opener(self,*handlers): 577 | self.handlers |= set([handler for handler in handlers if handler is not None]) 578 | self.opener = urllib2.build_opener(*self.handlers) 579 | 580 | def urlopen(self, url, post=None, ref=None, files=None, username=None, password=None, compress=True, head=False, timeout=30): 581 | assert url.lower().startswith('http') 582 | if isinstance(post, basestring): 583 | post = dict([part.split('=') for part in post.strip().split('&')]) 584 | if post: 585 | for k, v in post.items(): 586 | post[k] = spin(unicode(v).encode('utf-8')) 587 | if username and password: 588 | password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 589 | password_manager.add_password(None, url, username, password) 590 | password_auth = urllib2.HTTPBasicAuthHandler(password_manager) 591 | self.build_opener(password_auth) 592 | 
urllib2.install_opener(self.opener) 593 | if compress: 594 | headers = {'User-Agent' : self.useragent, 'Accept-encoding' : 'gzip'} 595 | else: 596 | headers = {'User-Agent' : self.useragent} 597 | if ref: 598 | headers['Referer'] = ref 599 | if files: 600 | content_type,post = encode_multipart_formdata(post.items(), files) 601 | headers['content-type'] = content_type 602 | headers['content-length'] = str(len(post)) 603 | elif post: 604 | post = urllib.urlencode(post) 605 | if head: 606 | req = HeadRequest(url, post, headers) 607 | else: 608 | req = urllib2.Request(url, post, headers) 609 | with gevent.Timeout(timeout): 610 | response = urllib2.urlopen(req) 611 | return HTTPResponse(response, url, http=self) 612 | 613 | def grab(url, proxy=None, post=None, ref=None, compress=True, retries=1, http_obj=None, cookies=False, redirects=True, timeout=30): 614 | data = None 615 | if retries < 1: 616 | retries = 1 617 | for i in range(retries): 618 | if not http_obj: 619 | http_obj = http(proxy, cookies=cookies, redirects=redirects) 620 | try: 621 | data = http_obj.urlopen(url=url, post=post, ref=ref, compress=compress, timeout=timeout) 622 | break 623 | except urllib2.HTTPError, e: 624 | if str(e.code).startswith('3') and not redirects: 625 | data = HTTPResponse(url=url, fake=True) 626 | break 627 | except: 628 | pass 629 | if data: 630 | return data 631 | return False 632 | 633 | class RedisQueue(object): 634 | def __init__(self, name, host='localhost', port=6379): 635 | import redis 636 | self.r = redis.Redis(host=host, port=port) 637 | self.name = name 638 | 639 | def put(self, item): 640 | self.r.sadd(self.name, item) 641 | 642 | def get(self, timeout=60): 643 | timeout_counter = 0 644 | while True: 645 | result = self.r.spop(self.name) 646 | if result == None: 647 | if timeout == timeout_counter: 648 | return None 649 | gevent.sleep(1) 650 | timeout_counter += 1 651 | else: 652 | return result 653 | 654 | def get_nowait(self): 655 | return self.get() 656 | 657 | def __len__(self): 658 | return self.r.scard(self.name) 659 | 660 | def empty(self): 661 | return len(self) == 0 662 | 663 | def WebQueue(iterator=None): 664 | queue = Queue.Queue() 665 | if iterator: 666 | [queue.put(item) for item in iterator] 667 | return queue 668 | 669 | def generic_iterator(iterator): 670 | if isinstance(iterator, basestring): 671 | if '\n' in iterator: 672 | for i in iterator.split('\n'): 673 | if len(i.strip()): 674 | yield i.strip() 675 | else: 676 | yield iterator.strip() 677 | else: 678 | for i in iterator: 679 | yield i 680 | 681 | class DomainQueue(object): 682 | def __init__(self, urls): 683 | self.domains = collections.defaultdict(list) 684 | for url in urls: 685 | if isinstance(url, basestring): 686 | url = urlparse.urlparse(url) 687 | self.domains[url.netloc].append(url.geturl()) 688 | self.counter = {domain:0 for domain in self.domains.keys()} 689 | 690 | def empty(self): 691 | return len(self.domains) == 0 692 | 693 | def get_nowait(self): 694 | domain = min(self.counter, key=self.counter.get) 695 | url = self.domains[domain].pop() 696 | if len(self.domains[domain]) == 0: 697 | del(self.domains[domain]) 698 | del(self.counter[domain]) 699 | else: 700 | self.counter[domain] += 1 701 | return url 702 | 703 | def get(self): 704 | return self.get_nowait() 705 | 706 | def put(self, url): 707 | if isinstance(url, basestring): 708 | url = urlparse.urlparse(url) 709 | self.domains[url.netloc].append(url.geturl()) 710 | if url.netloc not in self.counter: 711 | self.counter[url.netloc] = 0 712 | 713 | def 
__len__(self): 714 | return sum((len(d) for d in self.domains.values())) 715 | 716 | def multi_grab(urls, pool_size=100, timeout=30, max_pages=-1, queuify=True, proxy=None): 717 | if queuify: 718 | in_q = WebQueue(generic_iterator(urls)) 719 | else: 720 | in_q = urls 721 | for result_counter, result in enumerate(pooler(grab, in_q, pool_size=pool_size, timeout=timeout, proxy=proxy)): 722 | yield result 723 | if result_counter == max_pages and max_pages > 0: 724 | break 725 | 726 | def domain_crawl(urls, pool_size=100, timeout=30, max_pages=-1, link_filter=None, max_domain_pages=-1): 727 | urls = {url for url in generic_iterator(urls)} 728 | domains = {urlparse.urlparse(url).netloc for url in urls} 729 | domain_counter = collections.Counter() 730 | seen_urls = set(urls) 731 | while True: 732 | if not len(urls): 733 | break 734 | urls_queue = Queue.Queue() 735 | [urls_queue.put(url) for url in urls] 736 | urls = set() 737 | for page_counter, page in enumerate(multi_grab(urls_queue, pool_size, timeout, queuify=False)): 738 | if page.final_domain in domains: 739 | domain_counter[page.final_domain] += 1 740 | if not (domain_counter[page.final_domain] > max_domain_pages and max_domain_pages > 0): 741 | try: 742 | new_urls = {url for url in page.internal_links() if url not in seen_urls} 743 | if callable(link_filter): 744 | new_urls = {url for url in new_urls if link_filter(url)} 745 | urls |= new_urls 746 | seen_urls |= new_urls 747 | except: 748 | pass 749 | if max_pages > 0 and page_counter > max_pages: 750 | break 751 | yield page 752 | 753 | 754 | def redirecturl(url, proxy=None): 755 | return http(proxy).urlopen(url, head=True).geturl() 756 | 757 | def cloud_pooler(func, in_q, chunk_size=1000, _env='python-web', _type='c2', _max_runtime=60, get_results=True, **kwargs): 758 | import cloud 759 | if chunk_size > 1: 760 | if isinstance(in_q, collections.Iterable): 761 | in_q = WebQueue(in_q) 762 | chunks = [] 763 | chunk = [] 764 | while not in_q.empty(): 765 | chunk.append(in_q.get()) 766 | if len(chunk) == chunk_size: 767 | chunks.append(chunk) 768 | chunk = [] 769 | if len(chunk): 770 | chunks.append(chunk) 771 | else: 772 | chunks = in_q 773 | 774 | partial_func = functools.partial(func, **kwargs) 775 | jids = cloud.map(partial_func, chunks, _env=_env, _type=_type, _max_runtime=_max_runtime) 776 | 777 | if get_results: 778 | print jids 779 | for result in cloud.iresult(jids, ignore_errors=True): 780 | if result: 781 | yield result 782 | else: 783 | for jid in jids: 784 | yield jid 785 | 786 | def pooler(func, in_q, pool_size=100, proxy=False, max_results=0, **kwargs): 787 | if isinstance(in_q, collections.Iterable): 788 | in_q = WebQueue(in_q) 789 | out_q = multiprocessing.Queue() 790 | if proxy and not isinstance(proxy, ProxyManager): 791 | proxy = ProxyManager(proxy) 792 | 793 | p = pool.Pool(pool_size) 794 | greenlets = set() 795 | if proxy: 796 | kwargs['proxy'] = proxy 797 | result_counter = 0 798 | while True: 799 | #print len(greenlets), 'greenlets' 800 | finished_greenlets = {g for g in greenlets if g.value != None} 801 | greenlets -= finished_greenlets 802 | for g in finished_greenlets: 803 | if g.value != False: 804 | yield g.value 805 | result_counter += 1 806 | if max_results > 0 and result_counter >= max_results: 807 | break 808 | if len(greenlets) > pool_size: 809 | print 'uhoh, greenlets are getting stuck', len(greenlets) 810 | if len(greenlets) < pool_size: 811 | try: 812 | i = in_q.get_nowait() 813 | except: 814 | break 815 | if not isinstance(i, dict): 816 | i = 
{inspect.getargspec(func).args[0]: i} 817 | kwargs = dict(kwargs.items() + i.items()) 818 | greenlets.add(p.spawn(func, **kwargs)) 819 | else: 820 | time.sleep(1) 821 | 822 | 823 | p.join() 824 | for g in greenlets: 825 | if g.value: 826 | yield g.value 827 | --------------------------------------------------------------------------------
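
A minimal usage sketch (not part of the repository): it assumes web.py is importable from the working directory and that the example URLs are reachable. grab, multi_grab, pooler and HTTPResponse are the helpers defined in web.py above; the script name, the URLs and the page_title helper are illustrative only.

# example_usage.py -- hypothetical demo script, not shipped with the repo
import web

# Fetch a single page; grab() returns an HTTPResponse, or False once retries are exhausted.
page = web.grab('http://example.com/', retries=2)
if page:
    print page.single_xpath('//title/string()')        # text of the first <title> element
    print len(page.internal_links()), 'internal links'

# Fetch several URLs concurrently on a gevent pool; pass proxy=True to rotate through proxies.txt.
urls = ['http://example.com/', 'http://example.org/']
for response in web.multi_grab(urls, pool_size=10, timeout=15):
    print response.final_url, len(response)

# pooler() maps a single-argument function over an iterable of inputs.
def page_title(url):
    result = web.grab(url)
    # return False so pooler() silently drops failed fetches
    return result.single_xpath('//title/string()') if result else False

for title in web.pooler(page_title, urls, pool_size=5):
    print title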