├── .gitignore ├── MANIFEST.in ├── README.rst ├── __init__.py ├── adt.py ├── alg.py ├── async.py ├── common.py ├── docs ├── Makefile ├── conf.py ├── examples.rst ├── index.rst ├── introduction.rst └── reference.rst ├── download.py ├── pdict.py ├── settings.py ├── setup.py ├── webkit.py └── xpath.py /.gitignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | *.pyc 3 | *~ 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Documentation is hosted at 2 | `docs.webscraping.com `__. 3 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | Website: 3 | http://code.google.com/p/webscraping/ 4 | 5 | License: 6 | LGPL 7 | """ 8 | 9 | if __name__ == '__main__': 10 | import doctest 11 | for name in ['adt', 'alg', 'common', 'download', 'pdict', 'settings', 'webkit', 'xpath']: 12 | module = __import__(name) 13 | print name 14 | print doctest.testmod(module) 15 | -------------------------------------------------------------------------------- /adt.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'High level abstract datatypes' 2 | 3 | from datetime import datetime, timedelta 4 | from collections import defaultdict, deque 5 | try: 6 | import pybloom 7 | except ImportError: 8 | pass 9 | 10 | 11 | class Bag(dict): 12 | """Dictionary object with attribute like access 13 | 14 | >>> b = Bag() 15 | >>> b.name = 'company' 16 | >>> b.name 17 | 'company' 18 | >>> b.address 19 | """ 20 | def __init__(self, *args, **kwargs): 21 | dict.__init__(self, *args, **kwargs) 22 | 23 | def __getattr__(self, name): 24 | return self.get(name) 25 | 26 | def __setattr__(self, name, value): 27 | self[name] = value 28 | 29 | 30 | class HashDict: 31 | """For storing large quantities of keys where don't need the original value of the key 32 | Instead each key is hashed and hashes are compared for equality 33 | 34 | >>> hd = HashDict() 35 | >>> url = 'http://webscraping.com' 36 | >>> hd[url] = True 37 | >>> url in hd 38 | True 39 | >>> 'other url' in hd 40 | False 41 | >>> len(hd) 42 | 1 43 | """ 44 | def __init__(self, default_factory=str): 45 | self.d = defaultdict(default_factory) 46 | 47 | def __len__(self): 48 | """How many keys are stored in the HashDict 49 | """ 50 | return self.d.__len__() 51 | 52 | def __contains__(self, name): 53 | return self.d.__contains__(self.get_hash(name)) 54 | 55 | def __getitem__(self, name): 56 | return self.d.__getitem__(self.get_hash(name)) 57 | 58 | def __setitem__(self, name, value): 59 | return self.d.__setitem__(self.get_hash(name), value) 60 | 61 | def add(self, name): 62 | self[name] = True 63 | 64 | def get(self, name, default=None): 65 | """Get the value at this key 66 | 67 | Returns default if key does not exist 68 | """ 69 | return self.d.get(self.get_hash(name), default) 70 | 71 | def get_hash(self, value): 72 | """get the hash value of this value 73 | """ 74 | return hash(value) 75 | 76 | 77 | class Bloom: 78 | """A bloom filter is a space efficient way to tell if an element is in a set. 
79 | False positive are possible - set by err rate - but false negatives are not. 80 | """ 81 | def __init__(self, start_items=10000, err_rate=0.0001): 82 | self.bloom = pybloom.ScalableBloomFilter(10000, err, 4) 83 | 84 | def __contains__(self, key): 85 | return key in self.bloom 86 | 87 | def add(self, key): 88 | return self.bloom.add(key) 89 | -------------------------------------------------------------------------------- /alg.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'High level functions for interpreting useful data from input' 2 | 3 | import csv, logging, math, os, random, re 4 | import common, xpath 5 | 6 | 7 | def get_excerpt(html, try_meta=False, max_chars=255): 8 | """Extract excerpt from this HTML by finding the largest text block 9 | 10 | try_meta: 11 | indicates whether to try extracting from meta description tag 12 | max_chars: 13 | the maximum number of characters for the excerpt 14 | """ 15 | # try extracting meta description tag 16 | excerpt = '' 17 | if try_meta: 18 | excerpt = xpath.get(html, '/html/head/meta[@name="description"]/@content') 19 | if not excerpt: 20 | # remove these tags and then find biggest text block 21 | bad_tags = 'hr', 'br', 'script', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' 22 | content = common.remove_tags(xpath.get(html, '/html/body', remove=bad_tags)) 23 | if content: 24 | excerpt = max((len(p.strip()), p) for p in content.splitlines())[1] 25 | return common.unescape(excerpt.strip())[:max_chars] 26 | 27 | 28 | IGNORED_EMAILS = 'username@location.com', 'johndoe@domain.com' 29 | def extract_emails(html, ignored=IGNORED_EMAILS): 30 | """Remove common obfuscations from HTML and then extract all emails 31 | 32 | ignored: 33 | list of dummy emails to ignore 34 | 35 | >>> extract_emails('') 36 | [] 37 | >>> extract_emails('hello contact@webscraping.com world') 38 | ['contact@webscraping.com'] 39 | >>> extract_emails('hello contact@webscraping.com world') 40 | ['contact@webscraping.com'] 41 | >>> extract_emails('hello contact AT webscraping DOT com world') 42 | ['contact@webscraping.com'] 43 | >>> extract_emails(' info+hn@gmail.com ') 44 | ['info+hn@gmail.com'] 45 | >>> extract_emails('Contact') 46 | ['first.last@mail.co.uk'] 47 | """ 48 | emails = [] 49 | if html: 50 | email_re = re.compile('([\w\.\-\+]{1,64})@(\w[\w\.-]{1,255})\.(\w+)') 51 | # remove comments, which can obfuscate emails 52 | html = re.compile('', re.DOTALL).sub('', html).replace('mailto:', '') 53 | for user, domain, ext in email_re.findall(html): 54 | if ext.lower() not in common.MEDIA_EXTENSIONS and len(ext)>=2 and not re.compile('\d').search(ext) and domain.count('.')<=3: 55 | email = '%s@%s.%s' % (user, domain, ext) 56 | if email not in emails: 57 | emails.append(email) 58 | 59 | # look for obfuscated email 60 | for user, domain, ext in re.compile('([\w\.\-\+]{1,64})\s?.?AT.?\s?([\w\.-]{1,255})\s?.?DOT.?\s?(\w+)', re.IGNORECASE).findall(html): 61 | if ext.lower() not in common.MEDIA_EXTENSIONS and len(ext)>=2 and not re.compile('\d').search(ext) and domain.count('.')<=3: 62 | email = '%s@%s.%s' % (user, domain, ext) 63 | if email not in emails: 64 | emails.append(email) 65 | return [email for email in emails if email not in ignored] 66 | 67 | 68 | def extract_phones(html): 69 | """Extract phone numbers from this HTML 70 | 71 | >>> extract_phones('Phone: (123) 456-7890
<br />') 72 | ['(123) 456-7890'] 73 | >>> extract_phones('Phone 123.456.7890 ') 74 | ['123.456.7890'] 75 | >>> extract_phones('+1-123-456-7890<br />
123 456 7890n') 76 | ['123-456-7890', '123 456 7890'] 77 | >>> extract_phones('456-7890') 78 | [] 79 | >>> extract_phones('Contact') 80 | ['0234673460'] 81 | """ 82 | return [match.group() for match in re.finditer('(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}', html)] + re.findall('tel:(\d+)', html) 83 | 84 | 85 | def parse_us_address(address): 86 | """Parse USA address into address, city, state, and zip code 87 | 88 | >>> parse_us_address('6200 20th Street, Vero Beach, FL 32966') 89 | ('6200 20th Street', 'Vero Beach', 'FL', '32966') 90 | """ 91 | city = state = zipcode = '' 92 | addrs = map(lambda x:x.strip(), address.split(',')) 93 | if addrs: 94 | m = re.compile('([A-Z]{2,})\s*(\d[\d\-\s]+\d)').search(addrs[-1]) 95 | if m: 96 | state = m.groups()[0].strip() 97 | zipcode = m.groups()[1].strip() 98 | 99 | if len(addrs)>=3: 100 | city = addrs[-2].strip() 101 | address = ','.join(addrs[:-2]) 102 | else: 103 | address = ','.join(addrs[:-1]) 104 | 105 | return address, city, state, zipcode 106 | 107 | 108 | def get_earth_radius(scale): 109 | if scale is None: 110 | return 1.0 111 | elif scale == 'km': 112 | return 6373.0 113 | elif scale == 'miles': 114 | return 3960.0 115 | else: 116 | raise common.WebScrapingError('Invalid scale: %s' % str(scale)) 117 | 118 | 119 | def distance(p1, p2, scale=None): 120 | """Calculate distance between 2 (latitude, longitude) points. 121 | 122 | scale: 123 | By default the distance will be returned as a ratio of the earth's radius 124 | Use 'km' to return distance in kilometres, 'miles' to return distance in miles 125 | 126 | >>> melbourne = -37.7833, 144.9667 127 | >>> san_francisco = 37.7750, -122.4183 128 | >>> int(distance(melbourne, san_francisco, 'km')) 129 | 12659 130 | """ 131 | if p1 == p2: 132 | return 0 133 | lat1, long1 = p1 134 | lat2, long2 = p2 135 | # Convert latitude and longitude to 136 | # spherical coordinates in radians. 137 | degrees_to_radians = math.pi / 180.0 138 | 139 | # phi = 90 - latitude 140 | phi1 = (90.0 - lat1)*degrees_to_radians 141 | phi2 = (90.0 - lat2)*degrees_to_radians 142 | 143 | # theta = longitude 144 | theta1 = long1*degrees_to_radians 145 | theta2 = long2*degrees_to_radians 146 | 147 | # Compute spherical distance from spherical coordinates. 
148 | 149 | # For two locations in spherical coordinates 150 | # (1, theta, phi) and (1, theta, phi) 151 | # cosine( arc length ) = 152 | # sin phi sin phi' cos(theta-theta') + cos phi cos phi' 153 | # distance = rho * arc length 154 | 155 | cos = (math.sin(phi1)*math.sin(phi2)*math.cos(theta1 - theta2) + math.cos(phi1)*math.cos(phi2)) 156 | arc = math.acos(cos) 157 | return arc * get_earth_radius(scale) 158 | 159 | 160 | def find_coordinates(ch_lat=100, ch_lng=100, ch_scale='miles', min_lat=-90, max_lat=90, min_lng=-180, max_lng=180): 161 | """Find all latitude/longitude coordinates within bounding box, with given increments 162 | """ 163 | cur_lat = min_lat 164 | while cur_lat < max_lat: 165 | cur_lng = min_lng 166 | while cur_lng < max_lng: 167 | yield cur_lat, cur_lng 168 | _, cur_lng = move_coordinate(cur_lat, cur_lng, 0, ch_lng, ch_scale) 169 | cur_lat, _ = move_coordinate(cur_lat, cur_lng, ch_lat, 0, ch_scale) 170 | 171 | 172 | def move_coordinate(lat, lng, ch_lat, ch_lng, ch_scale=None): 173 | """Move latitude/longitude coordinate a given increment 174 | """ 175 | r_earth = get_earth_radius(ch_scale) 176 | new_lat = lat + (ch_lat / r_earth) * (180 / math.pi); 177 | new_lng = lng + (ch_lng / r_earth) * (180 / math.pi) / math.cos(lat * math.pi/180.0) 178 | return new_lat, new_lng 179 | 180 | 181 | def get_zip_codes(filename, min_distance=100, scale='miles', lat_key='Latitude', lng_key='Longitude', zip_key='Zip'): 182 | """Reads CSV file of zip,lat,lng and returns zip codes that aren't within the minimum distance of each other 183 | """ 184 | for zip_code, lat, lng in get_zip_lat_lngs(filename, min_distance, scale, lat_key, lng_key, zip_key): 185 | yield zip_code 186 | 187 | def get_zip_lat_lngs(filename, min_distance=100, scale='miles', lat_key='Latitude', lng_key='Longitude', zip_key='Zip'): 188 | if min_distance > 0: 189 | locations = [] 190 | for record in csv.DictReader(open(filename)): 191 | lat, lng = float(record[lat_key]), float(record[lng_key]) 192 | for other_lat, other_lng in locations: 193 | if distance((lat, lng), (other_lat, other_lng), scale=scale) < min_distance: 194 | break 195 | else: 196 | locations.append((lat, lng)) 197 | yield record[zip_key], record[lat_key], record[lng_key] 198 | else: 199 | for record in csv.DictReader(open(filename)): 200 | yield record[zip_key], record[lat_key], record[lng_key] 201 | 202 | 203 | def find_json_path(e, value, path=''): 204 | """Find the JSON path that points to this value 205 | """ 206 | results = [] 207 | if e == value: 208 | results.append(path) 209 | if isinstance(e, dict): 210 | for k, v in e.items(): 211 | key_path = '{}["{}"]'.format(path, k) 212 | results.extend(find_json_path(v, value, key_path)) 213 | elif isinstance(e, list): 214 | for i, v in enumerate(e): 215 | index_path = '{}[{}]'.format(path, i) 216 | results.extend(find_json_path(v, value, index_path)) 217 | return results 218 | 219 | 220 | # support to generate a random user agent 221 | 222 | # the operating system templates 223 | def linux_os(): 224 | dist = random.choice(['', ' U;', ' Ubuntu;']) 225 | system = random.choice(['', ' x86_64', ' i686']) 226 | return 'X11;%s Linux%s' % (dist, system) 227 | 228 | 229 | def osx_os(): 230 | return 'Macintosh; Intel Mac OS X 10.%d' % random.randint(6, 9) 231 | 232 | 233 | def windows_os(): 234 | system = random.choice(['', '; Win64; x64', '; WOW64']) 235 | return 'Windows NT %d.%d%s' % (random.randint(5, 6), random.randint(0, 2), system) 236 | 237 | 238 | def rand_os(): 239 | return random.choice([linux_os, osx_os, 
windows_os])() 240 | 241 | # the browser templates 242 | def firefox_browser(os_version): 243 | browser_version = random.randint(20, 25) 244 | return 'Mozilla/5.0 (%s; rv:%d.0) Gecko/20100101 Firefox/%d.0' % (os_version, browser_version, browser_version) 245 | 246 | def ie_browser(os_version=None): 247 | os_version = windows_os() # always use windows with IE 248 | return 'Mozilla/5.0 (compatible; MSIE %d.0; %s; Trident/%d.0)' % (random.randint(8, 10), os_version, random.randint(5, 6)) 249 | 250 | def chrome_browser(os_version): 251 | return 'Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%d.0.%d.%d Safari/537.36' % (os_version, random.randint(28, 32), random.randint(1464, 1667), random.randint(0, 9)) 252 | 253 | 254 | def rand_agent(): 255 | """Returns a random user agent across Firefox, IE, and Chrome on Linux, OSX, and Windows 256 | """ 257 | browser = random.choice([firefox_browser, ie_browser, chrome_browser]) 258 | return browser(rand_os()) 259 | 260 | 261 | -------------------------------------------------------------------------------- /async.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'Helper methods to download and crawl web content using threads' 2 | 3 | import sys 4 | import time 5 | import cookielib 6 | import base64 7 | import signal 8 | import urlparse 9 | import collections 10 | 11 | from twisted.internet import reactor, defer, protocol, endpoints 12 | from twisted.web import client, error, http, http_headers 13 | from twisted.python import failure, log 14 | 15 | import adt, common, download, settings 16 | 17 | 18 | """ 19 | TODO 20 | - support for POST 21 | - efficient get request callback 22 | """ 23 | 24 | 25 | def threaded_get(**kwargs): 26 | """Download using asynchronous single threaded twisted callbacks 27 | """ 28 | tc = TwistedCrawler(**kwargs) 29 | tc.start() 30 | 31 | 32 | class TwistedCrawler: 33 | def __init__(self, url=None, urls=None, url_iter=None, num_threads=20, cb=None, depth=True, max_errors=None, pattern=None, **kwargs): 34 | self.settings = adt.Bag( 35 | read_cache = True, 36 | write_cache = True, 37 | num_redirects = 5, 38 | num_retries = 2, 39 | timeout = 20, 40 | headers = {}, 41 | num_threads = num_threads, 42 | cb = cb, 43 | url_iter = url_iter, 44 | depth = depth, 45 | pattern = pattern 46 | ) 47 | self.settings.update(**kwargs) 48 | self.D = download.Download(**kwargs) 49 | self.kwargs = kwargs 50 | # queue of html to be written to cache 51 | self.cache_queue = [] 52 | # URL's that are waiting to download 53 | self.download_queue = collections.deque() 54 | if urls: 55 | self.download_queue.extend(urls) 56 | if url: 57 | self.download_queue.append(url) # XXX create compressed dict data type for large in memory? 
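        # note: crawl() pops pending URLs from the right of this deque when
        # self.settings.depth is True (LIFO, depth-first crawl) and from the
        # left otherwise (FIFO, breadth-first crawl)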
58 | # URL's currently downloading 59 | self.processing = {} 60 | # defereds that are downloading 61 | self.downloading = [] 62 | # URL's that have been found before 63 | self.found = adt.HashDict() 64 | for url in self.download_queue: 65 | self.found[url] = True 66 | self.state = download.State() 67 | self.max_errors = max_errors 68 | self.num_errors = 0 # counter for the number of subsequent errors 69 | 70 | 71 | def start(self): 72 | """Start the twisted event loop 73 | """ 74 | # catch ctrl-c keyboard event and stop twisted 75 | signal.signal(signal.SIGINT, self.kill) 76 | self.running = True 77 | reactor.callWhenRunning(self.crawl) 78 | reactor.run() 79 | 80 | 81 | def stop(self): 82 | """Stop the twisted event loop 83 | """ 84 | if self.running: 85 | common.logger.info('Twisted eventloop shutting down') 86 | self.running = False 87 | self.state.save() 88 | reactor.stop() 89 | 90 | 91 | def kill(self, *ignore): 92 | """Exit the script 93 | """ 94 | for d in self.downloading: 95 | d.cancel() 96 | self.stop() 97 | sys.exit() 98 | 99 | 100 | def is_finished(self): 101 | """Call finish callback in case more processing to do 102 | """ 103 | for url in self.settings.url_iter or []: 104 | self.download_queue.append(url) 105 | return False 106 | return True 107 | 108 | 109 | def crawl(self): 110 | """Crawl more URLs if available 111 | """ 112 | if self.download_queue or self.processing or self.cache_queue or not self.is_finished(): 113 | #print 'Running: %d, queue: %d, cache: %d, processing: %d, threads: %d' % (self.running, len(self.download_queue), len(self.cache_queue), len(self.processing), self.settings.num_threads) 114 | while self.running and self.download_queue and len(self.processing) < self.settings.num_threads: 115 | url = str(self.download_queue.pop() if self.settings.depth else self.download_queue.popleft()) 116 | self.processing[url] = '' 117 | downloaded = False 118 | if self.D.cache and self.settings.read_cache: 119 | key = self.D.get_key(url, self.settings.data) 120 | try: 121 | html = self.D.cache[key] 122 | except KeyError: 123 | pass 124 | else: 125 | # html is available so scrape this directly 126 | if self.D.invalid_response(html, self.settings.pattern): 127 | # invalid result from download 128 | html = '' 129 | if html or self.settings.num_retries == 0: 130 | reactor.callLater(0, self.scrape, url, html) 131 | downloaded = True 132 | 133 | if downloaded: 134 | # record cache load 135 | self.state.update(num_caches=1) 136 | else: 137 | # need to download this new URL 138 | self.download_start(url) 139 | self.state.update(queue_size=len(self.download_queue)) 140 | 141 | # XXX test inactive 142 | try: 143 | self.inactive_call.cancel() 144 | except AttributeError: 145 | pass # not defined yet 146 | self.inactive_call = reactor.callLater(5*60, self.inactive) 147 | # XXX 148 | 149 | if self.running: 150 | reactor.callLater(0, self.cache_downloads) 151 | reactor.callLater(0, self.crawl) 152 | else: 153 | # save the final state and exit 154 | self.stop() 155 | 156 | 157 | def inactive(self): 158 | common.logger.error('crawler inactive') 159 | common.logger.error('queue (%d): %s' % (len(self.download_queue), ', '.join(self.download_queue))) 160 | common.logger.error('processing (%d): %s' % (len(self.processing), ', '.join(self.processing))) 161 | self.stop() 162 | 163 | 164 | def download_start(self, url, num_retries=0, redirects=None, proxy=None): 165 | """Start URL download 166 | """ 167 | redirects = redirects or [] 168 | redirects.append(url) 169 | if not proxy: 170 | proxy = 
self.D.get_proxy() 171 | self.processing[redirects[0]] = proxy 172 | 173 | headers = {} 174 | headers['User-Agent'] = [self.settings.get('user_agent', self.D.get_user_agent(proxy))] 175 | for name, value in self.settings.headers.items() + settings.default_headers.items(): 176 | if name not in headers: 177 | if not value: 178 | if name == 'Referer': 179 | value = url 180 | headers[name] = [value] 181 | agent = self.build_agent(proxy, headers) 182 | data = None 183 | d = agent.request('GET', url, http_headers.Headers(headers), data) 184 | d.addCallback(self.download_headers, url, num_retries, redirects) 185 | d.addErrback(self.download_error, redirects[0]) 186 | d.addErrback(log.err) 187 | 188 | # timeout to stop download if hangs 189 | timeout_call = reactor.callLater(self.settings.timeout, self.download_timeout, d, url) 190 | self.downloading.append(d) 191 | 192 | def completed(ignore): 193 | # remove timeout callback on completion 194 | if timeout_call.active(): 195 | timeout_call.cancel() 196 | self.downloading.remove(d) 197 | d.addBoth(completed) 198 | 199 | 200 | def download_headers(self, response, url, num_retries, redirects): 201 | """Headers have been returned from download 202 | """ 203 | common.logger.info('Downloading ' + url) 204 | finished = defer.Deferred() 205 | # XXX how to ignore processing body for errors? 206 | response.deliverBody(DownloadPrinter(finished)) 207 | if self.handle_redirect(url, response, num_retries, redirects): 208 | # redirect handled 209 | pass 210 | elif 400 <= response.code < 500: 211 | raise TwistedError(response.phrase) 212 | elif 500 <= response.code < 600: 213 | # server error so try again 214 | message = '%s (%d)' % (response.phrase, response.code) 215 | self.handle_retry(url, message, num_retries, redirects) 216 | elif self.running: 217 | # handle download 218 | finished.addCallbacks(self.download_complete, self.download_error, 219 | callbackArgs=[num_retries, redirects], errbackArgs=[redirects[0]] 220 | ) 221 | finished.addErrback(self.download_error, redirects[0]) 222 | 223 | 224 | def download_complete(self, html, num_retries, redirects): 225 | """Body has completed downloading 226 | """ 227 | redirect_url = download.get_redirect(redirects[0], html) 228 | if redirect_url: 229 | # meta redirect 230 | proxy = self.processing[redirects[0]] 231 | reactor.callLater(0, self.download_start, redirect_url, 0, redirects, proxy) 232 | elif self.D.invalid_response(html, self.settings.pattern): 233 | # invalid result from download 234 | message = 'Content did not match expected pattern' 235 | self.handle_retry(redirects[0], message, num_retries, redirects) 236 | 237 | else: 238 | # successful download 239 | self.num_errors = 0 240 | self.state.update(num_downloads=1) 241 | if self.D.cache and self.settings.write_cache: 242 | self.cache_queue.append((redirects, html)) 243 | reactor.callLater(0, self.scrape, redirects[0], html) 244 | 245 | 246 | def download_timeout(self, d, url): 247 | """Catch timeout error and cancel request 248 | """ 249 | self.downloading.remove(d) 250 | d.cancel() 251 | 252 | 253 | def download_error(self, reason, url): 254 | """Error received during download 255 | """ 256 | # XXX how to properly pass error from download timeout cancel 257 | error = reason.getErrorMessage() or 'Download timeout' 258 | common.logger.warning('Download error: %s: %s' % (error, url)) 259 | self.state.update(num_errors=1) 260 | if self.D.cache and self.settings.write_cache: 261 | self.cache_queue.append((url, '')) 262 | del self.processing[url] 263 | # 
check whether to give up the crawl 264 | self.num_errors += 1 265 | if self.max_errors is not None: 266 | common.logger.debug('Errors: %d / %d' % (self.num_errors, self.max_errors)) 267 | if self.num_errors > self.max_errors: 268 | common.logger.error('Too many download errors, shutting down') 269 | self.stop() 270 | 271 | 272 | def handle_retry(self, url, message, num_retries, redirects): 273 | """Handle retrying a download error 274 | """ 275 | if num_retries < self.settings.num_retries: 276 | # retry the download 277 | common.logger.info('Download retry: %d: %s' % (num_retries, url)) 278 | reactor.callLater(0, self.download_start, url, num_retries+1, redirects) 279 | else: 280 | # out of retries 281 | raise TwistedError('Retry failure: %s' % message) 282 | 283 | 284 | def handle_redirect(self, url, response, num_retries, redirects): 285 | """Handle redirects - the builtin RedirectAgent does not handle relative redirects 286 | """ 287 | if response.code in (301, 302, 303, 307): 288 | # redirect HTTP code 289 | locations = response.headers.getRawHeaders('location', []) 290 | if locations: 291 | # a new redirect url 292 | if len(redirects) < self.settings.num_redirects: 293 | # can still redirect 294 | redirect_url = urlparse.urljoin(url, locations[0]) 295 | if redirect_url != url: 296 | # new redirect URL 297 | redirects.append(url) 298 | reactor.callLater(0, self.download_start, redirect_url, num_retries, redirects) 299 | return True 300 | return False 301 | 302 | 303 | def scrape(self, url, html): 304 | """Pass completed body to callback for scraping 305 | """ 306 | del self.processing[url] 307 | if self.settings.cb and self.running: 308 | try: 309 | # get links crawled from webpage 310 | links = self.settings.cb(self.D, url, html) or [] 311 | except download.StopCrawl: 312 | common.logger.info('Stopping crawl signal') 313 | self.stop() 314 | except Exception as e: 315 | common.logger.exception('\nIn callback for: ' + str(url)) 316 | else: 317 | # add new links to queue 318 | for link in links: 319 | cb_url = urlparse.urljoin(url, link) 320 | if cb_url not in self.found: 321 | self.found[cb_url] = True 322 | self.download_queue.append(cb_url) 323 | 324 | 325 | def build_pool(self): 326 | """Create connection pool 327 | """ 328 | # XXX create limited number of instances 329 | pool = client.HTTPConnectionPool(reactor, persistent=True) 330 | # 1 connection for each proxy or thread 331 | # XXX will this take too much memory? 332 | pool.maxPersistentPerHost = len(self.D.settings.proxies) or self.settings.num_threads 333 | pool.cachedConnectionTimeout = 240 334 | return pool 335 | 336 | 337 | #agents = {} 338 | cookiejars = {} 339 | def build_agent(self, proxy, headers): 340 | """Build an agent for this request 341 | """ 342 | fragments = common.parse_proxy(proxy) 343 | pool = self.build_pool() 344 | if fragments.host: 345 | # add proxy authentication header 346 | auth = base64.b64encode("%s:%s" % (fragments.username, fragments.password)) 347 | headers['Proxy-Authorization'] = ["Basic " + auth.strip()] 348 | # generate the agent 349 | endpoint = endpoints.TCP4ClientEndpoint(reactor, fragments.host, int(fragments.port), timeout=self.settings.timeout) 350 | agent = client.ProxyAgent(endpoint, reactor=reactor, pool=pool) 351 | else: 352 | agent = client.Agent(reactor, connectTimeout=self.settings.timeout, pool=pool) 353 | 354 | agent = client.ContentDecoderAgent(agent, [('gzip', client.GzipDecoder)]) 355 | # XXX if use same cookie for all then works... 
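        # note: a separate CookieJar is kept per proxy below, so cookies collected
        # through one proxy are only reused for later requests through that proxy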
356 | # cookies usually empty 357 | if proxy in self.cookiejars: 358 | cj = self.cookiejars[proxy] 359 | else: 360 | cj = cookielib.CookieJar() 361 | self.cookiejars[proxy] = cj 362 | agent = client.CookieAgent(agent, cj) 363 | return agent 364 | 365 | 366 | def cache_downloads(self): 367 | """Cache the downloaded HTML 368 | """ 369 | if self.cache_queue: 370 | while self.cache_queue: 371 | redirects, html = self.cache_queue.pop() 372 | common.logger.debug('Cached: %d' % len(self.cache_queue)) 373 | url = redirects[0] 374 | self.D[url] = html 375 | final_url = redirects[-1] 376 | if url != final_url: 377 | # store the redirect map 378 | self.D.cache.meta(start_url, dict(url=final_url)) 379 | 380 | 381 | class TwistedError(Exception): 382 | pass 383 | 384 | 385 | class DownloadPrinter(protocol.Protocol): 386 | """Collect together body requests 387 | """ 388 | def __init__(self, finished): 389 | self.finished = finished 390 | self.data = [] 391 | 392 | def dataReceived(self, page): 393 | self.data.append(page) 394 | 395 | def connectionLost(self, reason): 396 | if str(reason.value) not in ('', 'Response body fully received'): 397 | common.logger.info('Download body error: ' + str(reason.value)) 398 | html = ''.join(self.data) 399 | self.finished.callback(html) 400 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __doc__ = 'Common web scraping related functions' 3 | 4 | import os 5 | import re 6 | import sys 7 | import csv 8 | csv.field_size_limit(sys.maxint) 9 | import time 10 | import glob 11 | import json 12 | import string 13 | import urllib 14 | import string 15 | import urllib2 16 | import urlparse 17 | import cookielib 18 | import itertools 19 | import htmlentitydefs 20 | import logging 21 | import logging.handlers 22 | import threading 23 | import collections 24 | from datetime import datetime, timedelta 25 | try: 26 | # should use pysqlite2 to read the cookies.sqlite on Windows 27 | # otherwise will raise the "sqlite3.DatabaseError: file is encrypted or is not a database" exception 28 | from pysqlite2 import dbapi2 as sqlite3 29 | except ImportError: 30 | import sqlite3 31 | import adt 32 | import settings 33 | 34 | try: 35 | import json 36 | except ImportError: 37 | import simplejson as json 38 | 39 | 40 | class WebScrapingError(Exception): 41 | pass 42 | 43 | 44 | # known media file extensions 45 | MEDIA_EXTENSIONS = ['ai', 'aif', 'aifc', 'aiff', 'asc', 'avi', 'bcpio', 'bin', 'c', 'cc', 'ccad', 'cdf', 'class', 'cpio', 'cpt', 'csh', 'css', 'csv', 'dcr', 'dir', 'dms', 'doc', 'drw', 'dvi', 'dwg', 'dxf', 'dxr', 'eps', 'etx', 'exe', 'ez', 'f', 'f90', 'fli', 'flv', 'gif', 'gtar', 'gz', 'h', 'hdf', 'hh', 'hqx', 'ice', 'ico', 'ief', 'iges', 'igs', 'imq', 'ips', 'ipx', 'jpe', 'jpeg', 'jpg', 'js', 'kar', 'latex', 'lha', 'lsp', 'lzh', 'm', 'man', 'me', 'mesh', 'mid', 'midi', 'mif', 'mime', 'mov', 'movie', 'mp2', 'mp3', 'mpe', 'mpeg', 'mpg', 'mpga', 'ms', 'msh', 'nc', 'oda', 'pbm', 'pdb', 'pdf', 'pgm', 'pgn', 'png', 'pnm', 'pot', 'ppm', 'pps', 'ppt', 'ppz', 'pre', 'prt', 'ps', 'qt', 'ra', 'ram', 'ras', 'raw', 'rgb', 'rm', 'roff', 'rpm', 'rtf', 'rtx', 'scm', 'set', 'sgm', 'sgml', 'sh', 'shar', 'silo', 'sit', 'skd', 'skm', 'skp', 'skt', 'smi', 'smil', 'snd', 'sol', 'spl', 'src', 'step', 'stl', 'stp', 'sv4cpio', 'sv4crc', 'swf', 't', 'tar', 'tcl', 'tex', 'texi', 'tif', 'tiff', 'tr', 'tsi', 'tsp', 'tsv', 'unv', 'ustar', 'vcd', 'vda', 'viv', 
'vivo', 'vrml', 'w2p', 'wav', 'wmv', 'wrl', 'xbm', 'xlc', 'xll', 'xlm', 'xls', 'xlw', 'xml', 'xpm', 'xsl', 'xwd', 'xyz', 'zip'] 46 | 47 | # tags that do not contain content 48 | EMPTY_TAGS = 'br', 'hr', 'meta', 'link', 'base', 'img', 'embed', 'param', 'area', 'col', 'input' 49 | 50 | 51 | def to_ascii(html): 52 | """Return ascii part of html 53 | """ 54 | return ''.join(c for c in (html or '') if ord(c) < 128) 55 | 56 | def to_int(s, default=0): 57 | """Return integer from this string 58 | 59 | >>> to_int('90') 60 | 90 61 | >>> to_int('-90.2432') 62 | -90 63 | >>> to_int('a90a') 64 | 90 65 | >>> to_int('a') 66 | 0 67 | >>> to_int('a', 90) 68 | 90 69 | """ 70 | return int(to_float(s, default)) 71 | 72 | def to_float(s, default=0.0): 73 | """Return float from this string 74 | 75 | >>> to_float('90.45') 76 | 90.45 77 | >>> to_float('') 78 | 0.0 79 | >>> to_float('90') 80 | 90.0 81 | >>> to_float('..9') 82 | 0.0 83 | >>> to_float('.9') 84 | 0.9 85 | >>> to_float(None) 86 | 0.0 87 | >>> to_float(1) 88 | 1.0 89 | """ 90 | result = default 91 | if s: 92 | valid = string.digits + '.-' 93 | try: 94 | result = float(''.join(c for c in str(s) if c in valid)) 95 | except ValueError: 96 | pass # input does not contain a number 97 | return result 98 | 99 | 100 | def to_unicode(obj, encoding=settings.default_encoding): 101 | """Convert obj to unicode 102 | """ 103 | if isinstance(obj, basestring): 104 | if not isinstance(obj, unicode): 105 | obj = obj.decode(encoding, 'ignore') 106 | return obj 107 | 108 | 109 | def html_to_unicode(html, charset=settings.default_encoding): 110 | """Convert html to unicode, decoding by specified charset when available 111 | """ 112 | m = re.compile(r']*charset=\s*([a-z\d\-]+)', re.IGNORECASE).search(html) 113 | if m: 114 | charset = m.groups()[0].strip().lower() 115 | 116 | return to_unicode(html, charset) 117 | 118 | 119 | def is_html(html): 120 | """Returns whether content is likely HTML based on search for common tags 121 | """ 122 | try: 123 | result = re.search('html|head|body', html) is not None 124 | except TypeError: 125 | result = False 126 | return result 127 | 128 | 129 | def is_url(text): 130 | """Returns whether passed text is a URL 131 | 132 | >>> is_url('abc') 133 | False 134 | >>> is_url('webscraping.com') 135 | False 136 | >>> is_url('http://webscraping.com/blog') 137 | True 138 | """ 139 | return re.match('https?://', text) is not None 140 | 141 | 142 | def unique(l): 143 | """Remove duplicates from list, while maintaining order 144 | 145 | >>> unique([3,6,4,4,6]) 146 | [3, 6, 4] 147 | >>> unique([]) 148 | [] 149 | >>> unique([3,6,4]) 150 | [3, 6, 4] 151 | """ 152 | checked = [] 153 | for e in l: 154 | if e not in checked: 155 | checked.append(e) 156 | return checked 157 | 158 | 159 | def flatten(l): 160 | """Flatten a list of lists into a single list 161 | 162 | >>> flatten([[1,2,3], [4,5,6]]) 163 | [1, 2, 3, 4, 5, 6] 164 | """ 165 | return [item for sublist in l for item in sublist] 166 | 167 | 168 | def nth(l, i, default=''): 169 | """Return nth item from list or default value if out of range 170 | """ 171 | try: 172 | return l[i] 173 | except IndexError: 174 | return default 175 | 176 | def first(l, default=''): 177 | """Return first element from list or default value if out of range 178 | 179 | >>> first([1,2,3]) 180 | 1 181 | >>> first([], None) 182 | 183 | """ 184 | return nth(l, i=0, default=default) 185 | 186 | def last(l, default=''): 187 | """Return last element from list or default value if out of range 188 | """ 189 | return nth(l, i=-1, 
default=default) 190 | 191 | 192 | def pad(l, size, default=None, end=True): 193 | """Return list of given size 194 | Insert elements of default value if too small 195 | Remove elements if too large 196 | Manipulate end of list if end is True, else start 197 | 198 | >>> pad(range(5), 5) 199 | [0, 1, 2, 3, 4] 200 | >>> pad(range(5), 3) 201 | [0, 1, 2] 202 | >>> pad(range(5), 7, -1) 203 | [0, 1, 2, 3, 4, -1, -1] 204 | >>> pad(range(5), 7, end=False) 205 | [None, None, 0, 1, 2, 3, 4] 206 | """ 207 | while len(l) < size: 208 | if end: 209 | l.append(default) 210 | else: 211 | l.insert(0, default) 212 | while len(l) > size: 213 | if end: 214 | l.pop() 215 | else: 216 | l.pop(0) 217 | return l 218 | 219 | 220 | def remove_tags(html, keep_children=True): 221 | """Remove HTML tags leaving just text 222 | If keep children is True then keep text within child tags 223 | 224 | >>> remove_tags('hello world!') 225 | 'hello world!' 226 | >>> remove_tags('hello world!', False) 227 | 'hello !' 228 | >>> remove_tags('hello
<br>world<br />
!', False) 229 | 'hello world!' 230 | >>> remove_tags('test', False) 231 | 'test' 232 | """ 233 | html = re.sub('<(%s)[^>]*>' % '|'.join(EMPTY_TAGS), '', html) 234 | if not keep_children: 235 | for tag in unique(re.findall('<(\w+?)\W', html)): 236 | if tag not in EMPTY_TAGS: 237 | html = re.compile('<\s*%s.*?>.*?' % (tag, tag), re.DOTALL).sub('', html) 238 | return re.compile('<[^<]*?>').sub('', html) 239 | 240 | 241 | def unescape(text, encoding=settings.default_encoding, keep_unicode=False): 242 | """Interpret escape characters 243 | 244 | >>> unescape('<hello &%20world>') 245 | '' 246 | """ 247 | if not text: 248 | return '' 249 | try: 250 | text = to_unicode(text, encoding) 251 | except UnicodeError: 252 | pass 253 | 254 | def fixup(m): 255 | text = m.group(0) 256 | if text[:2] == '&#': 257 | # character reference 258 | try: 259 | if text[:3] == '&#x': 260 | return unichr(int(text[3:-1], 16)) 261 | else: 262 | return unichr(int(text[2:-1])) 263 | except ValueError: 264 | pass 265 | else: 266 | # named entity 267 | try: 268 | text = unichr(htmlentitydefs.name2codepoint[text[1:-1].lower()]) 269 | except KeyError: 270 | pass 271 | return text # leave as is 272 | text = re.sub('&#?\w+;', fixup, text) 273 | text = urllib.unquote(text) 274 | if keep_unicode: 275 | return text 276 | try: 277 | text = text.encode(encoding, 'ignore') 278 | except UnicodeError: 279 | pass 280 | 281 | if encoding != 'utf-8': 282 | return text 283 | 284 | # remove annoying characters 285 | chars = { 286 | '\xc2\x82' : ',', # High code comma 287 | '\xc2\x84' : ',,', # High code double comma 288 | '\xc2\x85' : '...', # Tripple dot 289 | '\xc2\x88' : '^', # High carat 290 | '\xc2\x91' : '\x27', # Forward single quote 291 | '\xc2\x92' : '\x27', # Reverse single quote 292 | '\xc2\x93' : '\x22', # Forward double quote 293 | '\xc2\x94' : '\x22', # Reverse double quote 294 | '\xc2\x95' : ' ', 295 | '\xc2\x96' : '-', # High hyphen 296 | '\xc2\x97' : '--', # Double hyphen 297 | '\xc2\x99' : ' ', 298 | '\xc2\xa0' : ' ', 299 | '\xc2\xa6' : '|', # Split vertical bar 300 | '\xc2\xab' : '<<', # Double less than 301 | '\xc2\xae' : '®', 302 | '\xc2\xbb' : '>>', # Double greater than 303 | '\xc2\xbc' : '1/4', # one quarter 304 | '\xc2\xbd' : '1/2', # one half 305 | '\xc2\xbe' : '3/4', # three quarters 306 | '\xca\xbf' : '\x27', # c-single quote 307 | '\xcc\xa8' : '', # modifier - under curve 308 | '\xcc\xb1' : '' # modifier - under line 309 | } 310 | def replace_chars(match): 311 | char = match.group(0) 312 | return chars[char] 313 | 314 | return re.sub('(' + '|'.join(chars.keys()) + ')', replace_chars, text) 315 | 316 | 317 | def normalize(s, encoding=settings.default_encoding, newlines=False): 318 | """Normalize the string by removing tags, unescaping, and removing surrounding whitespace 319 | 320 | >>> normalize('Tel.: 029 - 12345678 ') 321 | 'Tel.: 029 - 12345678' 322 | """ 323 | if isinstance(s, basestring): 324 | # remove tags and set encoding 325 | s = unescape(remove_tags(s), encoding=encoding, keep_unicode=isinstance(s, unicode)) 326 | if newlines: 327 | # keep multiple newlines 328 | s = re.sub('[\n\r]+', '\n', s) 329 | s = re.sub('[ \t\f\v]+', ' ', s) 330 | else: 331 | # replace all subsequent whitespace with single space 332 | s = re.sub('[\s]+', ' ', s) 333 | s = re.compile('', re.DOTALL).sub('', s).strip() 334 | return s 335 | 336 | 337 | def regex_get(html, pattern, index=None, normalized=True, flag=re.DOTALL|re.IGNORECASE, default='', one=False): 338 | """Helper method to extract content from regular expression 339 
| 340 | >>> regex_get('<div>Phone: 029 01054609</div>', r'Phone:([^<>]+)') 341 | '029 01054609' 342 | >>> regex_get('<div>Phone: 029 01054609</div>
', r'Phone:\s*(\d+) (\d+)') 343 | ['029', '01054609'] 344 | """ 345 | m = re.compile(pattern, flag).search(html) 346 | if m: 347 | if len(m.groups()) == 1: 348 | return normalize(m.groups()[0]) if normalized else m.groups()[0] 349 | elif index != None: 350 | return normalize(m.groups()[index]) if normalized else m.groups()[index] 351 | else: 352 | return [normalize(item) if normalized else item for item in m.groups()] 353 | return default 354 | 355 | 356 | def parse_jsonp(s): 357 | try: 358 | rindex = s.index('(') 359 | lindex = s.rindex(')') 360 | except IndexError: 361 | pass 362 | else: 363 | return json.loads(s[rindex+1 : lindex]) 364 | 365 | 366 | def safe(s): 367 | """Return characters in string that are safe for URLs 368 | 369 | >>> safe('U@#$_#^&*-2') 370 | 'U_-2' 371 | """ 372 | safe_chars = string.letters + string.digits + '-_ ' 373 | return ''.join(c for c in s if c in safe_chars).replace(' ', '-') 374 | 375 | 376 | def pretty(s): 377 | """Return pretty version of string for display 378 | 379 | >>> pretty('hello_world') 380 | 'Hello World' 381 | """ 382 | return re.sub('[-_]', ' ', s.title()) 383 | 384 | 385 | def pretty_paragraph(s): 386 | """Return pretty version of text in paragraph for display 387 | """ 388 | s = re.sub('<(br|hr|/li)[^>]*>', '\n', s, re.IGNORECASE) 389 | s = unescape(remove_tags(s)) 390 | def fixup(m): 391 | text = m.group(0) 392 | if '\r' in text or '\n' in text: return '\n' 393 | return ' ' 394 | return re.sub('\s+', fixup, s).strip() 395 | 396 | 397 | def get_extension(url): 398 | """Return extension from given URL 399 | 400 | >>> get_extension('hello_world.JPG') 401 | 'jpg' 402 | >>> get_extension('http://www.google-analytics.com/__utm.gif?utmwv=1.3&utmn=420639071') 403 | 'gif' 404 | """ 405 | return os.path.splitext(urlparse.urlsplit(url).path)[-1].lower().replace('.', '') 406 | 407 | 408 | def get_domain(url): 409 | """Extract the domain from the given URL 410 | 411 | >>> get_domain('http://www.google.com.au/tos.html') 412 | 'google.com.au' 413 | >>> get_domain('www.google.com') 414 | 'google.com' 415 | """ 416 | m = re.compile(r"^.*://(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})").search(url) 417 | if m: 418 | # an IP address 419 | return m.groups()[0] 420 | 421 | suffixes = 'ac', 'ad', 'ae', 'aero', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ao', 'aq', 'ar', 'arpa', 'as', 'asia', 'at', 'au', 'aw', 'ax', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bg', 'bh', 'bi', 'biz', 'bj', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bv', 'bw', 'by', 'bz', 'ca', 'cat', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci', 'ck', 'cl', 'cm', 'cn', 'co', 'com', 'coop', 'cr', 'cu', 'cv', 'cx', 'cy', 'cz', 'de', 'dj', 'dk', 'dm', 'do', 'dz', 'ec', 'edu', 'ee', 'eg', 'er', 'es', 'et', 'eu', 'fi', 'fj', 'fk', 'fm', 'fo', 'fr', 'ga', 'gb', 'gd', 'ge', 'gf', 'gg', 'gh', 'gi', 'gl', 'gm', 'gn', 'gov', 'gp', 'gq', 'gr', 'gs', 'gt', 'gu', 'gw', 'gy', 'hk', 'hm', 'hn', 'hr', 'ht', 'hu', 'id', 'ie', 'il', 'im', 'in', 'info', 'int', 'io', 'iq', 'ir', 'is', 'it', 'je', 'jm', 'jo', 'jobs', 'jp', 'ke', 'kg', 'kh', 'ki', 'km', 'kn', 'kp', 'kr', 'kw', 'ky', 'kz', 'la', 'lb', 'lc', 'li', 'lk', 'lr', 'ls', 'lt', 'lu', 'lv', 'ly', 'ma', 'mc', 'md', 'me', 'mg', 'mh', 'mil', 'mk', 'ml', 'mm', 'mn', 'mo', 'mobi', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'mv', 'mw', 'mx', 'my', 'mz', 'na', 'name', 'nc', 'ne', 'net', 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 'nu', 'nz', 'om', 'org', 'pa', 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr', 'pro', 'ps', 'pt', 'pw', 'py', 'qa', 're', 'ro', 'rs', 'ru', 'rw', 'sa', 'sb', 'sc', 'sd', 'se', 'sg', 
'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn', 'so', 'sr', 'st', 'su', 'sv', 'sy', 'sz', 'tc', 'td', 'tel', 'tf', 'tg', 'th', 'tj', 'tk', 'tl', 'tm', 'tn', 'to', 'tp', 'tr', 'tt', 'tv', 'tw', 'tz', 'ua', 'ug', 'uk', 'us', 'uy', 'uz', 'va', 'vc', 've', 'vg', 'vi', 'vn', 'vu', 'wf', 'ws', 'xn', 'ye', 'yt', 'za', 'zm', 'zw' 422 | url = re.sub('^.*://', '', url).partition('/')[0].lower() 423 | domain = [] 424 | for section in url.split('.'): 425 | if section in suffixes: 426 | domain.append(section) 427 | else: 428 | domain = [section] 429 | return '.'.join(domain) 430 | 431 | 432 | def same_domain(url1, url2): 433 | """Return whether URLs belong to same domain 434 | 435 | >>> same_domain('http://www.google.com.au', 'code.google.com') 436 | True 437 | >>> same_domain('http://www.facebook.com', 'http://www.myspace.com') 438 | False 439 | """ 440 | server1 = get_domain(url1) 441 | server2 = get_domain(url2) 442 | return server1 and server2 and (server1 in server2 or server2 in server1) 443 | 444 | 445 | def pretty_duration(dt): 446 | """Return english description of this time difference 447 | 448 | >>> from datetime import timedelta 449 | >>> pretty_duration(timedelta(seconds=1)) 450 | '1 second' 451 | >>> pretty_duration(timedelta(hours=1)) 452 | '1 hour' 453 | >>> pretty_duration(timedelta(days=2)) 454 | '2 days' 455 | """ 456 | if isinstance(dt, datetime): 457 | # convert datetime to timedelta 458 | dt = datetime.now() - dt 459 | if not isinstance(dt, timedelta): 460 | return '' 461 | if dt.days >= 2*365: 462 | return '%d years' % int(dt.days / 365) 463 | elif dt.days >= 365: 464 | return '1 year' 465 | elif dt.days >= 60: 466 | return '%d months' % int(dt.days / 30) 467 | elif dt.days > 21: 468 | return '1 month' 469 | elif dt.days >= 14: 470 | return '%d weeks' % int(dt.days / 7) 471 | elif dt.days >= 7: 472 | return '1 week' 473 | elif dt.days > 1: 474 | return '%d days' % dt.days 475 | elif dt.days == 1: 476 | return '1 day' 477 | elif dt.seconds >= 2*60*60: 478 | return '%d hours' % int(dt.seconds / 3600) 479 | elif dt.seconds >= 60*60: 480 | return '1 hour' 481 | elif dt.seconds >= 2*60: 482 | return '%d minutes' % int(dt.seconds / 60) 483 | elif dt.seconds >= 60: 484 | return '1 minute' 485 | elif dt.seconds > 1: 486 | return '%d seconds' % dt.seconds 487 | elif dt.seconds == 1: 488 | return '1 second' 489 | else: 490 | return '' 491 | 492 | 493 | def parse_proxy(proxy): 494 | """Parse a proxy into its fragments 495 | Returns a dict with username, password, host, and port 496 | 497 | >>> f = parse_proxy('login:pw@66.197.208.200:8080') 498 | >>> f.username 499 | 'login' 500 | >>> f.password 501 | 'pw' 502 | >>> f.host 503 | '66.197.208.200' 504 | >>> f.port 505 | '8080' 506 | >>> f = parse_proxy('66.197.208.200') 507 | >>> f.username == f.password == f.port == '' 508 | True 509 | >>> f.host 510 | '66.197.208.200' 511 | """ 512 | fragments = adt.Bag() 513 | if isinstance(proxy, basestring): 514 | match = re.match('((?P\w+):(?P\w+)@)?(?P\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})(:(?P\d+))?', proxy) 515 | if match: 516 | groups = match.groupdict() 517 | fragments.username = groups.get('username') or '' 518 | fragments.password = groups.get('password') or '' 519 | fragments.host = groups.get('host') 520 | fragments.port = groups.get('port') or '' 521 | return fragments 522 | 523 | 524 | def read_list(file): 525 | """Return file as list if exists 526 | """ 527 | l = [] 528 | if os.path.exists(file): 529 | l.extend(open(file).read().splitlines()) 530 | else: 531 | logger.debug('%s not found' % file) 532 | 
return l 533 | 534 | 535 | class UnicodeWriter: 536 | """A CSV writer that produces Excel-compatible CSV files from unicode data. 537 | 538 | file: 539 | can either be a filename or a file object 540 | encoding: 541 | the encoding to use for output 542 | mode: 543 | the mode for writing to file 544 | unique: 545 | if True then will only write unique rows to output 546 | unique_by: 547 | make the rows unique by these columns(the value is a list of indexs), default by all columns 548 | quoting: 549 | csv module quoting style to use 550 | utf8_bom: 551 | whether need to add the BOM 552 | auto_repair: 553 | whether need to remove the invalid rows automatically 554 | 555 | >>> from StringIO import StringIO 556 | >>> fp = StringIO() 557 | >>> writer = UnicodeWriter(fp, quoting=csv.QUOTE_MINIMAL) 558 | >>> writer.writerow(['a', '1']) 559 | >>> writer.flush() 560 | >>> fp.seek(0) 561 | >>> fp.read().strip() 562 | 'a,1' 563 | """ 564 | def __init__(self, file, encoding=settings.default_encoding, mode='wb', unique=False, unique_by=None, quoting=csv.QUOTE_ALL, utf8_bom=False, auto_repair=False, **argv): 565 | self.encoding = encoding 566 | self.unique = unique 567 | self.unique_by = unique_by 568 | if hasattr(file, 'write'): 569 | self.fp = file 570 | else: 571 | if auto_repair: 572 | self._remove_invalid_rows(file=file, quoting=quoting, **argv) 573 | if utf8_bom: 574 | self.fp = open(file, 'wb') 575 | self.fp.write('\xef\xbb\xbf') 576 | self.fp.close() 577 | self.fp = open(file, mode=mode.replace('w', 'a')) 578 | else: 579 | self.fp = open(file, mode) 580 | if self.unique: 581 | self.rows = adt.HashDict() # cache the rows that have already been written 582 | for row in csv.reader(open(self.fp.name)): 583 | self.rows[self._unique_key(row)] = True 584 | self.writer = csv.writer(self.fp, quoting=quoting, **argv) 585 | 586 | def _unique_key(self, row): 587 | """Generate the unique key 588 | """ 589 | return '_'.join([str(row[i]) for i in self.unique_by]) if self.unique_by else str(row) 590 | 591 | def _remove_invalid_rows(self, file, **argv): 592 | """Remove invalid csv rows e.g. 
newline inside string 593 | """ 594 | if os.path.exists(file): 595 | file_obj = open(file) 596 | tmp_file = file + '.tmp' 597 | tmp_file_obj = open(tmp_file, 'wb') 598 | writer = csv.writer(tmp_file_obj, **argv) 599 | try: 600 | for row in csv.reader(file_obj): 601 | writer.writerow(row) 602 | except Exception, e: 603 | pass 604 | file_obj.close() 605 | tmp_file_obj.close() 606 | os.remove(file) 607 | os.rename(tmp_file, file) 608 | 609 | def _cell(self, s): 610 | """Normalize the content for this cell 611 | """ 612 | if isinstance(s, basestring): 613 | if isinstance(s, unicode): 614 | s = s.encode(self.encoding, 'ignore') 615 | elif s is None: 616 | s = '' 617 | else: 618 | s = str(s) 619 | return s 620 | 621 | def writerow(self, row): 622 | """Write row to output 623 | """ 624 | row = [self._cell(col) for col in row] 625 | if self.unique: 626 | if self._unique_key(row) not in self.rows: 627 | self.writer.writerow(row) 628 | self.rows[self._unique_key(row)] = True 629 | else: 630 | self.writer.writerow(row) 631 | 632 | def writerows(self, rows): 633 | """Write multiple rows to output 634 | """ 635 | for row in rows: 636 | self.writerow(row) 637 | 638 | def flush(self): 639 | """Flush output to disk 640 | """ 641 | self.fp.flush() 642 | if hasattr(self.fp, 'fileno'): 643 | # this is a real file 644 | os.fsync(self.fp.fileno()) 645 | 646 | def close(self): 647 | """Close the output file pointer 648 | """ 649 | self.fp.close() 650 | 651 | 652 | def csv_to_xls(filename): 653 | from xlsxwriter.workbook import Workbook 654 | workbook = Workbook(filename[:-4] + '.xlsx') 655 | worksheet = workbook.add_worksheet() 656 | with open(filename, 'rt') as f: 657 | reader = csv.reader(f) 658 | for r, row in enumerate(reader): 659 | for c, col in enumerate(row): 660 | worksheet.write(r, c, col.decode('utf-8')) 661 | workbook.close() 662 | 663 | 664 | 665 | # decrypt chrome cookies 666 | class Chrome: 667 | def __init__(self): 668 | import keyring 669 | from Crypto.Protocol.KDF import PBKDF2 670 | salt = b'saltysalt' 671 | length = 16 672 | # If running Chrome on OSX 673 | if sys.platform == 'darwin': 674 | my_pass = keyring.get_password('Chrome Safe Storage', 'Chrome') 675 | my_pass = my_pass.encode('utf8') 676 | iterations = 1003 677 | self.cookie_file = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default/Cookies') 678 | 679 | # If running Chromium on Linux 680 | elif 'linux' in sys.platform: 681 | my_pass = 'peanuts'.encode('utf8') 682 | iterations = 1 683 | self.cookie_file = os.path.expanduser('~/.config/chromium/Default/Cookies') 684 | else: 685 | raise Exception("This script only works on OSX or Linux.") 686 | self.key = PBKDF2(my_pass, salt, length, iterations) 687 | 688 | def decrypt(self, value, encrypted_value): 689 | if value or (encrypted_value[:3] != b'v10'): 690 | return value 691 | 692 | from Crypto.Cipher import AES 693 | 694 | # Encrypted cookies should be prefixed with 'v10' according to the 695 | # Chromium code. Strip it off. 696 | encrypted_value = encrypted_value[3:] 697 | 698 | # Strip padding by taking off number indicated by padding 699 | # eg if last is '\x0e' then ord('\x0e') == 14, so take off 14. 700 | # You'll need to change this function to use ord() for python2. 
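        # worked example (hypothetical plaintext): AES-CBC decryption of a cookie
        # might yield b'chocolate\x07\x07\x07\x07\x07\x07\x07'; the last byte is 7,
        # so clean() drops the final 7 padding bytes and returns 'chocolate'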
701 | def clean(x): 702 | return x[:-ord(x[-1])].decode('utf8') 703 | 704 | iv = b' ' * 16 705 | cipher = AES.new(self.key, AES.MODE_CBC, IV=iv) 706 | decrypted = cipher.decrypt(encrypted_value) 707 | return clean(decrypted) 708 | 709 | 710 | # XXX merge common parts with firefox 711 | def chrome_cookie(filename=None, tmp_sqlite_file='cookies.sqlite', tmp_cookie_file='cookies.txt'): 712 | if filename is None: 713 | filename = os.path.expanduser("~/.config/google-chrome/Default/Cookies") 714 | if not os.path.exists(filename): 715 | raise WebScrapingError('Can not find chrome cookie file') 716 | 717 | open(tmp_sqlite_file, 'wb').write(open(filename, 'rb').read()) 718 | con = sqlite3.connect(tmp_sqlite_file) 719 | cur = con.cursor() 720 | cur.execute('SELECT host_key, path, secure, expires_utc, name, value, encrypted_value FROM cookies;') 721 | # create standard cookies file that can be interpreted by cookie jar 722 | # XXX change to create directly without temp file 723 | fp = open(tmp_cookie_file, 'w') 724 | fp.write('# Netscape HTTP Cookie File\n') 725 | fp.write('# http://www.netscape.com/newsref/std/cookie_spec.html\n') 726 | fp.write('# This is a generated file! Do not edit.\n') 727 | ftstr = ['FALSE', 'TRUE'] 728 | chrome = Chrome() 729 | for item in cur.fetchall(): 730 | value = chrome.decrypt(item[5], item[6]) 731 | row = u'%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (item[0], ftstr[item[0].startswith('.')], item[1], ftstr[item[2]], item[3], item[4], value) 732 | fp.write(row) 733 | 734 | fp.close() 735 | # close the connection before delete the sqlite file 736 | con.close() 737 | os.remove(tmp_sqlite_file) 738 | 739 | cookie_jar = cookielib.MozillaCookieJar() 740 | cookie_jar.load(tmp_cookie_file) 741 | os.remove(tmp_cookie_file) 742 | 743 | return cookie_jar 744 | 745 | 746 | 747 | def firefox_cookie(file=None, tmp_sqlite_file='cookies.sqlite', tmp_cookie_file='cookies.txt'): 748 | """Create a cookie jar from this FireFox 3 sqlite cookie database 749 | 750 | >>> cj = firefox_cookie() 751 | >>> opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) 752 | >>> url = 'http://code.google.com/p/webscraping' 753 | >>> html = opener.open(url).read() 754 | """ 755 | if file is None: 756 | try: 757 | # add Windows version support 758 | file = (glob.glob(os.path.join(os.environ.get('PROGRAMFILES', ''), 'Mozilla Firefox/profile/cookies.sqlite')) or \ 759 | glob.glob(os.path.join(os.environ.get('PROGRAMFILES(X86)', ''), 'Mozilla Firefox/profile/cookies.sqlite')) or \ 760 | glob.glob(os.path.expanduser('~/.mozilla/firefox/*.default/cookies.sqlite')) or \ 761 | glob.glob(os.path.expanduser(r'~\AppData\Roaming\Mozilla\Firefox\Profiles\*.default\cookies.sqlite')))[0] 762 | except IndexError: 763 | raise WebScrapingError('Can not find filefox cookie file') 764 | 765 | # copy firefox cookie file locally to avoid locking problems 766 | open(tmp_sqlite_file, 'wb').write(open(file, 'rb').read()) 767 | con = sqlite3.connect(tmp_sqlite_file) 768 | cur = con.cursor() 769 | cur.execute('select host, path, isSecure, expiry, name, value from moz_cookies') 770 | 771 | # create standard cookies file that can be interpreted by cookie jar 772 | fp = open(tmp_cookie_file, 'w') 773 | fp.write('# Netscape HTTP Cookie File\n') 774 | fp.write('# http://www.netscape.com/newsref/std/cookie_spec.html\n') 775 | fp.write('# This is a generated file! 
Do not edit.\n') 776 | ftstr = ['FALSE', 'TRUE'] 777 | for item in cur.fetchall(): 778 | row = '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (item[0], ftstr[item[0].startswith('.')], item[1], ftstr[item[2]], item[3], item[4], item[5]) 779 | fp.write(row) 780 | 781 | # session cookies are saved into sessionstore.js 782 | session_cookie_path = os.path.join(os.path.dirname(file), 'sessionstore.js') 783 | if os.path.exists(session_cookie_path): 784 | try: 785 | json_data = json.loads(open(session_cookie_path, 'rb').read().strip('()')) 786 | except Exception, e: 787 | print str(e) 788 | else: 789 | ftstr = ['FALSE', 'TRUE'] 790 | if 'windows' in json_data: 791 | for window in json_data['windows']: 792 | if 'cookies' in window: 793 | for cookie in window['cookies']: 794 | row = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (cookie.get('host', ''), ftstr[cookie.get('host', '').startswith('.')], \ 795 | cookie.get('path', ''), False, str(int(time.time()) + 3600 * 24 * 7), \ 796 | cookie.get('name', ''), cookie.get('value', '')) 797 | fp.write(row) 798 | 799 | fp.close() 800 | # close the connection before delete the sqlite file 801 | con.close() 802 | 803 | cookie_jar = cookielib.MozillaCookieJar() 804 | cookie_jar.load(tmp_cookie_file) 805 | 806 | # remove temporary files 807 | os.remove(tmp_sqlite_file) 808 | os.remove(tmp_cookie_file) 809 | return cookie_jar 810 | 811 | 812 | def build_opener(cj=None): 813 | if cj is None: 814 | cj = cookielib.CookieJar() 815 | return urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) 816 | 817 | 818 | def start_threads(fn, num_threads=20, args=(), wait=True): 819 | """Shortcut to start these threads with given args and wait for all to finish 820 | """ 821 | threads = [threading.Thread(target=fn, args=args) for i in range(num_threads)] 822 | # Start threads one by one 823 | for thread in threads: 824 | thread.start() 825 | # Wait for all threads to finish 826 | if wait: 827 | for thread in threads: 828 | thread.join() 829 | 830 | 831 | class ConsoleHandler(logging.StreamHandler): 832 | """Log to stderr for errors else stdout 833 | """ 834 | def __init__(self): 835 | logging.StreamHandler.__init__(self) 836 | self.stream = None 837 | 838 | def emit(self, record): 839 | if record.levelno >= logging.ERROR: 840 | self.stream = sys.stderr 841 | else: 842 | self.stream = sys.stdout 843 | logging.StreamHandler.emit(self, record) 844 | 845 | 846 | def get_logger(output_file, level=settings.log_level, maxbytes=0): 847 | """Create a logger instance 848 | 849 | output_file: 850 | file where to save the log 851 | level: 852 | the minimum logging level to save 853 | maxbytes: 854 | the maxbytes allowed for the log file size. 0 means no limit. 
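    Illustrative usage (the log file name is just an example):

        logger = get_logger('crawl.log', maxbytes=2*1024*1024)
        logger.info('saved to crawl.log')
        logger.error('errors also go to stderr via the console handler')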
855 | """ 856 | logger = logging.getLogger(output_file) 857 | # avoid duplicate handlers 858 | if not logger.handlers: 859 | logger.setLevel(logging.DEBUG) 860 | try: 861 | if not maxbytes: 862 | file_handler = logging.FileHandler(output_file) 863 | else: 864 | file_handler = logging.handlers.RotatingFileHandler(output_file, maxBytes=maxbytes) 865 | except IOError: 866 | pass # can not write file 867 | else: 868 | file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) 869 | logger.addHandler(file_handler) 870 | 871 | console_handler = ConsoleHandler() 872 | console_handler.setLevel(level) 873 | logger.addHandler(console_handler) 874 | return logger 875 | logger = get_logger(settings.log_file, maxbytes=2*1024*1024*1024) 876 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 
63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Webscraping.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Webscraping.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Webscraping" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Webscraping" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Webscraping documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Dec 28 09:34:47 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.viewcode'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'webscraping' 44 | copyright = u'2012, Richard Penman' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 
87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'webscrapingdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 
180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 'Webscraping.tex', u'Webscraping Documentation', 187 | u'Richard Penman', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 215 | man_pages = [ 216 | ('index', 'webscraping', u'Webscraping Documentation', 217 | [u'Richard Penman'], 1) 218 | ] 219 | 220 | # If true, show URL addresses after external links. 221 | #man_show_urls = False 222 | 223 | 224 | # -- Options for Texinfo output ------------------------------------------------ 225 | 226 | # Grouping the document tree into Texinfo files. List of tuples 227 | # (source start file, target name, title, author, 228 | # dir menu entry, description, category) 229 | texinfo_documents = [ 230 | ('index', 'Webscraping', u'Webscraping Documentation', 231 | u'Richard Penman', 'Webscraping', 'One line description of project.', 232 | 'Miscellaneous'), 233 | ] 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #texinfo_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #texinfo_domain_indices = True 240 | 241 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 242 | #texinfo_show_urls = 'footnote' 243 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | =========== 3 | 4 | 5 | Simple extraction 6 | ----------------- 7 | 8 | Except project title from the Google Code page: 9 | 10 | .. code-block:: python 11 | 12 | from webscraping import download, xpath 13 | D = download.Download() 14 | # download and cache the Google Code webpage 15 | html = D.get('http://code.google.com/p/webscraping') 16 | # use xpath to extract the project title 17 | project_title = xpath.get(html, '//div[@id="pname"]/a/span') 18 | 19 | 20 | Blog scraper 21 | ------------ 22 | 23 | Scrape all articles from a blog 24 | 25 | .. code-block:: python 26 | 27 | import itertools 28 | import urlparse 29 | from webscraping import common, download, xpath 30 | 31 | DOMAIN = ... 
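    # NOTE: DOMAIN is deliberately elided in this example - set it to the blog's base URL before running.
    # The loop below pages through each category with itertools.count() and stops a category
    # once a page yields no article links that have not already been seen.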
32 | writer = common.UnicodeWriter('articles.csv') 33 | writer.writerow(['Title', 'Num reads', 'URL']) 34 | seen_urls = set() # track which articles URL's already seen, to prevent duplicates 35 | D = download.Download() 36 | 37 | # iterate each of the categories 38 | for category_link in ('/developer/knowledge-base?page=%d', '/developer/articles?page=%d'): 39 | # iterate the pages of a category 40 | for page in itertools.count(): 41 | category_html = D.get(urlparse.urljoin(DOMAIN, category_link % page)) 42 | article_links = xpath.search(category_html, '//div[@class="morelink"]/a/@href') 43 | num_new_articles = 0 44 | for article_link in article_links: 45 | # scrape each article 46 | url = urlparse.urljoin(DOMAIN, article_link) 47 | if url not in seen_urls: 48 | num_new_articles += 1 49 | seen_urls.add(url) 50 | html = D.get(url) 51 | title = xpath.get(html, '//div[@class="feed-header-wrap"]/h2') 52 | num_reads = xpath.get(html, '//li[@class="statistics_counter last"]/span').replace(' reads', '') 53 | row = title, num_reads, url 54 | writer.writerow(row) 55 | if num_new_articles == 0: 56 | break # have found all articles for this category 57 | 58 | 59 | Business directory threaded scraper 60 | ----------------------------------- 61 | 62 | Scrape all businesses from this popular directory 63 | 64 | .. code-block:: python 65 | 66 | import csv 67 | import re 68 | import string 69 | from webscraping import common, download, xpath 70 | 71 | DOMAIN = ... 72 | 73 | class BusinessDirectory: 74 | def __init__(self, output_file='businesses.csv'): 75 | self.writer = common.UnicodeWriter(output_file) 76 | self.writer.writerow(['Name', 'Address']) 77 | 78 | def __call__(self, D, url, html): 79 | urls = [] 80 | if url == DOMAIN: 81 | # crawl the index pages 82 | urls = [DOMAIN + '/atoz/%s.html' % letter for letter in string.uppercase + '#'] 83 | elif re.search('/atoz/\w\.html', url): 84 | # crawl the categories 85 | urls = [DOMAIN + link for link in xpath.search(html, '//div[@id="partitionContainer"]//a/@href')] 86 | elif re.search('/atoz/\w/\d+\.html', url): 87 | # crawl the businesses 88 | urls = [DOMAIN + link for link in xpath.search(html, '//div[@id="listingsContainer"]//a/@href')] 89 | else: 90 | # scrape business details 91 | name = xpath.get(html, '//h1[@class="listingName"]') 92 | address = xpath.get(html, '//span[@class="listingAddressText"]') 93 | row = name, address 94 | self.writer.writerow(row) 95 | return urls 96 | 97 | download.threaded_get(url=DOMAIN, proxies=proxies, cb=BusinessDirectory()) 98 | 99 | 100 | Daily deal threaded scraper 101 | --------------------------- 102 | 103 | Scrape all deals from a popular daily deal website: 104 | 105 | .. code-block:: python 106 | 107 | import re 108 | import csv 109 | import urlparse 110 | from webscraping import common, download, xpath 111 | 112 | 113 | DOMAIN = ... 
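    # NOTE: DOMAIN is deliberately elided in this example - set it to the deal site's home page before running.
    # threaded_get() calls the daily_deal() callback below after every download: the home page yields
    # the city deal pages, each city page yields deal links, and each deal page is written to the CSV.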
114 | writer = csv.writer(open('daily_deals.csv', 'w')) 115 | writer.writerow(['Company', 'Address', 'Website', 'Email']) 116 | 117 | def daily_deal(D, url, html): 118 | """This callback is called after each download 119 | """ 120 | if url == DOMAIN: 121 | # first download - get all the city deal pages 122 | links = [link.replace('/deals/', '/all-deals/') for link in xpath.search(html, '//a[@class="jCityLink"]/@href')] 123 | elif '/all-deals/' in url: 124 | # city page downloaded - get all the deals 125 | links = re.findall('"dealPermaLink":"(.*?)"', html) 126 | else: 127 | # deal page downloaded - extract the details 128 | company = xpath.get(html, '//div[@class="merchantContact"]/h2') 129 | website = xpath.get(html, '//div[@class="merchantContact"]/a/@href') 130 | address = common.unescape(xpath.get(html, '//div[@class="merchantContact"]/text()')).replace('Returns:', '').strip() 131 | if website: 132 | # crawl website for contact email 133 | email = '\n'.join(D.get_emails(website)) 134 | else: 135 | email = None 136 | row = company, address, website, email 137 | # write deal details to CSV 138 | writer.writerow(row) 139 | links = [] 140 | 141 | return [urlparse.urljoin(DOMAIN, link) for link in links] 142 | 143 | # start the crawler 144 | download.threaded_get(url=DOMAIN, proxy_file='proxies.txt', cb=daily_deal, num_retries=1) 145 | 146 | 147 | Navigate a website 148 | ------------------ 149 | 150 | Use webkit to navigate and interact with a website: 151 | 152 | .. code-block:: python 153 | 154 | from webscraping import webkit 155 | w = webkit.WebkitBrowser(gui=True) 156 | # load webpage 157 | w.get('http://duckduckgo.com') 158 | # fill search textbox 159 | w.fill('input[id=search_form_input_homepage]', 'webscraping') 160 | # take screenshot of browser 161 | w.screenshot('duckduckgo_search.jpg') 162 | # click search button 163 | w.click('input[id=search_button_homepage]') 164 | # wait on results page 165 | w.wait(10) 166 | # take another screenshot 167 | w.screenshot('duckduckgo_results.jpg') 168 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Webscraping documentation master file, created by 2 | sphinx-quickstart on Fri Dec 28 09:34:47 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | webscraping documentation 7 | ======================================= 8 | 9 | .. toctree:: 10 | 11 | introduction 12 | examples 13 | reference 14 | 15 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | Background 5 | ---------- 6 | 7 | For the last few years I have been specializing at web scraping and collected what I found useful into this library. 8 | 9 | All code is pure Python and has been run across multiple Linux servers, Windows machines, as well as `Google App Engine `_. 10 | 11 | 12 | Install 13 | ------- 14 | 15 | Some options to install the webscraping package: 16 | 17 | #. Checkout the repository: *hg clone https://code.google.com/p/webscraping/* 18 | #. Download the zip: https://pypi.python.org/pypi/webscraping/ 19 | #. Install with pypi: *pip install webscraping* 20 | 21 | The only dependency is python 2.5 or higher. 
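A quick check that the install worked (illustrative; constructing a Download object will also create the default cache file in the working directory):

.. code-block:: python

    from webscraping import download, xpath
    D = download.Download()
    print D.settings.delay  # default delay between requests to the same domain, in seconds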
22 | 23 | 24 | License 25 | ------- 26 | 27 | This code is licensed under the `LGPL license `_. 28 | 29 | 30 | Contact 31 | ------- 32 | 33 | richard@webscraping.com 34 | -------------------------------------------------------------------------------- /docs/reference.rst: -------------------------------------------------------------------------------- 1 | .. Webscraping documentation master file, created by 2 | sphinx-quickstart on Fri Dec 28 09:34:47 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Reference 7 | ======================================= 8 | 9 | 10 | :mod:`adt` Module 11 | ----------------- 12 | .. automodule:: webscraping.adt 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | 18 | :mod:`alg` Module 19 | ----------------- 20 | .. automodule:: webscraping.alg 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | :mod:`common` Module 26 | -------------------- 27 | .. automodule:: webscraping.common 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | :mod:`download` Module 33 | ---------------------- 34 | .. automodule:: webscraping.download 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | :mod:`pdict` Module 40 | ------------------- 41 | .. automodule:: webscraping.pdict 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | :mod:`webkit` Module 47 | -------------------- 48 | .. automodule:: webscraping.webkit 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | :mod:`xpath` Module 54 | ------------------- 55 | .. automodule:: webscraping.xpath 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'Helper methods to download and crawl web content using threads' 2 | 3 | import os 4 | import re 5 | import sys 6 | import copy 7 | import collections 8 | import random 9 | import urllib 10 | import urllib2 11 | import urlparse 12 | import StringIO 13 | import time 14 | import datetime 15 | import subprocess 16 | import socket 17 | import gzip 18 | import zlib 19 | import thread 20 | import threading 21 | import contextlib 22 | import tempfile 23 | try: 24 | import hashlib 25 | except ImportError: 26 | import md5 as hashlib 27 | try: 28 | import cPickle as pickle 29 | except: 30 | import pickle 31 | try: 32 | import json 33 | except ImportError: 34 | import simplejson as json 35 | 36 | import adt 37 | import alg 38 | import common 39 | import settings 40 | try: 41 | import pdict 42 | except ImportError: 43 | # sqlite not installed 44 | pdict = None 45 | 46 | SLEEP_TIME = 0.1 # how long to sleep when waiting for network activity 47 | DEFAULT_PRIORITY = 1 # default queue priority 48 | 49 | 50 | 51 | class ProxyPerformance: 52 | """Track performance of proxies 53 | If 10 errors in a row that other proxies could handle then need to remove 54 | """ 55 | def __init__(self): 56 | self.proxy_errors = collections.defaultdict(int) 57 | 58 | def success(self, proxy): 59 | """Successful download - so clear error count 60 | """ 61 | self.proxy_errors[proxy] = 0 62 | 63 | def error(self, proxy): 64 | """Add to error count and returns number of consecutive errors for this proxy 65 | """ 66 | if proxy: 67 | self.proxy_errors[proxy] += 1 68 | return self.proxy_errors[proxy] 69 | 70 | 71 | 72 | class Download: 73 | 
""" 74 | cache: 75 | a pdict object to use for the cache 76 | cache_file: 77 | filename to store cached data 78 | read_cache: 79 | whether to read from the cache 80 | write_cache: 81 | whether to write to the cache 82 | use_network: 83 | whether to download content not in the cache 84 | user_agent 85 | the User Agent to download content with 86 | timeout: 87 | the maximum amount of time to wait for http response 88 | delay: 89 | the minimum amount of time (in seconds) to wait after downloading content from a domain per proxy 90 | proxy_file: 91 | a filename to read proxies from 92 | max_proxy_errors: 93 | the maximum number of consecutive errors allowed per proxy before discarding 94 | an error is only counted if another proxy is able to successfully download the URL 95 | set to None to disable 96 | proxies: 97 | a list of proxies to cycle through when downloading content 98 | opener: 99 | an optional opener to use instead of using urllib2 directly 100 | headers: 101 | the headers to include in the request 102 | data: 103 | what to post at the URL 104 | if None (default) then a GET request will be made 105 | num_retries: 106 | how many times to try downloading a URL when get an error 107 | num_redirects: 108 | how many times the URL is allowed to be redirected, to avoid infinite loop 109 | force_html: 110 | whether to download non-text data 111 | force_ascii: 112 | whether to only return ascii characters 113 | max_size: 114 | maximum number of bytes that will be downloaded, or None to disable 115 | default: 116 | what to return when no content can be downloaded 117 | pattern: 118 | a regular expression that the downloaded HTML has to match to be considered a valid download 119 | acceptable_errors: 120 | a list contains all acceptable HTTP codes, don't try downloading for them e.g. no need to retry for 404 error 121 | throttle_additional_key: 122 | Sometimes the website limits the request only by session(rather than IP), we can use this parameter to keep each thread delaying independently 123 | keep_ip_ua: 124 | If it's True, one proxy IP will keep using the same User-agent, otherwise will use a random User-agent for each request. 
125 | ssl_context: 126 | provide ssl context argument to urlopen 127 | """ 128 | 129 | def __init__(self, cache=None, cache_file=None, read_cache=True, write_cache=True, cache_expires=None, use_network=True, 130 | user_agent=None, timeout=30, delay=5, proxies=None, proxy_file=None, max_proxy_errors=5, 131 | opener=None, headers=None, data=None, num_retries=0, num_redirects=0, 132 | force_html=False, force_ascii=False, max_size=None, default='', pattern=None, acceptable_errors=None, 133 | throttle_additional_key=None, keep_ip_ua=True, ssl_context=None, **kwargs): 134 | socket.setdefaulttimeout(timeout) 135 | need_cache = read_cache or write_cache 136 | if pdict and need_cache: 137 | cache_file = cache_file or settings.cache_file 138 | self.cache = cache or pdict.PersistentDict(cache_file, expires=cache_expires) 139 | else: 140 | self.cache = None 141 | if need_cache: 142 | common.logger.warning('Cache disabled because could not import pdict') 143 | 144 | self.settings = adt.Bag( 145 | read_cache = read_cache, 146 | write_cache = write_cache, 147 | use_network = use_network, 148 | delay = delay, 149 | proxies = (common.read_list(proxy_file) if proxy_file else []) or proxies or [], 150 | proxy_file = proxy_file, 151 | max_proxy_errors = max_proxy_errors, 152 | user_agent = user_agent, 153 | opener = opener, 154 | headers = headers, 155 | data = data, 156 | num_retries = num_retries, 157 | num_redirects = num_redirects, 158 | force_html = force_html, 159 | force_ascii = force_ascii, 160 | max_size = max_size, 161 | default = default, 162 | pattern = pattern, 163 | keep_ip_ua = keep_ip_ua, 164 | acceptable_errors = acceptable_errors, 165 | ssl_context = ssl_context 166 | ) 167 | self.last_load_time = self.last_mtime = time.time() 168 | self.num_downloads = self.num_errors = 0 169 | self.throttle_additional_key = throttle_additional_key 170 | 171 | 172 | proxy_performance = ProxyPerformance() 173 | def get(self, url, **kwargs): 174 | """Download this URL and return the HTML. 175 | By default HTML is cached so only have to download once. 
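For example (illustrative; the URL is a placeholder):

    D = Download(delay=3, num_retries=2)
    html = D.get('http://example.com')  # downloaded and written to the cache
    html = D.get('http://example.com')  # second call is served from the cache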
176 | 177 | url: 178 | what to download 179 | kwargs: 180 | override any of the arguments passed to constructor 181 | """ 182 | self.reload_proxies() 183 | self.proxy = None # the current proxy 184 | self.final_url = None # for tracking redirects 185 | self.response_code = '' # keep response code 186 | self.response_headers = {} # keep response headers 187 | self.downloading_error = None # keep downloading error 188 | self.num_downloads = self.num_errors = 0 # track the number of downloads made 189 | 190 | # update settings with any local overrides 191 | settings = adt.Bag(self.settings) 192 | settings.update(kwargs) 193 | # check cache for whether this content is already downloaded 194 | key = self.get_key(url, settings.data) 195 | if self.cache and settings.read_cache: 196 | try: 197 | html = self.cache[key] 198 | if self.invalid_response(html, settings.pattern): 199 | # invalid result from download 200 | html = None 201 | except KeyError: 202 | pass # have not downloaded yet 203 | else: 204 | if not html and settings.num_retries > 0: 205 | try: 206 | meta = self.cache.meta(key) 207 | except KeyError: 208 | pass 209 | else: 210 | if meta.get('status', '').startswith('404'): 211 | # don't retry 4XX errors 212 | common.logger.debug('Ignoring URL with previous status {}'.format(meta['status'])) 213 | return settings.default 214 | # try downloading again 215 | common.logger.debug('Redownloading') 216 | settings.num_retries -= 1 217 | else: 218 | # return previously downloaded content 219 | return html or settings.default 220 | if not settings.use_network: 221 | # only want previously cached content 222 | return settings.default 223 | 224 | html = None 225 | failed_proxies = set() # record which proxies failed to download for this URL 226 | # attempt downloading content at URL 227 | while settings.num_retries >= 0 and html is None: 228 | settings.num_retries -= 1 229 | if settings.proxy: 230 | self.proxy = settings.proxy 231 | else: 232 | self.proxy = self.get_proxy(settings.proxies) 233 | # crawl slowly for each domain to reduce risk of being blocked 234 | self.throttle(url, delay=settings.delay, proxy=self.proxy) 235 | html = self.fetch(url, headers=settings.headers, data=settings.data, proxy=self.proxy, user_agent=settings.user_agent, opener=settings.opener, pattern=settings.pattern, max_size=settings.max_size, ssl_context=settings.ssl_context) 236 | 237 | if html: 238 | # successfully downloaded 239 | self.num_downloads += 1 240 | if settings.max_proxy_errors is not None: 241 | Download.proxy_performance.success(self.proxy) 242 | # record which proxies failed for this download 243 | for proxy in failed_proxies: 244 | if Download.proxy_performance.error(self.proxy) > settings.max_proxy_errors: 245 | # this proxy has had too many errors so remove 246 | common.logger.warning('Removing unstable proxy from list after %d consecutive errors: %s' % (settings.max_proxy_errors, self.proxy)) 247 | settings.proxies.remove(self.proxy) 248 | else: 249 | # download failed - try again 250 | self.num_errors += 1 251 | failed_proxies.add(self.proxy) 252 | 253 | 254 | if html: 255 | if settings.num_redirects > 0: 256 | # allowed to redirect 257 | redirect_url = get_redirect(url=url, html=html) 258 | if redirect_url: 259 | # found a redirection 260 | common.logger.debug('%s redirecting to %s' % (url, redirect_url)) 261 | settings.num_redirects -= 1 262 | html = self.get(redirect_url, **settings) or '' 263 | # make relative links absolute so will still work after redirect 264 | relative_re = 
re.compile('(<\s*a[^>]+href\s*=\s*["\']?)(?!http)([^"\'>]+)', re.IGNORECASE) 265 | try: 266 | html = relative_re.sub(lambda m: m.group(1) + urlparse.urljoin(url, m.group(2)), html) 267 | except UnicodeDecodeError: 268 | pass 269 | html = self._clean_content(html=html, max_size=settings.max_size, force_html=settings.force_html, force_ascii=settings.force_ascii) 270 | 271 | if self.cache and settings.write_cache: 272 | # cache results 273 | self.cache[key] = html 274 | meta = {} 275 | if self.final_url and url != self.final_url: 276 | # cache what URL was redirected to 277 | meta['url'] = self.final_url 278 | if self.response_code and self.response_code != '200': 279 | meta['status'] = self.response_code 280 | if meta: 281 | self.cache.meta(key, meta) 282 | 283 | # return default if no content 284 | return html or settings.default 285 | 286 | 287 | def exists(self, url): 288 | """Do a HEAD request to check whether webpage exists 289 | """ 290 | success = False 291 | key = self.get_key(url, 'head') 292 | try: 293 | if self.cache and self.settings.read_cache: 294 | success = self.cache[key] 295 | else: 296 | raise KeyError('No cache') 297 | except KeyError: 298 | # have not downloaded yet 299 | request = urllib2.Request(url) 300 | request.get_method = lambda : 'HEAD' 301 | try: 302 | response = urllib2.urlopen(request, context=self.settings.ssl_context) 303 | except Exception, e: 304 | common.logger.warning('HEAD check miss: %s %s' % (url, e)) 305 | else: 306 | success = True 307 | common.logger.info('HEAD check hit: %s' % url) 308 | if self.cache: 309 | self.cache[key] = success 310 | return success 311 | 312 | 313 | def get_key(self, url, data=None): 314 | """Create key for caching this request 315 | """ 316 | key = url 317 | if data: 318 | key += ' ' + str(data) 319 | return key 320 | 321 | 322 | def _clean_content(self, html, max_size, force_html, force_ascii): 323 | """Clean up downloaded content 324 | 325 | html: 326 | the input to clean 327 | max_size: 328 | the maximum size of data allowed 329 | force_html: 330 | content must be HTML 331 | force_ascii: 332 | content must be ASCII 333 | """ 334 | if max_size is not None and len(html) > max_size: 335 | common.logger.info('Webpage is too big: %s' % len(html)) 336 | html = '' # too big to store 337 | elif force_html and not common.is_html(html): 338 | common.logger.info('Webpage is not html') 339 | html = '' # non-html content 340 | elif force_ascii: 341 | html = common.to_ascii(html) # remove non-ascii characters 342 | return html 343 | 344 | 345 | def get_proxy(self, proxies=None): 346 | """Return random proxy if available 347 | """ 348 | if proxies: 349 | proxy = random.choice(proxies) 350 | elif self.settings.proxies: 351 | # select next available proxy 352 | proxy = random.choice(self.settings.proxies) 353 | else: 354 | proxy = None 355 | return proxy 356 | 357 | 358 | # cache the user agent used for each proxy 359 | proxy_agents = {} 360 | def get_user_agent(self, proxy): 361 | """Get user agent for this proxy 362 | """ 363 | if self.settings.keep_ip_ua and proxy in Download.proxy_agents: 364 | # have used this proxy before so return same user agent 365 | user_agent = Download.proxy_agents[proxy] 366 | else: 367 | # assign random user agent to this proxy 368 | user_agent = alg.rand_agent() 369 | Download.proxy_agents[proxy] = user_agent 370 | return user_agent 371 | 372 | 373 | def invalid_response(self, html, pattern): 374 | """Return whether the response contains a regex error pattern 375 | """ 376 | return html is None or (pattern 
and not re.compile(pattern, re.DOTALL | re.IGNORECASE).search(html)) 377 | 378 | 379 | def fetch(self, url, headers=None, data=None, proxy=None, user_agent=None, opener=None, pattern=None, max_size=None, ssl_context=None): 380 | """Simply download the url and return the content 381 | """ 382 | self.error_content = None 383 | # create opener with headers 384 | if not opener: 385 | opener = common.build_opener() 386 | if proxy: 387 | # avoid duplicate ProxyHandler 388 | opener.add_handler(urllib2.ProxyHandler({urlparse.urlparse(url).scheme : proxy})) 389 | if ssl_context is not None: 390 | # add ssl context XXX does not work 391 | https_handler = urllib2.HTTPSHandler(context=ssl_context) 392 | opener.add_handler(https_handler) 393 | 394 | headers = headers or {} 395 | default_headers = settings.default_headers.copy() 396 | default_headers['User-Agent'] = user_agent or self.get_user_agent(proxy) 397 | if not max_size: 398 | default_headers['Accept-Encoding'] = 'gzip, deflate' 399 | lowercase_headers = [name.lower() for name in headers.keys()] 400 | for name, value in default_headers.items(): 401 | if name.lower() not in lowercase_headers: 402 | if name == 'Referer': 403 | value = url 404 | headers[name] = value 405 | if 'Host' in headers and not headers['Host']: 406 | del headers['Host'] # some websites raise an error when host is included 407 | 408 | if isinstance(data, dict): 409 | # encode data for POST 410 | data = urllib.urlencode(sorted(data.items())) 411 | common.logger.info('Downloading %s %s' % (url, data or '')) 412 | try: 413 | request = urllib2.Request(urllib.quote(url, safe='/:?&+=%()'), data, headers) 414 | with contextlib.closing(opener.open(request)) as response: 415 | if max_size is not None: 416 | content = response.read(max_size) 417 | else: 418 | content = response.read() 419 | if response.headers.get('Content-Encoding') == 'gzip': 420 | # data came back gzip-compressed so decompress it 421 | content = gzip.GzipFile(fileobj=StringIO.StringIO(content)).read() 422 | elif response.headers.get('Content-Encoding') == 'deflate': 423 | content = zlib.decompress(content) 424 | self.final_url = response.url # store where redirected to 425 | if self.invalid_response(content, pattern): 426 | # invalid result from download 427 | content = None 428 | common.logger.warning('Content did not match expected pattern: %s' % url) 429 | self.response_code = str(response.code) 430 | self.response_headers = dict(response.headers) 431 | except Exception, e: 432 | self.downloading_error = str(e) 433 | if hasattr(e, 'code'): 434 | self.response_code = str(e.code) 435 | else: 436 | m = re.search('\D(\d\d\d)\D', str(e)) 437 | if m: 438 | self.response_code = m.groups()[0] 439 | 440 | if hasattr(e, 'read'): 441 | try: 442 | self.error_content = e.read() 443 | except Exception, e: 444 | self.error_content = '' 445 | # so many kinds of errors are possible here so just catch them all 446 | common.logger.warning(u'Download error: {} {}'.format(url, self.response_code)) 447 | if self.settings.acceptable_errors and self.response_code in self.settings.acceptable_errors: 448 | content, self.final_url = self.settings.default, url 449 | else: 450 | content, self.final_url = None, url 451 | return content 452 | 453 | 454 | _domains = adt.HashDict() 455 | def throttle(self, url, delay, proxy=None, variance=0.5): 456 | """Delay a minimum time for each domain per proxy by storing last access time 457 | 458 | url 459 | what intend to download 460 | delay 461 | the minimum amount of time (in seconds) to wait after 
downloading content from this domain 462 | proxy 463 | the proxy to download through 464 | variance 465 | the amount of randomness in delay, 0-1 466 | """ 467 | if delay > 0: 468 | key = ':'.join([str(proxy), self.throttle_additional_key or '', common.get_domain(url)]) 469 | if key in Download._domains: 470 | while datetime.datetime.now() < Download._domains.get(key): 471 | time.sleep(SLEEP_TIME) 472 | # update domain timestamp to when can query next 473 | Download._domains[key] = datetime.datetime.now() + datetime.timedelta(seconds=delay * (1 + variance * (random.random() - 0.5))) 474 | 475 | 476 | def reload_proxies(self, timeout=600): 477 | """Check periodically for updated proxy file 478 | 479 | timeout: 480 | the number of seconds before check for updated proxies 481 | """ 482 | if self.settings.proxy_file and time.time() - self.last_load_time > timeout: 483 | self.last_load_time = time.time() 484 | if os.path.exists(self.settings.proxy_file): 485 | if os.stat(self.settings.proxy_file).st_mtime != self.last_mtime: 486 | self.last_mtime = os.stat(self.settings.proxy_file).st_mtime 487 | self.settings.proxies = common.read_list(self.settings.proxy_file) 488 | common.logger.debug('Reloaded proxies from updated file.') 489 | 490 | 491 | def geocode(self, address, delay=5, read_cache=True, num_retries=1, language=None, api_key=None): 492 | gm = GoogleMaps(self) 493 | return gm.geocode(address, delay, read_cache, num_retries, language, api_key=api_key) 494 | 495 | def places(self, api_key, keyword, latitude, longitude, radius=10000, delay=5, num_retries=1, language='en'): 496 | gm = GoogleMaps(self) 497 | return gm.places(api_key, keyword, latitude, longitude, radius, delay, num_retries, language) 498 | 499 | 500 | def get_emails(self, website, max_depth=1, max_urls=10, max_emails=1): 501 | return DataCrawler(self, alg.extract_emails).find(website, max_depth, max_urls, max_emails) 502 | 503 | def get_phones(self, website, max_depth=1, max_urls=10, max_phones=1): 504 | return DataCrawler(self, alg.extract_phones).find(website, max_depth, max_urls, max_phones) 505 | 506 | 507 | def gcache_get(self, url, **kwargs): 508 | """Download webpage via google cache 509 | """ 510 | return self.get('http://www.google.com/search?&q=cache%3A' + urllib.quote(url), **kwargs) 511 | 512 | 513 | def gtrans_get(self, url, **kwargs): 514 | """Download webpage via Google Translation 515 | """ 516 | url = 'http://translate.google.com/translate?sl=nl&anno=2&u=%s' % urllib.quote(url) 517 | html = self.get(url, **kwargs) 518 | if html: 519 | m = re.compile(r']*src="([^"]+)"[^<>]*name=c', re.DOTALL|re.IGNORECASE).search(html) 520 | if m: 521 | frame_src = urlparse.urljoin(url, common.unescape(m.groups()[0].strip())) 522 | # force to check redirect here 523 | html = self.get(frame_src, **kwargs) 524 | if html: 525 | # remove google translations content 526 | return re.compile(r'', re.DOTALL|re.IGNORECASE).sub('', html) 527 | return self.settings.default 528 | 529 | 530 | def archive_get(self, url, timestamp=None, **kwargs): 531 | """Download webpage via the archive.org cache 532 | 533 | url: 534 | The webpage to download 535 | timestamp: 536 | When passed a datetime object will download the cached webpage closest to this date, 537 | If passed a string will use this as timestamp 538 | Else when None (default) will download the most recent archived page. 
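Example (illustrative):

    import datetime
    html = D.archive_get('http://example.com', timestamp=datetime.datetime(2012, 1, 1))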
539 | """ 540 | if hasattr(timestamp, 'strftime'): 541 | formatted_ts = timestamp.strftime('%Y%m%d%H%M%S') 542 | elif isinstance(timestamp, basestring): 543 | formatted_ts = timestamp 544 | else: 545 | formatted_ts = '2' # will return most recent archive 546 | html = self.get('https://web.archive.org/web/%s/%s' % (formatted_ts, url), **kwargs) 547 | if not html and timestamp is None: 548 | # not cached, so get live version 549 | html = self.get('http://liveweb.archive.org/' + url) 550 | match = re.search('

.*?', re.DOTALL).sub('', html) 558 | html = re.compile('', re.DOTALL).sub('', html) 559 | html = re.sub('/web/\d+/', '', html) 560 | return html 561 | 562 | 563 | def whois(self, url, timeout=10): 564 | """Return text of this whois query 565 | """ 566 | domain = common.get_domain(url) 567 | if domain: 568 | text = '' 569 | key = 'whois_%s' % domain 570 | try: 571 | if self.cache: 572 | text = self.cache[key] 573 | else: 574 | raise KeyError() 575 | except KeyError: 576 | # try local whois command 577 | r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE) 578 | start = time.time() 579 | while r.poll() is None: 580 | time.sleep(0.5) 581 | if time.time() - start > timeout: 582 | try: 583 | r.kill() 584 | except Exception, e: 585 | pass 586 | break 587 | if r.poll() != 1: 588 | text = r.communicate()[0] 589 | 590 | if '@' in text: 591 | if self.cache: 592 | self.cache[key] = text 593 | return text 594 | 595 | 596 | def save_as(self, url, filename=None, save_dir='images', override=False): 597 | """Download url and save to disk if does not already exist 598 | 599 | url: 600 | the webpage to download 601 | filename: 602 | output file to save to if not set then will save to file based on URL 603 | override: 604 | whether to download if output file already exists 605 | """ 606 | save_path = os.path.join(save_dir, filename or '%s.%s' % (hashlib.md5(url).hexdigest(), common.get_extension(url))) 607 | if not os.path.exists(save_path) or override: 608 | # need to download 609 | _bytes = self.get(url, num_redirects=0, write_cache=False) 610 | if _bytes: 611 | if not os.path.exists(save_dir): 612 | os.makedirs(save_dir) 613 | open(save_path, 'wb').write(_bytes) 614 | else: 615 | return None 616 | return save_path 617 | 618 | 619 | def get_redirect(url, html): 620 | """Check for meta redirects and return redirect URL if found 621 | """ 622 | match = re.compile(']*?url=(.*?)["\']', re.IGNORECASE).search(html) 623 | if match: 624 | return urlparse.urljoin(url, common.unescape(match.groups()[0].strip())) 625 | 626 | 627 | class GoogleMaps: 628 | def __init__(self, D): 629 | self.D = D 630 | 631 | def geocode(self, address, delay=5, read_cache=True, num_retries=1, language=None, api_key=None): 632 | """Geocode address using Google's API and return dictionary of useful fields 633 | 634 | address: 635 | what to pass to geocode API 636 | delay: 637 | how long to delay between API requests 638 | read_cache: 639 | whether to load content from cache when exists 640 | num_retries: 641 | the number of times to try downloading 642 | language: 643 | the language to set 644 | """ 645 | try: 646 | address = address.encode('utf-8') 647 | except UnicodeDecodeError: 648 | common.logger.debug('Geocode failed to parse address and needed to cast to ascii: ' + address) 649 | address = common.to_ascii(address) 650 | address = re.sub('%C2%9\d', '', urllib.quote_plus(address)) 651 | geocode_url = 'http://maps.google.com/maps/api/geocode/json?address=%s&sensor=false%s' % (address, '&language=' + language if language else '') 652 | try: 653 | # legacy data without api key 654 | geocode_html = self.D.cache[geocode_url] 655 | if geocode_html: 656 | self.D.response_code = '200' 657 | else: 658 | raise KeyError() 659 | except KeyError: 660 | geocode_url = 'https://maps.google.com/maps/api/geocode/json?address=%s&key=%s&sensor=false%s' % (address, api_key or '', '&language=' + language if language else '') 661 | geocode_html = self.D.get(geocode_url, delay=delay, read_cache=read_cache, num_retries=num_retries) 662 | 
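        # parse the JSON response; the lines below return the first result flattened by
        # parse_location(), or an empty defaultdict(str) when geocoding found nothing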
geocode_data = self.load_result(geocode_url, geocode_html) 663 | for result in geocode_data.get('results', []): 664 | return self.parse_location(result) 665 | return collections.defaultdict(str) 666 | 667 | 668 | def places(self, api_key, keyword, latitude, longitude, radius=10000, delay=5, num_retries=1, language='en'): 669 | """Search the Google Place API for this keyword and location 670 | 671 | api_key is the Google Places API key: https://developers.google.com/places/documentation/#Authentication 672 | radius around the location can be a maximum 50000 673 | 674 | Returns a list of up to 200 matching places 675 | """ 676 | search_url_template = 'https://maps.googleapis.com/maps/api/place/radarsearch/json?key={0}&location={1},{2}&radius={3}&keyword={4}&sensor=false' 677 | place_url_template = 'https://maps.googleapis.com/maps/api/place/details/json?key={0}&reference={1}&language={2}&sensor=false' 678 | 679 | search_url = search_url_template.format(api_key, latitude, longitude, radius, keyword).replace(' ', '+') 680 | search_html = self.D.get(search_url, delay=delay, num_retries=num_retries) 681 | search_results = self.load_result(search_url, search_html) 682 | 683 | place_results = [] 684 | # iterate search results 685 | for search_result in search_results.get('results', []): 686 | reference = search_result['reference'] 687 | # found a new place 688 | place_url = place_url_template.format(api_key, reference, language) 689 | place_html = self.D.get(place_url, delay=delay, num_retries=num_retries) 690 | 691 | place = self.load_result(place_url, place_html) 692 | if place: 693 | place = place['result'] 694 | result = self.parse_location(place) 695 | result['name'] = place['name'] 696 | result['categories'] = place['types'] 697 | result['phone'] = place.get('formatted_phone_number', '') 698 | result['website'] = place.get('website', '') 699 | place_results.append(result) 700 | return place_results 701 | 702 | 703 | def load_result(self, url, html): 704 | """Parse the result from API 705 | 706 | If JSON is well formed and status is OK then will return result 707 | Else will return an empty dict 708 | """ 709 | if html: 710 | try: 711 | search_data = json.loads(html) 712 | except ValueError as e: 713 | common.logger.debug(str(e)) 714 | else: 715 | status = search_data['status'] 716 | if status == 'OK': 717 | return search_data 718 | elif status == 'ZERO_RESULTS': 719 | pass 720 | elif status == 'OVER_QUERY_LIMIT': 721 | # error geocoding - try again later 722 | common.logger.info('Over query limit') 723 | self.D.cache[url] = '' 724 | elif status in ('REQUEST_DENIED', 'INVALID_REQUEST'): 725 | common.logger.info('{0}: {1}'.format(status, url)) 726 | return {} 727 | 728 | 729 | def parse_location(self, result): 730 | """Parse address data from Google's geocoding response into a more usable flat structure 731 | 732 | Example: https://developers.google.com/maps/documentation/geocoding/#JSON 733 | """ 734 | results = collections.defaultdict(str) 735 | for e in result['address_components']: 736 | # parse address compenents into flat layer 737 | types, value, abbrev = e['types'], e['long_name'], e['short_name'] 738 | if 'street_number' in types: 739 | results['number'] = value 740 | elif 'route' in types: 741 | results['street'] = value 742 | elif 'postal_code' in types: 743 | results['postcode'] = value 744 | elif 'locality' in types: 745 | results['suburb'] = value 746 | elif 'administrative_area_level_1' in types: 747 | results['state'] = value 748 | results['state_code'] = abbrev 749 | elif 
'administrative_area_level_2' in types: 750 | results['county'] = value 751 | elif 'administrative_area_level_3' in types: 752 | results['district'] = value 753 | elif 'country' in types: 754 | results['country'] = value 755 | results['country_code'] = abbrev 756 | 757 | # extract addresses 758 | results['full_address'] = result['formatted_address'] 759 | if 'street' in results: 760 | results['address'] = (results['number'] + ' ' + results['street']).strip() 761 | 762 | results['lat'] = result['geometry']['location']['lat'] 763 | results['lng'] = result['geometry']['location']['lng'] 764 | results['types'] = result['types'] 765 | return results 766 | 767 | 768 | 769 | class StopCrawl(Exception): 770 | """Raise this exception to interrupt crawl 771 | """ 772 | pass 773 | 774 | 775 | def threaded_get(url=None, urls=None, url_iter=None, num_threads=10, dl=None, cb=None, depth=True, **kwargs): 776 | """Download these urls in parallel 777 | 778 | url: 779 | the webpage to download 780 | urls: 781 | the webpages to download 782 | num_threads: 783 | the number of threads to download urls with 784 | cb: 785 | Called after each download with the HTML of the download. 786 | The arguments are the download object, the url and the downloaded html. 787 | Whatever URLs are returned are added to the crawl queue. 788 | dl: 789 | A callback for customizing the download. 790 | Takes the download object and url and should return the HTML. 791 | depth: 792 | True for depth first search 793 | """ 794 | running = True 795 | lock = threading.Lock() 796 | def add_iter_urls(): 797 | if lock.acquire(False): 798 | for url in url_iter or []: 799 | download_queue.append(url) 800 | break 801 | lock.release() 802 | 803 | 804 | def process_queue(): 805 | """Thread for downloading webpages 806 | """ 807 | D = Download(**kwargs) 808 | 809 | while True: 810 | try: 811 | url = download_queue.pop() if depth else download_queue.popleft() 812 | 813 | except IndexError: 814 | add_iter_urls() 815 | break 816 | 817 | else: 818 | # download this url 819 | html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs) 820 | if cb: 821 | try: 822 | # use callback to process downloaded HTML 823 | result = cb(D, url, html) 824 | 825 | except StopCrawl: 826 | common.logger.info('Stopping crawl signal') 827 | download_queue.clear() # drain the queue so every worker finishes its current URL and exits 828 | 829 | except Exception: 830 | # catch any callback error to avoid losing thread 831 | common.logger.exception('\nIn callback for: ' + str(url)) 832 | 833 | else: 834 | # add these URLs to crawl queue 835 | for link in result or []: 836 | download_queue.append(link) 837 | 838 | # update the crawler state 839 | # no download or error so must have read from cache 840 | num_caches = 0 if D.num_downloads or D.num_errors else 1 841 | state.update(num_downloads=D.num_downloads, num_errors=D.num_errors, num_caches=num_caches, queue_size=len(download_queue)) 842 | 843 | download_queue = collections.deque() 844 | if urls: 845 | download_queue.extend(urls) 846 | if url: 847 | download_queue.append(url) 848 | add_iter_urls() 849 | common.logger.debug('Start new crawl') 850 | 851 | # initiate the state file with the number of URLs already in the queue 852 | state = State() 853 | state.update(queue_size=len(download_queue)) 854 | 855 | # wait for all download threads to finish 856 | threads = [] 857 | while running and (threads or download_queue): 858 | for thread in threads: 859 | if not thread.is_alive(): 860 | threads.remove(thread) 861 | while len(threads) < num_threads and download_queue: 862 | # can start more threads 863 | thread = 
threading.Thread(target=process_queue) 864 | thread.setDaemon(True) # set daemon so main thread can exit when receives ctrl-c 865 | thread.start() 866 | threads.append(thread) 867 | time.sleep(SLEEP_TIME) 868 | # save the final state after threads finish 869 | state.save() 870 | 871 | 872 | 873 | class State: 874 | """Save state of crawl to disk 875 | 876 | output_file: 877 | where to save the state 878 | timeout: 879 | how many seconds to wait between saving the state 880 | """ 881 | def __init__(self, output_file=None, timeout=10): 882 | # where to save state to 883 | self.output_file = output_file or settings.status_file 884 | # how long to wait between saving state 885 | self.timeout = timeout 886 | # track the number of downloads and errors 887 | self.num_downloads = self.num_errors = self.num_caches = self.queue_size = 0 888 | # data to save to disk 889 | self.data = {} 890 | # whether data needs to be saved to dosk 891 | self.flush = False 892 | # track time duration of crawl 893 | self.start_time = time.time() 894 | self.last_time = 0 895 | # a lock to prevent multiple threads writing at once 896 | self.lock = threading.Lock() 897 | 898 | def update(self, num_downloads=0, num_errors=0, num_caches=0, queue_size=0): 899 | """Update the state with these values 900 | 901 | num_downloads: 902 | the number of downloads completed successfully 903 | num_errors: 904 | the number of errors encountered while downloading 905 | num_caches: 906 | the number of webpages read from cache instead of downloading 907 | queue_size: 908 | the number of URL's in the queue 909 | """ 910 | self.num_downloads += num_downloads 911 | self.num_errors += num_errors 912 | self.num_caches += num_caches 913 | self.queue_size = queue_size 914 | self.data['num_downloads'] = self.num_downloads 915 | self.data['num_errors'] = self.num_errors 916 | self.data['num_caches'] = self.num_caches 917 | self.data['queue_size'] = self.queue_size 918 | 919 | if time.time() - self.last_time > self.timeout: 920 | self.lock.acquire() 921 | self.save() 922 | self.lock.release() 923 | 924 | def save(self): 925 | """Save state to disk 926 | """ 927 | self.last_time = time.time() 928 | self.data['duration_secs'] = int(self.last_time - self.start_time) 929 | self.flush = False 930 | text = json.dumps(self.data) 931 | tmp_file = '%s.%d' % (self.output_file, os.getpid()) 932 | fp = open(tmp_file, 'wb') 933 | fp.write(text) 934 | # ensure all content is written to disk 935 | fp.flush() 936 | fp.close() 937 | try: 938 | if os.name == 'nt': 939 | # on windows can not rename if file exists 940 | if os.path.exists(self.output_file): 941 | os.remove(self.output_file) 942 | # atomic copy to new location so state file is never partially written 943 | os.rename(tmp_file, self.output_file) 944 | except OSError: 945 | pass 946 | 947 | 948 | 949 | class CrawlerCallback: 950 | """Example callback to crawl a website 951 | """ 952 | def __init__(self, output_file=None, max_links=100, max_depth=1, allowed_urls='', banned_urls='^$', robots=None, crawl_existing=True): 953 | """ 954 | output_file: 955 | where to save scraped data 956 | max_links: 957 | the maximum number of links to follow per page 958 | max_depth: 959 | the maximum depth to follow links into website (use None for no limit) 960 | allowed_urls: 961 | a regex for allowed urls, defaults to all urls 962 | banned_urls: 963 | a regex for banned urls, defaults to no urls 964 | robots: 965 | RobotFileParser object to determine which urls allowed to crawl 966 | crawl_existing: 967 | sets whether to 
crawl content already downloaded previously in the cache 968 | """ 969 | self.found = adt.HashDict(int) # track depth of found URLs 970 | if output_file: 971 | self.writer = common.UnicodeWriter(output_file) 972 | else: 973 | self.writer = None 974 | self.max_links = max_links 975 | self.max_depth = max_depth 976 | self.allowed_urls = re.compile(allowed_urls) 977 | self.banned_urls = re.compile(banned_urls) 978 | self.robots = robots 979 | self.crawl_existing = crawl_existing 980 | 981 | 982 | def __call__(self, D, url, html): 983 | # override this method to add scraping code ... 984 | return self.crawl(D, url, html) 985 | 986 | 987 | def normalize(self, url, link): 988 | """Normalize the link to avoid duplicates 989 | 990 | >>> cb = CrawlerCallback() 991 | >>> cb.normalize('http://example.com', '../abc.html') 992 | 'http://example.com/abc.html' 993 | >>> cb.normalize('http://example.com', 'abc.html#link') 994 | 'http://example.com/abc.html' 995 | >>> cb.normalize('http://example.com', 'abc.html?a=1&b=2') 996 | 'http://example.com/abc.html?a=1&b=2' 997 | """ 998 | link, _ = urlparse.urldefrag(link) # remove hash to avoid duplicates 999 | link = common.unescape(link) # parse escaped characters such as & 1000 | link = urlparse.urljoin(url, link) # support relative links 1001 | while urlparse.urlsplit(link).path.startswith('/..'): 1002 | # remove invalid parent directory 1003 | link = link.replace('/..', '', 1) 1004 | return link 1005 | 1006 | 1007 | def crawl(self, D, url, html): 1008 | """Crawl website html and return list of URLs crawled 1009 | """ 1010 | def valid(link): 1011 | """Check if should crawl this link 1012 | """ 1013 | # check if a media file 1014 | if common.get_extension(link) not in common.MEDIA_EXTENSIONS: 1015 | # check if a proper HTTP link 1016 | if link.lower().startswith('http'): 1017 | # only crawl within website 1018 | if common.same_domain(domain, link): 1019 | # passes regex 1020 | if self.allowed_urls.match(link) and not self.banned_urls.match(link): 1021 | # not blocked by robots.txt 1022 | if not self.robots or self.robots.can_fetch(settings.user_agent, link): 1023 | # allowed to recrawl 1024 | if self.crawl_existing or (D.cache and link not in D.cache): 1025 | return True 1026 | return False 1027 | 1028 | domain = common.get_domain(url) 1029 | depth = self.found[url] 1030 | outstanding = [] 1031 | if depth != self.max_depth: 1032 | # extract links to continue crawling 1033 | links_re = re.compile(']+href=["\'](.*?)["\']', re.IGNORECASE) 1034 | for link in links_re.findall(html): 1035 | try: 1036 | link = self.normalize(url, link) 1037 | except UnicodeDecodeError as e: 1038 | # unicode error when joining url 1039 | common.logger.info(e) 1040 | else: 1041 | if link not in self.found: 1042 | self.found[link] = depth + 1 1043 | if valid(link): 1044 | # is a new link 1045 | outstanding.append(link) 1046 | if len(outstanding) == self.max_links: 1047 | break 1048 | return outstanding 1049 | 1050 | 1051 | class DataCrawler: 1052 | """Crawl a website and return all matches extracted using a given function 1053 | """ 1054 | def __init__(self, D, extract_fn): 1055 | """ 1056 | extract_fn: 1057 | a function to parse given HTML and return a list of matches 1058 | """ 1059 | self.D = D 1060 | self.extract_fn = extract_fn 1061 | 1062 | def link_score(self, link): 1063 | """Return how valuable this link is for ordering crawling 1064 | The lower the better""" 1065 | link = link.lower() 1066 | total = 0 1067 | if 'contact' in link: 1068 | pass # this page is top priority 1069 | 
elif 'about' in link: 1070 | total += 10 1071 | elif 'help' in link: 1072 | total += 20 1073 | else: 1074 | # generic page 1075 | total += 100 1076 | # bias towards shorter links 1077 | total += len(link) 1078 | return total 1079 | 1080 | def find(self, website, max_depth, max_urls, max_results): 1081 | """ 1082 | website: 1083 | the URL of website to crawl 1084 | max_depth: 1085 | how many links deep to follow before stop crawl 1086 | max_urls: 1087 | how many URL's to download before stop crawl 1088 | max_results: 1089 | The maximum number of results to extract before stop crawl. 1090 | If None then extract all results found in crawl. 1091 | """ 1092 | # check for redirect URL 1093 | self.D.get(website) 1094 | redirect_url = self.D.cache.meta(website).get('url') if self.D.cache else self.final_url 1095 | website = redirect_url or website 1096 | 1097 | domain = urlparse.urlparse(website).netloc 1098 | scraped = adt.HashDict() 1099 | c = CrawlerCallback(max_depth=max_depth) 1100 | outstanding = [(0, website)] # list of URLs and their score 1101 | results = [] 1102 | while outstanding and (max_urls is None or len(scraped) < max_urls) \ 1103 | and (max_results is None or len(results) < max_results): 1104 | _, url = outstanding.pop(0) 1105 | scraped[url] = True 1106 | html = self.D.get(url, num_retries=0) 1107 | 1108 | if html: 1109 | for result in self.extract_fn(html): 1110 | if result not in results: 1111 | results.append(result) 1112 | if len(results) == max_results: 1113 | break 1114 | # crawl the linked URLs 1115 | for link in c.crawl(self, url, html): 1116 | if urlparse.urlparse(link).netloc == domain: 1117 | if link not in scraped: 1118 | # insert sort this new record so crawl most promising first 1119 | score = self.link_score(link) 1120 | for i, (other_score, other_link) in enumerate(outstanding): 1121 | if score < other_score: 1122 | outstanding.insert(i, ((score, link))) 1123 | break 1124 | else: 1125 | outstanding.append((score, link)) 1126 | return results 1127 | -------------------------------------------------------------------------------- /pdict.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | pdict has a dictionary like interface and a sqlite backend 3 | It uses pickle to store Python objects and strings, which are then compressed 4 | Multithreading is supported 5 | """ 6 | 7 | import os 8 | import sys 9 | import datetime 10 | import time 11 | import sqlite3 12 | import zlib 13 | import itertools 14 | import threading 15 | import md5 16 | import shutil 17 | import glob 18 | try: 19 | import cPickle as pickle 20 | except ImportError: 21 | import pickle 22 | try: 23 | # gdbm produces best performance 24 | import gdbm as dbm 25 | except ImportError: 26 | import anydbm as dbm 27 | 28 | DEFAULT_LIMIT = 1000 29 | DEFAULT_TIMEOUT = 10000 30 | 31 | 32 | 33 | def opendb(*argv, **kwargs): 34 | try: 35 | db = PersistentDict(*argv, **kwargs) 36 | except sqlite3.DatabaseError: 37 | db = DbmDict(*argv, **kwargs) 38 | #except dbm.error: 39 | return db 40 | 41 | 42 | class PersistentDict: 43 | """Stores and retrieves persistent data through a dict-like interface 44 | Data is stored compressed on disk using sqlite3 45 | 46 | filename: 47 | where to store sqlite database. Uses in memory by default. 48 | compress_level: 49 | between 1-9 (in my test levels 1-3 produced a 1300kb file in ~7 seconds while 4-9 a 288kb file in ~9 seconds) 50 | expires: 51 | a timedelta object of how old data can be before expires. 
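For example, passing expires=datetime.timedelta(hours=1) makes entries older than one hour behave as if they were never stored.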
By default is set to None to disable. 52 | timeout: 53 | how long should a thread wait for sqlite to be ready (in ms) 54 | isolation_level: 55 | None for autocommit or else 'DEFERRED' / 'IMMEDIATE' / 'EXCLUSIVE' 56 | 57 | >>> cache = PersistentDict() 58 | >>> url = 'http://google.com/abc' 59 | >>> html = 'abc' 60 | >>> 61 | >>> url in cache 62 | False 63 | >>> len(cache) 64 | 0 65 | >>> cache[url] = html 66 | >>> url in cache 67 | True 68 | >>> len(cache) 69 | 1 70 | >>> cache[url] == html 71 | True 72 | >>> cache.get(url)['value'] == html 73 | True 74 | >>> cache.meta(url) 75 | {} 76 | >>> cache.meta(url, 'meta') 77 | >>> cache.meta(url) 78 | 'meta' 79 | >>> del cache[url] 80 | >>> url in cache 81 | False 82 | >>> os.remove(cache.filename) 83 | """ 84 | def __init__(self, filename='cache.db', compress_level=6, expires=None, timeout=DEFAULT_TIMEOUT, isolation_level=None): 85 | """initialize a new PersistentDict with the specified database file. 86 | """ 87 | self.filename = filename 88 | self.compress_level, self.expires, self.timeout, self.isolation_level = \ 89 | compress_level, expires, timeout, isolation_level 90 | self.conn = sqlite3.connect(filename, timeout=timeout, isolation_level=isolation_level, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) 91 | self.conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace') 92 | sql = """ 93 | CREATE TABLE IF NOT EXISTS config ( 94 | key TEXT NOT NULL PRIMARY KEY UNIQUE, 95 | value BLOB, 96 | meta BLOB, 97 | status INTEGER, 98 | updated timestamp DEFAULT (datetime('now', 'localtime')) 99 | ); 100 | """ 101 | self.conn.execute(sql) 102 | self.conn.execute("CREATE INDEX IF NOT EXISTS keys ON config (key);") 103 | 104 | 105 | def __copy__(self): 106 | """make a copy of current cache settings 107 | """ 108 | return PersistentDict(filename=self.filename, compress_level=self.compress_level, expires=self.expires, 109 | timeout=self.timeout, isolation_level=self.isolation_level) 110 | 111 | 112 | def __contains__(self, key): 113 | """check the database to see if a key exists 114 | """ 115 | row = self.conn.execute("SELECT updated FROM config WHERE key=?;", (key,)).fetchone() 116 | return row and self.is_fresh(row[0]) 117 | 118 | 119 | def contains(self, keys, ignore_expires=False): 120 | """check if a list of keys exist 121 | 122 | >>> # try 0 second expiration so expires immediately 123 | >>> cache = PersistentDict(expires=datetime.timedelta(seconds=0)) 124 | >>> cache['a'] = 1; 125 | >>> cache.contains(['a', 'b']) 126 | [] 127 | >>> cache.contains(['a', 'b'], ignore_expires=True) 128 | [u'a'] 129 | >>> os.remove(cache.filename) 130 | """ 131 | results = [] 132 | c = self.conn.cursor() 133 | c.execute("SELECT key, updated FROM config WHERE key IN (%s);" % ','.join(len(keys)*'?'), keys) 134 | for row in c: 135 | if ignore_expires or self.is_fresh(row[1]): 136 | results.append(row[0]) 137 | return results 138 | 139 | 140 | def __iter__(self): 141 | """iterate each key in the database 142 | """ 143 | c = self.conn.cursor() 144 | c.execute("SELECT key FROM config;") 145 | for row in c: 146 | yield row[0] 147 | 148 | 149 | def __nonzero__(self): 150 | return True 151 | 152 | 153 | def __len__(self): 154 | """Return the number of entries in the cache 155 | """ 156 | c = self.conn.cursor() 157 | c.execute("SELECT count(*) FROM config;") 158 | return c.fetchone()[0] 159 | 160 | 161 | def __getitem__(self, key): 162 | """return the value of the specified key or raise KeyError if not found 163 | """ 164 | row = self.conn.execute("SELECT value, 
updated FROM config WHERE key=?;", (key,)).fetchone() 165 | if row: 166 | if self.is_fresh(row[1]): 167 | value = row[0] 168 | return self.deserialize(value) 169 | else: 170 | raise KeyError("Key `%s' is stale" % key) 171 | else: 172 | raise KeyError("Key `%s' does not exist" % key) 173 | 174 | 175 | def __delitem__(self, key): 176 | """remove the specifed value from the database 177 | """ 178 | self.conn.execute("DELETE FROM config WHERE key=?;", (key,)) 179 | 180 | 181 | def __setitem__(self, key, value): 182 | """set the value of the specified key 183 | """ 184 | updated = datetime.datetime.now() 185 | self.conn.execute("INSERT OR REPLACE INTO config (key, value, meta, updated) VALUES(?, ?, ?, ?);", ( 186 | key, self.serialize(value), self.serialize({}), updated) 187 | ) 188 | 189 | 190 | def serialize(self, value): 191 | """convert object to a compressed pickled string to save in the db 192 | """ 193 | return sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), self.compress_level)) 194 | 195 | def deserialize(self, value): 196 | """convert compressed pickled string from database back into an object 197 | """ 198 | if value: 199 | return pickle.loads(zlib.decompress(value)) 200 | 201 | 202 | def is_fresh(self, t): 203 | """returns whether this datetime has expired 204 | """ 205 | return self.expires is None or datetime.datetime.now() - t < self.expires 206 | 207 | 208 | def get(self, key, default=None): 209 | """Get data at key and return default if not defined 210 | """ 211 | data = default 212 | if key: 213 | row = self.conn.execute("SELECT value, meta, updated FROM config WHERE key=?;", (key,)).fetchone() 214 | if row: 215 | if self.is_fresh(row[2]): 216 | value = row[0] 217 | data = dict( 218 | value=self.deserialize(value), 219 | meta=self.deserialize(row[1]), 220 | updated=row[2] 221 | ) 222 | return data 223 | 224 | 225 | def meta(self, key, value=None): 226 | """Get / set meta for this value 227 | 228 | if value is passed then set the meta attribute for this key 229 | if not then get the existing meta data for this key 230 | """ 231 | if value is None: 232 | # want to get meta 233 | row = self.conn.execute("SELECT meta FROM config WHERE key=?;", (key,)).fetchone() 234 | if row: 235 | return self.deserialize(row[0]) 236 | else: 237 | raise KeyError("Key `%s' does not exist" % key) 238 | else: 239 | # want to set meta 240 | self.conn.execute("UPDATE config SET meta=?, updated=? WHERE key=?;", (self.serialize(value), datetime.datetime.now(), key)) 241 | 242 | 243 | def clear(self): 244 | """Clear all cached data 245 | """ 246 | self.conn.execute("DELETE FROM config;") 247 | 248 | 249 | def merge(self, db, override=False): 250 | """Merge this databases content 251 | override determines whether to override existing keys 252 | """ 253 | for key in db.keys(): 254 | if override or key not in self: 255 | self[key] = db[key] 256 | 257 | 258 | def vacuum(self): 259 | self.conn.execute('VACUUM') 260 | 261 | 262 | class DbmDict: 263 | """Experimental new version of PersistentDict that uses the dbm modules instead 264 | This allows lazy writes instead of a transaction for each write 265 | 266 | filename: 267 | where to store sqlite database. Uses in memory by default. 
268 | compress_level: 269 | between 1-9 (in my test levels 1-3 produced a 1300kb file in ~7 seconds while 4-9 a 288kb file in ~9 seconds) 270 | 271 | >>> filename = 'dbm.db' 272 | >>> cache = DbmDict(filename) 273 | >>> url = 'http://google.com/abc' 274 | >>> html = 'abc' 275 | >>> 276 | >>> url in cache 277 | False 278 | >>> cache[url] = html 279 | >>> url in cache 280 | True 281 | >>> cache[url] == html 282 | True 283 | >>> cache.meta(url) 284 | {} 285 | >>> cache.meta(url, 'meta') 286 | >>> cache.meta(url) 287 | 'meta' 288 | >>> urls = list(cache) 289 | >>> del cache[url] 290 | >>> url in cache 291 | False 292 | >>> os.remove(filename) 293 | """ 294 | def __init__(self, filename='dbm.db', compress_level=6): 295 | """initialize a new PersistentDict with the specified database file. 296 | """ 297 | self.filename, self.compress_level = filename, compress_level 298 | self.db = dbm.open(filename, 'c') 299 | self.lock = threading.Lock() 300 | 301 | 302 | def __copy__(self): 303 | """make a copy of current cache settings 304 | """ 305 | return PersistentDict(filename=self.filename, compress_level=self.compress_level) 306 | 307 | 308 | def __contains__(self, key): 309 | """check the database to see if a key exists 310 | """ 311 | with self.lock: 312 | return self.db.has_key(key) 313 | 314 | 315 | def __iter__(self): 316 | """iterate each key in the database 317 | """ 318 | with self.lock: 319 | key = self.db.firstkey() 320 | while key != None: 321 | yield key 322 | with self.lock: 323 | key = self.db.nextkey(key) 324 | 325 | 326 | def __getitem__(self, key): 327 | """return the value of the specified key or raise KeyError if not found 328 | """ 329 | with self.lock: 330 | value = self.db[key] 331 | return self.deserialize(value) 332 | 333 | 334 | def __delitem__(self, key): 335 | """remove the specifed value from the database 336 | """ 337 | with self.lock: 338 | del self.db[key] 339 | 340 | 341 | def __setitem__(self, key, value): 342 | """set the value of the specified key 343 | """ 344 | value = self.serialize(value) 345 | with self.lock: 346 | self.db[key] = value 347 | 348 | 349 | def serialize(self, value): 350 | """convert object to a compressed pickled string to save in the db 351 | """ 352 | return zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), self.compress_level) 353 | 354 | 355 | def deserialize(self, value): 356 | """convert compressed pickled string from database back into an object 357 | """ 358 | if value: 359 | return pickle.loads(zlib.decompress(value)) 360 | 361 | 362 | def get(self, key, default=None): 363 | """Get data at key and return default if not defined 364 | """ 365 | try: 366 | value = self[key] 367 | except KeyError: 368 | value = default 369 | return value 370 | 371 | 372 | def meta(self, key, value=None, prefix='__meta__'): 373 | """Get / set meta for this value 374 | 375 | if value is passed then set the meta attribute for this key 376 | if not then get the existing meta data for this key 377 | """ 378 | key = prefix + key 379 | if value is None: 380 | # get the meta data 381 | return self.get(key, {}) 382 | else: 383 | # set the meta data 384 | self[key] = value 385 | 386 | 387 | def clear(self): 388 | """Clear all cached data 389 | """ 390 | for key in self: 391 | del self[key] 392 | 393 | 394 | def merge(self, db, override=False): 395 | """Merge this databases content 396 | override determines whether to override existing keys 397 | """ 398 | for key in db: 399 | if override or key not in self: 400 | self[key] = db[key] 401 | 402 | class 
Queue: 403 | """Stores queue of outstanding URL's on disk 404 | 405 | >>> filename = 'queue.db' 406 | >>> queue = Queue(filename) 407 | >>> keys = [('a', 1), ('b', 2), ('c', 1)] 408 | >>> queue.push(keys) # add new keys 409 | >>> len(queue) 410 | 3 411 | >>> queue.push(keys) # trying adding duplicate keys 412 | >>> len(queue) 413 | 3 414 | >>> queue.clear(keys=['a']) 415 | 1 416 | >>> queue.pull(limit=1) 417 | [u'b'] 418 | >>> queue.clear() # remove all queue 419 | 1 420 | >>> os.remove(filename) 421 | """ 422 | size = None # track the size of the queue 423 | counter = itertools.count(1).next # counter gives a unique status for each pull() 424 | 425 | def __init__(self, filename, timeout=DEFAULT_TIMEOUT, isolation_level=None): 426 | self._conn = sqlite3.connect(filename, timeout=timeout, isolation_level=isolation_level, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) 427 | self._conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace') 428 | sql = """ 429 | CREATE TABLE IF NOT EXISTS queue ( 430 | key TEXT NOT NULL PRIMARY KEY UNIQUE, 431 | status INTEGER, 432 | priority INTEGER 433 | ); 434 | """ 435 | self._conn.execute(sql) 436 | self._conn.execute("CREATE INDEX IF NOT EXISTS priorities ON queue (priority);") 437 | if Queue.size is None: 438 | self._update_size() 439 | 440 | 441 | def __len__(self): 442 | """Get number of records queued 443 | """ 444 | return Queue.size 445 | 446 | def _update_size(self): 447 | """Calculate the number of records queued 448 | """ 449 | row = self._conn.execute("SELECT count(*) FROM queue WHERE status=?;", (False,)).fetchone() 450 | Queue.size = row[0] 451 | 452 | 453 | def push(self, key_map): 454 | """Add these keys to the queue 455 | Will not insert if key already exists. 456 | 457 | key_map: 458 | a list of (key, priority) tuples 459 | """ 460 | if key_map: 461 | c = self._conn.cursor() 462 | c.execute("BEGIN TRANSACTION") 463 | c.executemany("INSERT OR IGNORE INTO queue (key, priority, status) VALUES(?, ?, ?);", [(key, priority, False) for key, priority in key_map]) 464 | c.execute("END TRANSACTION") 465 | self._update_size() 466 | 467 | 468 | def pull(self, limit=DEFAULT_LIMIT): 469 | """Get queued keys up to limit 470 | """ 471 | status = Queue.counter() 472 | self._conn.execute('UPDATE queue SET status=? WHERE key in (SELECT key FROM queue WHERE status=? ORDER BY priority DESC LIMIT ?);', (status, False, limit)) 473 | rows = self._conn.execute('SELECT key FROM queue WHERE status=? LIMIT ?', (status, limit)) 474 | keys = [row[0] for row in rows] 475 | Queue.size -= len(keys) 476 | if Queue.size < 0: 477 | Queue.size = 0 478 | return keys 479 | 480 | 481 | def clear(self, keys=None): 482 | """Remove keys from queue. 483 | If keys is None remove all. 484 | 485 | Returns the number of keys removed 486 | """ 487 | prev_size = len(self) 488 | c = self._conn.cursor() 489 | if keys: 490 | c.execute("BEGIN TRANSACTION") 491 | c.executemany("DELETE FROM queue WHERE key=?;", [(key,) for key in keys]) 492 | c.execute("END TRANSACTION") 493 | self._update_size() 494 | else: 495 | c.execute("DELETE FROM queue;") 496 | Queue.size = 0 497 | return prev_size - len(self) 498 | 499 | 500 | 501 | class FSCache: 502 | """ 503 | Dictionary interface that stores cached 504 | values in the file system rather than in memory. 505 | The file path is formed from an md5 hash of the key. 
506 | 507 | folder: 508 | the root level folder for the cache 509 | 510 | >>> fscache = FSCache('.') 511 | >>> url = 'http://google.com/abc' 512 | >>> html = 'abc' 513 | >>> url in fscache 514 | False 515 | >>> fscache[url] = html 516 | >>> url in fscache 517 | True 518 | >>> fscache.get(url) == html 519 | True 520 | >>> fscache.get(html) == '' 521 | True 522 | >>> fscache.clear() 523 | """ 524 | PARENT_DIR = 'fscache' 525 | FILE_NAME = 'index.html' 526 | 527 | def __init__(self, folder): 528 | self.folder = os.path.join(folder, FSCache.PARENT_DIR) 529 | 530 | 531 | def __contains__(self, key): 532 | """Does data for this key exist 533 | """ 534 | return os.path.exists(self._key_path(key)) 535 | 536 | 537 | def __getitem__(self, key): 538 | path = self._key_path(key) 539 | try: 540 | fp = open(path, 'rb') 541 | except IOError: 542 | # key does not exist 543 | raise KeyError('%s does not exist' % key) 544 | else: 545 | # get value in key 546 | return fp.read() 547 | 548 | 549 | def __setitem__(self, key, value): 550 | """Save value at this key to this value 551 | """ 552 | path = self._key_path(key) 553 | folder = os.path.dirname(path) 554 | if not os.path.exists(folder): 555 | os.makedirs(folder) 556 | open(path, 'wb').write(value) 557 | 558 | 559 | def __delitem__(self, key): 560 | """Remove the value at this key and any empty parent sub-directories 561 | """ 562 | path = self._key_path(key) 563 | try: 564 | os.remove(path) 565 | os.removedirs(os.path.dirname(path)) 566 | except OSError: 567 | pass 568 | 569 | def _key_path(self, key): 570 | """The fils system path for this key 571 | """ 572 | # create unique hash for this key 573 | try: 574 | key = key.encode('utf-8') 575 | except UnicodeDecodeError: 576 | pass 577 | h = md5.md5(key).hexdigest() 578 | # create file system path 579 | path = os.path.join(self.folder, os.path.sep.join(h), FSCache.FILE_NAME) 580 | return path 581 | 582 | 583 | def get(self, key, default=''): 584 | """Get data at this key and return default if does not exist 585 | """ 586 | try: 587 | value = self[key] 588 | except KeyError: 589 | value = default 590 | return value 591 | 592 | 593 | def clear(self): 594 | """Remove all the cached values 595 | """ 596 | if os.path.exists(self.folder): 597 | shutil.rmtree(self.folder) 598 | 599 | 600 | 601 | if __name__ == '__main__': 602 | import tempfile 603 | import webbrowser 604 | from optparse import OptionParser 605 | parser = OptionParser(usage='usage: %prog [options] ') 606 | parser.add_option('-k', '--key', dest='key', help='The key to use') 607 | parser.add_option('-v', '--value', dest='value', help='The value to store') 608 | parser.add_option('-b', '--browser', action='store_true', dest='browser', default=False, help='View content of this key in a web browser') 609 | parser.add_option('-c', '--clear', action='store_true', dest='clear', default=False, help='Clear all data for this cache') 610 | parser.add_option('-s', '--size', action='store_true', dest='size', default=False, help='Display size of database') 611 | options, args = parser.parse_args() 612 | if not args: 613 | parser.error('Must specify the cache file') 614 | cache = PersistentDict(args[0]) 615 | 616 | if options.value: 617 | # store thie value 618 | if options.key: 619 | cache[options.key] = options.value 620 | else: 621 | parser.error('Must specify the key') 622 | elif options.browser: 623 | if options.key: 624 | value = cache[options.key] 625 | filename = tempfile.NamedTemporaryFile().name 626 | fp = open(filename, 'w') 627 | fp.write(str(value)) 
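# flush now so the browser opens the complete content rather than a partially written temp file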
628 | fp.flush() 629 | webbrowser.open(filename) 630 | else: 631 | parser.error('Must specify the key') 632 | elif options.key: 633 | print cache[options.key] 634 | elif options.clear: 635 | if raw_input('Really? Clear the cache? (y/n) ') == 'y': 636 | cache.clear() 637 | print 'cleared' 638 | elif options.size: 639 | print len(cache) 640 | else: 641 | parser.error('No options selected') 642 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'default application wide settings' 2 | 3 | import sys 4 | import os 5 | import logging 6 | 7 | 8 | # default location to store output state files 9 | dirname, filename = os.path.split(sys.argv[0]) 10 | state_dir = os.path.join(dirname, '.' + filename.replace('.py', '')) 11 | if not os.path.exists(state_dir): 12 | try: 13 | os.mkdir(state_dir) 14 | except OSError as e: 15 | state_dir = '' 16 | #print 'Unable to create state directory:', e 17 | cache_file = os.path.relpath(os.path.join(state_dir, 'cache.db')) # file to use for pdict cache 18 | queue_file = os.path.relpath(os.path.join(state_dir, 'queue.db')) # file to use for pdict queue 19 | status_file = os.path.join(state_dir, 'status.js') # where to store state of crawl 20 | log_file = os.path.join(state_dir, 'webscraping.log') # default logging file 21 | 22 | log_level = logging.INFO # logging level 23 | default_encoding = 'utf-8' 24 | default_headers = { 25 | 'Referer': '', 26 | 'Accept-Language': 'en-us,en;q=0.5', 27 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' 28 | } 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from distutils.core import setup 3 | 4 | def read(filename): 5 | return open(os.path.join(os.path.dirname(__file__), filename)).read() 6 | 7 | setup( 8 | name='webscraping', 9 | version='1.7.1', 10 | packages=['webscraping'], 11 | package_dir={'webscraping':'.'}, # look for package contents in current directory 12 | author='Richard Penman', 13 | author_email='richard@webscraping.com', 14 | description='Pure python library aimed to make web scraping easier', 15 | long_description=read('README.rst'), 16 | url='https://github.com/richardpenman/webscraping', 17 | classifiers = [ 18 | 'Environment :: Web Environment', 19 | 'Intended Audience :: Developers', 20 | 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', 21 | 'Operating System :: OS Independent', 22 | 'Programming Language :: Python', 23 | 'Topic :: Internet :: WWW/HTTP' 24 | ], 25 | license='lgpl' 26 | ) 27 | -------------------------------------------------------------------------------- /webkit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __doc__ = 'Interface to qt webkit for loading and interacting with JavaScript dependent webpages' 4 | 5 | import sys, os, re, urllib2, random, itertools, json 6 | from time import time, sleep 7 | from datetime import datetime 8 | 9 | # for using native Python strings 10 | import sip 11 | sip.setapi('QString', 2) 12 | from PyQt4.QtGui import QApplication, QDesktopServices, QImage, QPainter, QMouseEvent, QKeyEvent, QKeySequence 13 | from PyQt4.QtCore import Qt, QByteArray, QUrl, QTimer, QEventLoop, QIODevice, QObject, QPoint, QEvent 14 | from PyQt4.QtWebKit import QWebFrame, 
QWebView, QWebElement, QWebPage, QWebSettings, QWebInspector 15 | from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkProxy, QNetworkRequest, QNetworkReply, QNetworkDiskCache 16 | 17 | # maximum number of bytes to read from a POST request 18 | MAX_POST_SIZE = 2 ** 25 19 | 20 | import alg, common, pdict, settings 21 | 22 | 23 | class NetworkAccessManager(QNetworkAccessManager): 24 | def __init__(self, proxy, use_cache): 25 | """Subclass QNetworkAccessManager for finer control network operations 26 | 27 | proxy: the string of a proxy to download through 28 | use_cache: whether to cache replies so that can load faster with the same content subsequent times 29 | """ 30 | super(NetworkAccessManager, self).__init__() 31 | self.setProxy(proxy) 32 | self.sslErrors.connect(self.sslErrorHandler) 33 | # the requests that are still active 34 | self.active_requests = [] 35 | self.cache = pdict.PersistentDict(settings.cache_file) if use_cache else None 36 | 37 | 38 | def shutdown(self): 39 | """Network is shutting down event 40 | """ 41 | # prevent new requests 42 | self.setNetworkAccessible(QNetworkAccessManager.NotAccessible) 43 | # abort existing requests 44 | for request in self.active_requests: 45 | request.abort() 46 | request.deleteLater() 47 | 48 | 49 | def setProxy(self, proxy): 50 | """Parse proxy components from proxy 51 | """ 52 | if proxy: 53 | fragments = common.parse_proxy(proxy) 54 | if fragments['host']: 55 | QNetworkAccessManager.setProxy(self, 56 | QNetworkProxy(QNetworkProxy.HttpProxy, 57 | fragments['host'], int(fragments['port']), 58 | fragments['username'], fragments['password'] 59 | ) 60 | ) 61 | else: 62 | common.logger.info('Invalid proxy: ' + str(proxy)) 63 | 64 | 65 | def createRequest(self, operation, request, post): 66 | """Override creating a network request 67 | """ 68 | url = request.url().toString() 69 | if str(request.url().path()).endswith('.ttf'): 70 | # block fonts, which can cause webkit to crash 71 | common.logger.debug(u'Blocking: {}'.format(url)) 72 | request.setUrl(QUrl()) 73 | 74 | data = post if post is None else post.peek(MAX_POST_SIZE) 75 | key = u'{} {}'.format(url, data) 76 | use_cache = not url.startswith('file') 77 | if self.cache is not None and use_cache and key in self.cache: 78 | common.logger.debug(u'Load from cache: {}'.format(key)) 79 | content, headers, attributes = self.cache[key] 80 | reply = CachedNetworkReply(self, request.url(), content, headers, attributes) 81 | else: 82 | common.logger.debug(u'Request: {} {}'.format(url, post or '')) 83 | reply = QNetworkAccessManager.createRequest(self, operation, request, post) 84 | reply.error.connect(self.catch_error) 85 | self.active_requests.append(reply) 86 | reply.destroyed.connect(self.active_requests.remove) 87 | # save reference to original request 88 | reply.content = QByteArray() 89 | reply.readyRead.connect(self._save_content(reply)) 90 | if self.cache is not None and use_cache: 91 | reply.finished.connect(self._cache_content(reply, key)) 92 | reply.orig_request = request 93 | reply.data = self.parse_data(data) 94 | return reply 95 | 96 | 97 | def _save_content(self, r): 98 | """Save copy of reply content before is lost 99 | """ 100 | def save_content(): 101 | r.content.append(r.peek(r.size())) 102 | return save_content 103 | 104 | def _cache_content(self, r, key): 105 | """Cache downloaded content 106 | """ 107 | def cache_content(): 108 | headers = [(header, r.rawHeader(header)) for header in r.rawHeaderList()] 109 | attributes = [] 110 | 
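# only a small, picklable subset of reply attributes is cached below (status code,
# reason phrase, connection-encrypted flag); each value is unwrapped from its
# QVariant (toInt / toByteArray / toBool) before being stored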
attributes.append((QNetworkRequest.HttpStatusCodeAttribute, r.attribute(QNetworkRequest.HttpStatusCodeAttribute).toInt())) 111 | attributes.append((QNetworkRequest.HttpReasonPhraseAttribute, r.attribute(QNetworkRequest.HttpReasonPhraseAttribute).toByteArray())) 112 | #attributes.append((QNetworkRequest.RedirectionTargetAttribute, r.attribute(QNetworkRequest.RedirectionTargetAttribute).toUrl())) 113 | attributes.append((QNetworkRequest.ConnectionEncryptedAttribute, r.attribute(QNetworkRequest.ConnectionEncryptedAttribute).toBool())) 114 | #attributes.append((QNetworkRequest.CacheLoadControlAttribute, r.attribute(QNetworkRequest.CacheLoadControlAttribute).toInt())) 115 | #attributes.append((QNetworkRequest.CacheSaveControlAttribute, r.attribute(QNetworkRequest.CacheSaveControlAttribute).toBool())) 116 | #attributes.append((QNetworkRequest.SourceIsFromCacheAttribute, r.attribute(QNetworkRequest.SourceIsFromCacheAttribute).toBool())) 117 | #print 'save cache:', key, len(r.content), len(headers), attributes 118 | self.cache[key] = r.content, headers, attributes 119 | return cache_content 120 | 121 | 122 | def parse_data(self, data): 123 | """Parse this posted data into a list of key/value pairs 124 | """ 125 | if data is None: 126 | result = [] 127 | else: 128 | try: 129 | result = json.loads(unicode(data)) 130 | if isinstance(result, dict): 131 | result = result.items() 132 | if not isinstance(result, list): 133 | common.logger.info(u'Unexpected data format: {}'.format(result)) 134 | result = [] 135 | except ValueError: 136 | url = QUrl('') 137 | url.setEncodedQuery(data) 138 | result = url.queryItems() 139 | return result 140 | 141 | 142 | def catch_error(self, eid): 143 | """Interpret the HTTP error ID received 144 | """ 145 | if eid not in (5, 301): 146 | errors = { 147 | 0 : 'no error condition. Note: When the HTTP protocol returns a redirect no error will be reported. You can check if there is a redirect with the QNetworkRequest::RedirectionTargetAttribute attribute.', 148 | 1 : 'the remote server refused the connection (the server is not accepting requests)', 149 | 2 : 'the remote server closed the connection prematurely, before the entire reply was received and processed', 150 | 3 : 'the remote host name was not found (invalid hostname)', 151 | 4 : 'the connection to the remote server timed out', 152 | 5 : 'the operation was canceled via calls to abort() or close() before it was finished.', 153 | 6 : 'the SSL/TLS handshake failed and the encrypted channel could not be established. The sslErrors() signal should have been emitted.', 154 | 7 : 'the connection was broken due to disconnection from the network, however the system has initiated roaming to another access point. 
The request should be resubmitted and will be processed as soon as the connection is re-established.', 155 | 101 : 'the connection to the proxy server was refused (the proxy server is not accepting requests)', 156 | 102 : 'the proxy server closed the connection prematurely, before the entire reply was received and processed', 157 | 103 : 'the proxy host name was not found (invalid proxy hostname)', 158 | 104 : 'the connection to the proxy timed out or the proxy did not reply in time to the request sent', 159 | 105 : 'the proxy requires authentication in order to honour the request but did not accept any credentials offered (if any)', 160 | 201 : 'the access to the remote content was denied (similar to HTTP error 401)', 161 | 202 : 'the operation requested on the remote content is not permitted', 162 | 203 : 'the remote content was not found at the server (similar to HTTP error 404)', 163 | 204 : 'the remote server requires authentication to serve the content but the credentials provided were not accepted (if any)', 164 | 205 : 'the request needed to be sent again, but this failed for example because the upload data could not be read a second time.', 165 | 301 : 'the Network Access API cannot honor the request because the protocol is not known', 166 | 302 : 'the requested operation is invalid for this protocol', 167 | 99 : 'an unknown network-related error was detected', 168 | 199 : 'an unknown proxy-related error was detected', 169 | 299 : 'an unknown error related to the remote content was detected', 170 | 399 : 'a breakdown in protocol was detected (parsing error, invalid or unexpected responses, etc.)', 171 | } 172 | common.logger.debug('Error %d: %s (%s)' % (eid, errors.get(eid, 'unknown error'), self.sender().url().toString())) 173 | 174 | 175 | def sslErrorHandler(self, reply, errors): 176 | common.logger.info('SSL errors: {}'.format(errors)) 177 | reply.ignoreSslErrors() 178 | 179 | 180 | 181 | class CachedNetworkReply(QNetworkReply): 182 | def __init__(self, parent, url, content, headers, attributes): 183 | super(CachedNetworkReply, self).__init__(parent) 184 | self.setUrl(url) 185 | self.content = content 186 | self.offset = 0 187 | for header, value in headers: 188 | self.setRawHeader(header, value) 189 | #self.setHeader(QNetworkRequest.ContentLengthHeader, len(content)) 190 | for attribute, value in attributes: 191 | self.setAttribute(attribute, value) 192 | self.setOpenMode(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered) 193 | # trigger signals that content is ready 194 | QTimer.singleShot(0, self.readyRead) 195 | QTimer.singleShot(0, self.finished) 196 | 197 | def bytesAvailable(self): 198 | return len(self.content) - self.offset 199 | 200 | def isSequential(self): 201 | return True 202 | 203 | def abort(self): 204 | pass # qt requires that this be defined 205 | 206 | def readData(self, size): 207 | """Return up to size bytes from buffer 208 | """ 209 | if self.offset >= len(self.content): 210 | return '' 211 | number = min(size, len(self.content) - self.offset) 212 | data = self.content[self.offset : self.offset + number] 213 | self.offset += number 214 | return str(data) 215 | 216 | 217 | 218 | class WebPage(QWebPage): 219 | def __init__(self, user_agent, confirm=True): 220 | """Override QWebPage to set User-Agent and JavaScript messages 221 | 222 | user_agent: the User Agent to submit 223 | confirm: default response to confirm dialog boxes 224 | """ 225 | super(WebPage, self).__init__() 226 | self.user_agent = user_agent 227 | self.confirm = confirm 228 | 
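# forward content types webkit cannot render (e.g. file downloads) through the
# unsupportedContent signal instead of silently dropping them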
self.setForwardUnsupportedContent(True) 229 | 230 | def userAgentForUrl(self, url): 231 | """Use same user agent for all URL's 232 | """ 233 | return self.user_agent 234 | 235 | def javaScriptAlert(self, frame, message): 236 | """Override default JavaScript alert popup and send to log 237 | """ 238 | common.logger.debug('Alert: ' + message) 239 | 240 | 241 | def javaScriptConfirm(self, frame, message): 242 | """Override default JavaScript confirm popup and send to log 243 | """ 244 | common.logger.debug('Confirm: ' + message) 245 | return self.confirm 246 | 247 | 248 | def javaScriptPrompt(self, frame, message, default): 249 | """Override default JavaScript prompt popup and send to log 250 | """ 251 | common.logger.debug('Prompt: {} {}'.format(message, default)) 252 | 253 | 254 | def javaScriptConsoleMessage(self, message, line_number, source_id): 255 | """Override default JavaScript console and send to log 256 | """ 257 | common.logger.debug('Console: {} {} {}'.format(message, line_number, source_id)) 258 | 259 | 260 | def shouldInterruptJavaScript(self): 261 | """Disable javascript interruption dialog box 262 | """ 263 | return True 264 | 265 | 266 | 267 | class Browser(QWebView): 268 | def __init__(self, gui=False, user_agent=None, proxy=None, load_images=True, load_javascript=True, load_java=True, load_plugins=True, timeout=20, delay=5, app=None, use_cache=False): 269 | """Widget class that contains the address bar, webview for rendering webpages, and a table for displaying results 270 | 271 | user_agent: the user-agent when downloading content 272 | proxy: a QNetworkProxy to download through 273 | load_images: whether to download images 274 | load_javascript: whether to enable javascript 275 | load_java: whether to enable java 276 | load_plugins: whether to enable browser plugins 277 | timeout: the maximum amount of seconds to wait for a request 278 | delay: the minimum amount of seconds to wait between requests 279 | app: QApplication object so that can instantiate multiple browser objects 280 | use_cache: whether to cache all replies 281 | """ 282 | # must instantiate the QApplication object before any other Qt objects 283 | self.app = app or QApplication(sys.argv) 284 | super(Browser, self).__init__() 285 | 286 | page = WebPage(user_agent or alg.rand_agent()) 287 | manager = NetworkAccessManager(proxy, use_cache) 288 | page.setNetworkAccessManager(manager) 289 | self.setPage(page) 290 | page.networkAccessManager().finished.connect(self.finished) 291 | # set whether to enable plugins, images, and java 292 | self.settings().setAttribute(QWebSettings.AutoLoadImages, load_images) 293 | self.settings().setAttribute(QWebSettings.JavascriptEnabled, load_javascript) 294 | self.settings().setAttribute(QWebSettings.JavaEnabled, load_java) 295 | self.settings().setAttribute(QWebSettings.PluginsEnabled, load_plugins) 296 | self.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True) 297 | self.timeout = timeout 298 | self.delay = delay 299 | if gui: 300 | self.showNormal() 301 | self.raise_() 302 | 303 | 304 | def __del__(self): 305 | self.setPage(None) 306 | 307 | 308 | def home(self): 309 | """Go back to initial page in history 310 | """ 311 | history = self.history() 312 | history.goToItem(history.itemAt(0)) 313 | 314 | 315 | def save(self): 316 | """Save the current HTML state to disk 317 | """ 318 | for i in itertools.count(1): 319 | filename = os.path.join(settings.state_dir, 'state{}.html'.format(i)) 320 | if not os.path.exists(filename): 321 | html = self.current_html() 322 
| open(filename, 'w').write(common.to_unicode(html)) 323 | print 'save', filename 324 | break 325 | 326 | 327 | def set_proxy(self, proxy): 328 | """Shortcut to set the proxy 329 | """ 330 | self.page().networkAccessManager().setProxy(proxy) 331 | 332 | 333 | def current_url(self): 334 | """Return current URL 335 | """ 336 | return str(self.url().toString()) 337 | 338 | 339 | def current_html(self): 340 | """Return current rendered HTML 341 | """ 342 | return common.to_unicode(unicode(self.page().mainFrame().toHtml())) 343 | 344 | 345 | def current_text(self): 346 | """Return text from the current rendered HTML 347 | """ 348 | return common.to_unicode(unicode(self.page().mainFrame().toPlainText())) 349 | 350 | 351 | def get(self, url, html=None, headers=None, data=None): 352 | """Load given url in webkit and return html when loaded 353 | 354 | url: the URL to load 355 | html: optional HTML to set instead of downloading 356 | headers: the headers to attach to the request 357 | data: the data to POST 358 | """ 359 | if isinstance(url, basestring): 360 | # convert string to Qt's URL object 361 | url = QUrl(url) 362 | if html: 363 | # load pre downloaded HTML 364 | self.setContent(html, baseUrl=url) 365 | return html 366 | 367 | t1 = time() 368 | loop = QEventLoop() 369 | self.loadFinished.connect(loop.quit) 370 | # need to make network request 371 | request = QNetworkRequest(url) 372 | if headers: 373 | # add headers to request when defined 374 | for header, value in headers: 375 | request.setRawHeader(header, value) 376 | fn = super(Browser, self) 377 | if data: 378 | # POST request 379 | fn.load(request, QNetworkAccessManager.PostOperation, data) 380 | else: 381 | # GET request 382 | fn.load(request) 383 | 384 | # set a timeout on the download loop 385 | timer = QTimer() 386 | timer.setSingleShot(True) 387 | timer.timeout.connect(loop.quit) 388 | timer.start(self.timeout * 1000) 389 | loop.exec_() # delay here until download finished or timeout 390 | 391 | if timer.isActive(): 392 | # downloaded successfully 393 | timer.stop() 394 | parsed_html = self.current_html() 395 | self.wait(self.delay - (time() - t1)) 396 | else: 397 | # did not download in time 398 | common.logger.debug('Timed out: {}'.format(url.toString())) 399 | parsed_html = '' 400 | return parsed_html 401 | 402 | 403 | def wait(self, timeout=1): 404 | """Wait for delay time 405 | """ 406 | deadline = time() + timeout 407 | while time() < deadline: 408 | sleep(0) 409 | self.app.processEvents() 410 | 411 | 412 | def wait_quiet(self, timeout=20): 413 | """Wait until all requests have completed up to a maximum timeout. 414 | Returns True if all requests complete before the timeout. 415 | """ 416 | self.wait() 417 | deadline = time() + timeout 418 | manager = self.page().networkAccessManager() 419 | while time() < deadline and manager.active_requests: 420 | sleep(0) 421 | self.app.processEvents() 422 | self.app.processEvents() 423 | return manager.active_requests == [] 424 | 425 | 426 | def wait_load(self, pattern, timeout=60): 427 | """Wait for this content to be loaded up to maximum timeout. 428 | Returns True if pattern was loaded before the timeout. 
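        A minimal sketch of typical use (the URL and selectors are hypothetical):

            browser = Browser(gui=False)
            browser.get('http://example.com/search')
            browser.fill('input[name=q]', 'web scraping')
            browser.click('button[type=submit]')
            if browser.wait_load('#results'):
                print browser.current_text()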
429 | """ 430 | deadline = time() + timeout 431 | while time() < deadline: 432 | sleep(0) 433 | self.app.processEvents() 434 | if self.find(pattern): 435 | return True 436 | return False 437 | 438 | 439 | def wait_steady(self, timeout=60): 440 | """Wait for the DOM to be steady, defined as no changes over a 1 second period 441 | Returns True if DOM is steady before timeout, else False 442 | """ 443 | deadline = time() + timeout 444 | while time() < deadline: 445 | orig_html = self.current_html() 446 | self.wait(1) 447 | cur_html = self.current_html() 448 | if orig_html == cur_html: 449 | return True # DOM is steady 450 | return False 451 | 452 | 453 | def js(self, script): 454 | """Shortcut to execute javascript on current document and return result 455 | """ 456 | self.app.processEvents() 457 | return self.page().mainFrame().evaluateJavaScript(script).toString() 458 | 459 | 460 | def click(self, pattern='input', native=False): 461 | """Click all elements that match the pattern. 462 | 463 | Uses standard CSS pattern matching: http://www.w3.org/TR/CSS2/selector.html 464 | Returns the number of elements clicked 465 | """ 466 | es = self.find(pattern) 467 | for e in es: 468 | if native: 469 | # get position of element 470 | e_pos = e.geometry().center() 471 | # scroll to element position 472 | self.page().mainFrame().setScrollPosition(e_pos) 473 | scr_pos = self.page().mainFrame().scrollPosition() 474 | point_to_click = e_pos - scr_pos 475 | # create click on absolute coordinates 476 | press = QMouseEvent(QMouseEvent.MouseButtonPress, point_to_click, Qt.LeftButton, Qt.LeftButton, Qt.NoModifier) 477 | release = QMouseEvent(QMouseEvent.MouseButtonRelease, point_to_click, Qt.LeftButton, Qt.LeftButton, Qt.NoModifier) 478 | QApplication.postEvent(self, press) 479 | QApplication.postEvent(self, release) 480 | else: 481 | self.click_by_user_event_simulation(e) 482 | return len(es) 483 | 484 | 485 | def keys(self, pattern, text, native=False, blur=False): 486 | """Simulate typing by focusing on elements that match the pattern and triggering key events. 487 | If native is True then will use GUI key event simulation, else JavaScript. 488 | If blur is True then will blur focus at the end of typing. 489 | Returns the number of elements matched. 490 | """ 491 | es = self.find(pattern) 492 | for e in es: 493 | if native: 494 | key_map = {'\t': Qt.Key_Tab, '\n': Qt.Key_Enter, 'DOWN': Qt.Key_Down, 'UP': Qt.Key_Up} 495 | self.click_by_gui_simulation(e) 496 | self.wait(0.1) 497 | for c in text: 498 | key = key_map.get(c, QKeySequence(c)[0]) 499 | press = QKeyEvent(QEvent.KeyPress, key, Qt.NoModifier) 500 | release = QKeyEvent(QEvent.KeyRelease, key, Qt.NoModifier) 501 | QApplication.postEvent(self, press) 502 | QApplication.postEvent(self, release) 503 | else: 504 | #e.evaluateJavaScript("this.focus()") 505 | #self.click_by_user_event_simulation(e) 506 | self.fill(pattern, text, es=[e]) 507 | for event_name in ('focus', 'keydown', 'change', 'keyup', 'keypress'): 508 | self.trigger_js_event(e, event_name) 509 | if blur: 510 | e.evaluateJavaScript("this.blur()") 511 | return len(es) 512 | 513 | 514 | def attr(self, pattern, name, value=None): 515 | """For the elements that match this pattern, set attribute if value is defined, else return the value. 
516 | """ 517 | if value is None: 518 | # want to get attribute 519 | return str(self.page().mainFrame().findFirstElement(pattern).attribute(name)) 520 | else: 521 | es = self.find(pattern) 522 | for e in es: 523 | e.setAttribute(name, value) 524 | return len(es) 525 | 526 | 527 | def fill(self, pattern, value, es=None): 528 | """Set text of the matching form elements to value, and return the number of elements matched. 529 | """ 530 | es = es or self.find(pattern) 531 | for e in es: 532 | tag = str(e.tagName()).lower() 533 | if tag == 'input' or tag == "select": 534 | e.evaluateJavaScript('this.value = "{}"'.format(value)) 535 | e.setAttribute('value', value) 536 | else: 537 | e.setPlainText(value) 538 | return len(es) 539 | 540 | 541 | def find(self, pattern): 542 | """Returns the elements matching this CSS pattern. 543 | """ 544 | if isinstance(pattern, basestring): 545 | matches = self.page().mainFrame().findAllElements(pattern).toList() 546 | elif isinstance(pattern, list): 547 | matches = pattern 548 | elif isinstance(pattern, QWebElement): 549 | matches = [pattern] 550 | else: 551 | common.logger.warning('Unknown pattern: ' + str(pattern)) 552 | matches = [] 553 | return matches 554 | 555 | 556 | def screenshot(self, output_file): 557 | """Take screenshot of current webpage and save results 558 | """ 559 | frame = self.page().mainFrame() 560 | self.page().setViewportSize(frame.contentsSize()) 561 | image = QImage(self.page().viewportSize(), QImage.Format_ARGB32) 562 | painter = QPainter(image) 563 | frame.render(painter) 564 | painter.end() 565 | common.logger.debug('saving: ' + output_file) 566 | image.save(output_file) 567 | 568 | 569 | def trigger_js_event(self, element, event_name): 570 | """Triggers a JavaScript level event on an element. 571 | 572 | Takes a QWebElement as input, and a string name of the event (e.g. "click"). 573 | 574 | Implementation is taken from Artemis: 575 | https://github.com/cs-au-dk/Artemis/blob/720f051c4afb4cd69e560f8658ebe29465c59362/artemis-code/src/runtime/input/forms/formfieldinjector.cpp#L294 576 | """ 577 | # TODO: Strictly we should create an appropriate event type as listed in: 578 | # https://developer.mozilla.org/en-US/docs/Web/Events 579 | # https://developer.mozilla.org/en-US/docs/Web/API/Document/createEvent#Notes 580 | # For now we use generic "Event". 581 | event_type = "Event"; 582 | event_init_method = "initEvent"; 583 | bubbles = "true"; 584 | cancellable = "true"; 585 | injection = "var event = document.createEvent('{}'); event.{}('{}', {}, {}); this.dispatchEvent(event);".format(event_type, event_init_method, event_name, bubbles, cancellable); 586 | element.evaluateJavaScript(injection); 587 | 588 | 589 | def click_by_user_event_simulation(self, element): 590 | """Uses JS-level events to simulate a full user click. 591 | 592 | Takes a QWebElement as input. 
593 | 594 | Implementation is taken from Artemis: 595 | https://github.com/cs-au-dk/Artemis/blob/720f051c4afb4cd69e560f8658ebe29465c59362/artemis-code/src/runtime/input/clicksimulator.cpp#L42 596 | """ 597 | self.trigger_js_event(element, "mouseover"); 598 | self.trigger_js_event(element, "mousemove"); 599 | self.trigger_js_event(element, "mousedown"); 600 | self.trigger_js_event(element, "focus"); 601 | self.trigger_js_event(element, "mouseup"); 602 | self.trigger_js_event(element, "click"); 603 | self.trigger_js_event(element, "mousemove"); 604 | self.trigger_js_event(element, "mouseout"); 605 | self.trigger_js_event(element, "blur"); 606 | 607 | 608 | def finished(self, reply): 609 | """Override the reply finished signal to check the result of each request 610 | """ 611 | pass 612 | 613 | 614 | 615 | if __name__ == '__main__': 616 | # initiate webkit and show gui 617 | # once script is working you can disable the gui 618 | w = Browser(gui=True) 619 | # load webpage 620 | w.get('http://duckduckgo.com') 621 | # fill search textbox 622 | w.fill('input[id=search_form_input_homepage]', 'web scraping') 623 | # take screenshot of webpage 624 | w.screenshot('duckduckgo.jpg') 625 | # click search button 626 | w.click('input[id=search_button_homepage]') 627 | # show webpage for 10 seconds 628 | w.wait(10) 629 | -------------------------------------------------------------------------------- /xpath.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module implements a subset of the XPath standard: 3 | - tags 4 | - indices 5 | - attributes 6 | - descendants 7 | 8 | This was created because I needed a pure Python XPath parser. 9 | 10 | Generally XPath solutions will normalize the HTML into XHTML before selecting nodes. 11 | However this module tries to navigate the HTML structure directly without normalizing by searching for the next closing tag. 12 | """ 13 | 14 | #TODO: 15 | # - parent 16 | # - search by text: text() == '...' 17 | # - return xpath for most similar to text 18 | # - multiple filters for a tag 19 | 20 | import itertools, re, sys, urllib, urllib2, urlparse 21 | from optparse import OptionParser 22 | import adt, common, settings 23 | 24 | 25 | class Doc: 26 | """Wrapper around a parsed webpage 27 | 28 | html: 29 | The content of webpage to parse 30 | remove: 31 | A list of tags to remove 32 | 33 | >>> doc = Doc('

<div><a class="link">LINK 1</a><div><a>LINK 2</a></div></div><div><a>LINK 3</a></div>ghi') 34 | >>> doc.search('/div/a') 35 | ['LINK 1', 'LINK 3'] 36 | >>> doc.search('/div/a[@class="link"]') 37 | ['LINK 1'] 38 | >>> doc.search('/div[1]//a') 39 | ['LINK 1', 'LINK 2'] 40 | >>> doc.search('/div/a/@class') 41 | ['link', ''] 42 | >>> doc.search('/div[-1]/a') 43 | ['LINK 3'] 44 | 45 | >>> # test searching unicode 46 | >>> doc = Doc(u'<a class="flink">google</a>') 47 | >>> doc.get('//a[@class="flink"]') 48 | u'google' 49 | 50 | >>> # test finding just the first instance for a large amount of content 51 | >>> doc = Doc('
<div><span>content</span></div>
' * 10000) 52 | >>> doc.get('//span') 53 | 'content' 54 | 55 | >>> # test extracting attribute of self closing tag 56 | >>> Doc('
<div><img src="img.png"/></div>').get('/div/img/@src') 57 | 'img.png' 58 | 59 | >>> # test extracting attribute after self closing tag 60 | >>> Doc('

<div><img src="img.png"/><p>content</p></div>


').get('/div/p') 61 | 'content' 62 | """ 63 | 64 | # regex to find a tag 65 | _tag_regex = re.compile('<([\w\:]+)') 66 | # regex to find an attribute 67 | _attributes_regex = re.compile('([\w\:-]+)\s*=\s*(".*?"|\'.*?\'|\S+)', re.DOTALL) 68 | # regex to find content of a tag 69 | _content_regex = re.compile('<.*?>(.*)$', re.DOTALL) 70 | 71 | 72 | def __init__(self, html, remove=None): 73 | #self.html = self._clean(html, remove) 74 | self.html = html 75 | self.num_searches = 0 76 | 77 | def get(self, xpath): 78 | """Return the first result from this XPath selection 79 | """ 80 | results = self._xpath(self.parse(xpath), self.html, limit=1) 81 | return common.first(results) 82 | 83 | def search(self, xpath): 84 | """Return all results from this XPath selection 85 | """ 86 | return self._xpath(self.parse(xpath), self.html, limit=sys.maxint) 87 | 88 | 89 | def _xpath(self, path, html, limit): 90 | """Recursively search HTML for content at XPath 91 | """ 92 | counter, separator, tag, index, attributes = path.pop(0) 93 | if counter == 0: 94 | self.num_searches += 1 95 | 96 | results = [] 97 | if tag == '..': 98 | # parent 99 | raise common.WebScrapingError('.. not yet supported') 100 | results.append(self.get_parent(html)) 101 | elif tag == 'text()': 102 | # extract child text 103 | text = self._get_content(self._get_html(html)) 104 | results.append(common.remove_tags(text, keep_children=False)) 105 | # check if next tag is selecting attribute 106 | elif tag.startswith('@'): 107 | attr = tag[1:].lower() 108 | #parent = self.get_parent(context) 109 | value = self._get_attributes(html).get(attr, '') 110 | results.append(value) 111 | else: 112 | # have tag 113 | if counter > 0: 114 | # get child html when not at root 115 | html = self._get_content(html) 116 | 117 | # search direct children if / and all descendants if // 118 | search_fn = self._find_children if separator == '' else self._find_descendants 119 | matches = search_fn(html, tag) 120 | 121 | # support negative indices 122 | if index is not None and index < 0: 123 | matches = list(matches) 124 | index += len(matches) + 1 125 | 126 | for child_i, child in enumerate(matches): 127 | # check if matches index 128 | if index is None or index == child_i + 1: 129 | # check if matches attributes 130 | if not attributes or self._match_attributes(attributes, self._get_attributes(child)): 131 | if path: 132 | results.extend(self._xpath(path[:], child, limit)) 133 | else: 134 | # final node 135 | results.append(self._get_content(child)) 136 | if len(results) > limit: 137 | break 138 | 139 | #if not children: 140 | # attributes_s = attributes and ''.join('[@%s="%s"]' % a for a in attributes) or '' 141 | # common.logger.debug('No matches for <%s%s%s> (tag %d)' % (tag, index and '[%d]' % index or '', attributes_s, tag_i + 1)) 142 | return results 143 | 144 | 145 | 146 | def _clean(self, html, remove): 147 | """Remove specified unhelpful tags and comments 148 | """ 149 | self.remove = remove 150 | html = re.compile('', re.DOTALL).sub('', html) # remove comments 151 | if remove: 152 | # XXX combine tag list into single regex, if can match same at start and end 153 | for tag in remove: 154 | html = re.compile('<' + tag + '[^>]*?/>', re.DOTALL | re.IGNORECASE).sub('', html) 155 | html = re.compile('<' + tag + '[^>]*?>.*?', re.DOTALL | re.IGNORECASE).sub('', html) 156 | html = re.compile('<' + tag + '[^>]*?>', re.DOTALL | re.IGNORECASE).sub('', html) 157 | return html 158 | 159 | 160 | def parse(self, xpath): 161 | """Parse the xpath into: counter, separator, 
tag, index, and attributes 162 | 163 | >>> doc = Doc('') 164 | >>> doc.parse('/div[1]//span[@class="text"]') 165 | [(0, '', 'div', 1, []), (1, '/', 'span', None, [('class', 'text')])] 166 | >>> doc.parse('//li[-2]') 167 | [(0, '/', 'li', -2, [])] 168 | >>> doc.parse('//option[@selected]') 169 | [(0, '/', 'option', None, [('selected', None)])] 170 | >>> doc.parse('/div[@id="content"]//span[1][@class="text"][@title=""]/a') 171 | [(0, '', 'div', None, [('id', 'content')]), (1, '/', 'span', 1, [('class', 'text'), ('title', '')]), (2, '', 'a', None, [])] 172 | """ 173 | tokens = [] 174 | counter = 0 175 | for separator, token in re.compile('(|/|\.\.)/([^/]+)').findall(xpath): 176 | index, attributes = None, [] 177 | if '[' in token: 178 | tag = token[:token.find('[')] 179 | for attribute in re.compile('\[(.*?)\]').findall(token): 180 | try: 181 | index = int(attribute) 182 | except ValueError: 183 | match = re.compile('@(.*?)=["\']?(.*?)["\']?$').search(attribute) 184 | if match: 185 | key, value = match.groups() 186 | attributes.append((key.lower(), value.lower())) 187 | else: 188 | match = re.compile('@(.*?)$').search(attribute) 189 | if match: 190 | attributes.append((match.groups()[0].lower(), None)) 191 | else: 192 | raise common.WebScrapingError('Unknown format: ' + attribute) 193 | else: 194 | tag = token 195 | tokens.append((counter, separator, tag, index, attributes)) 196 | counter += 1 197 | return tokens 198 | 199 | 200 | def _get_attributes(self, html): 201 | """Extract the attributes of the passed HTML tag 202 | 203 | >>> doc = Doc('') 204 | >>> doc._get_attributes('
content SPAN
') 205 | {'max-width': '20', 'class': 'abc', 'id': 'ID', 'name': 'MY NAME'} 206 | >>> doc._get_attributes('') 207 | {'width': '200', 'class': 'textelien', 'valign': 'top'} 208 | >>> doc._get_attributes('
abc
') 347 | '
abc
' 348 | >>> doc._jump_next_tag('
abc
') 349 | '
abc
' 350 | """ 351 | while 1: 352 | match = Doc._tag_regex.search(html) 353 | if match: 354 | return html[match.start():] 355 | else: 356 | return None 357 | 358 | 359 | def _get_tag(self, html): 360 | """Find tag type at this location 361 | 362 | >>> doc = Doc('') 363 | >>> doc._get_tag('
<div>abc</div>
') 364 | 'div' 365 | >>> doc._get_tag('
') 366 | >>> doc._get_tag('div') 367 | """ 368 | match = Doc._tag_regex.match(html) 369 | if match: 370 | return match.groups()[0] 371 | else: 372 | return None 373 | 374 | 375 | def _split_tag(self, html): 376 | """Extract starting tag and contents from HTML 377 | 378 | >>> doc = Doc('') 379 | >>> doc._split_tag('
abc
def
abc
ghi
jkl
') 380 | ('
abc
def
abc
', 'ghi
jkl
') 381 | >>> doc._split_tag('
abc
') 382 | ('
', '
abc
') 383 | >>> doc._split_tag('
abc
def
abc') 384 | ('
abc
def
abc
', '') 385 | >>> # test efficiency of splits 386 | >>> a = [doc._split_tag('
abc
def
abc') for i in range(10000)] 387 | """ 388 | i = None 389 | tag = self._get_tag(html) 390 | depth = 0 # how far nested 391 | for match in re.compile('' % tag, re.DOTALL | re.IGNORECASE).finditer(html): 392 | if html[match.start() + 1] == '/': 393 | depth -= 1 # found closing tag 394 | elif tag in common.EMPTY_TAGS: 395 | pass # this tag type does not close 396 | elif html[match.end() - 2] == '/': 397 | pass # tag starts and ends (eg
) 398 | else: 399 | depth += 1 # found opening tag 400 | if depth == 0: 401 | # found top level match 402 | i = match.end() 403 | break 404 | if i is None: 405 | # all html is within this tag 406 | return html + '</%s>' % tag, '' 407 | else: 408 | return html[:i], html[i:] 409 | 410 | 411 | def _parent_tag(self, html): 412 | """Find parent tag of this current tag 413 | 414 | >> doc = Doc('

<div>empty</div>

') 415 | >> doc._parent_tag('empty') 416 | '
<div>empty</div>
' 417 | >> doc = Doc('

<div>empty</div>
') 418 | >> doc._parent_tag('empty') 419 | '

<div>empty</div>
' 420 | """ 421 | raise Exception('Not implemented') 422 | #index = self.html.find(html) 423 | #while index >= 0: 424 | # index = self.html.rfind('<', start=0, end=index) 425 | 426 | 427 | try: 428 | import lxml.html 429 | except ImportError: 430 | class Tree: 431 | def __init__(*args, **kwargs): 432 | raise ImportError('lxml not installed') 433 | else: 434 | # if lxml is supported create wrapper 435 | class Tree: 436 | def __init__(self, html, **kwargs): 437 | if isinstance(html, lxml.html.HtmlElement): 438 | # input is already a passed lxml tree 439 | self.doc = html 440 | else: 441 | try: 442 | self.doc = lxml.html.fromstring(html) 443 | except lxml.etree.LxmlError: 444 | self.doc = None 445 | 446 | def __eq__(self, html): 447 | return self.orig_html is html 448 | 449 | 450 | def xpath(self, path): 451 | return [] if self.doc is None else self.doc.xpath(path) 452 | 453 | def get(self, path): 454 | es = self.xpath(path) 455 | if es: 456 | return self.tostring(es[0]) 457 | return '' 458 | 459 | def search(self, path): 460 | return [self.tostring(e) for e in self.xpath(path)] 461 | 462 | def tostring(self, node): 463 | try: 464 | parts = [node.text] + [unicode(c) if isinstance(c, basestring) else lxml.etree.tostring(c) for c in node] + [node.tail] 465 | return ''.join(filter(None, parts)) or str(node) 466 | except AttributeError: 467 | return unicode(node) 468 | 469 | 470 | def get(html, xpath, remove=None): 471 | """Return first element from XPath search of HTML 472 | """ 473 | return Doc(html, remove=remove).get(xpath) 474 | 475 | def search(html, xpath, remove=None): 476 | """Return all elements from XPath search of HTML 477 | """ 478 | return Doc(html, remove=remove).search(xpath) 479 | 480 | def find_children(html, tag, remove=None): 481 | """Find children with this tag type 482 | """ 483 | return Doc(html, remove=remove)._find_children(html, tag) 484 | 485 | 486 | 487 | class Form: 488 | """Helper class for filling and submitting forms 489 | """ 490 | def __init__(self, form): 491 | self.data = {} 492 | for input_name, input_value in zip(search(form, '//input/@name'), search(form, '//input/@value')): 493 | self.data[input_name] = input_value 494 | for text_name, text_value in zip(search(form, '//textarea/@name'), search(form, '//textarea')): 495 | self.data[text_name] = text_value 496 | for select_name, select_contents in zip(search(form, '//select/@name'), search(form, '//select')): 497 | self.data[select_name] = get(select_contents, '/option[@selected]/@value') 498 | if '' in self.data: 499 | del self.data[''] 500 | 501 | def __getitem__(self, key): 502 | return self.data[key] 503 | 504 | def __setitem__(self, key, value): 505 | self.data[key] = value 506 | 507 | def __str__(self): 508 | return urllib.urlencode(self.data) 509 | 510 | def submit(self, D, action, **argv): 511 | return D.get(url=action, data=self.data, **argv) 512 | 513 | 514 | 515 | js_re = re.compile('location.href ?= ?[\'"](.*?)[\'"]') 516 | def get_links(html, url=None, local=True, external=True): 517 | """Return all links from html and convert relative to absolute if source url is provided 518 | 519 | html: 520 | HTML to parse 521 | url: 522 | optional URL for determining path of relative links 523 | local: 524 | whether to include links from same domain 525 | external: 526 | whether to include linkes from other domains 527 | """ 528 | def normalize_link(link): 529 | if urlparse.urlsplit(link).scheme in ('http', 'https', ''): 530 | if '#' in link: 531 | link = link[:link.index('#')] 532 | if url: 533 | link = 
urlparse.urljoin(url, link) 534 | if not local and common.same_domain(url, link): 535 | # local links not included 536 | link = None 537 | if not external and not common.same_domain(url, link): 538 | # external links not included 539 | link = None 540 | else: 541 | link = None # ignore mailto, etc 542 | return link 543 | a_links = search(html, '//a/@href') 544 | i_links = search(html, '//iframe/@src') 545 | js_links = js_re.findall(html) 546 | links = [] 547 | for link in a_links + i_links + js_links: 548 | try: 549 | link = normalize_link(link) 550 | except UnicodeError: 551 | pass 552 | else: 553 | if link and link not in links: 554 | links.append(link) 555 | return links 556 | --------------------------------------------------------------------------------
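Usage note for the xpath module above: get returns the content of the first match, search returns all matches as a list, get_links resolves page links against a base URL, and Form collects input values ready for submission. The following is a minimal usage sketch, assuming Python 2 and that the package is importable as webscraping (a bare "import xpath" from a checkout works the same); the sample HTML, the example.com URL and the printed outputs are illustrative only, not taken from the library's own tests.

from webscraping import xpath

html = """<html><body>
<div id="content"><span class="text">hello</span> <span class="text">world</span></div>
<a href="/about">About us</a>
<form><input name="q" value="" /><input name="lang" value="en" /></form>
</body></html>"""

print xpath.get(html, '/html/body/div[@id="content"]/span[1]')     # first match only: hello
print xpath.search(html, '//span[@class="text"]')                  # every match: ['hello', 'world']
print xpath.search(html, '//a/@href')                               # attribute selection: ['/about']
print xpath.get_links(html, url='http://example.com/index.html')    # ['http://example.com/about']

form = xpath.Form(html)             # collects each <input> name/value found in the HTML
form['q'] = 'web scraping'          # override a field before submitting
print form                           # urlencoded data, e.g. q=web+scraping&lang=en (field order may vary)

Form.submit(D, action) simply calls D.get(url=action, data=form.data), so any client object exposing that interface can be passed in.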