├── .gitignore ├── MANIFEST.in ├── README.rst ├── __init__.py ├── adt.py ├── alg.py ├── async.py ├── common.py ├── docs ├── Makefile ├── conf.py ├── examples.rst ├── index.rst ├── introduction.rst └── reference.rst ├── download.py ├── pdict.py ├── settings.py ├── setup.py ├── webkit.py └── xpath.py /.gitignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | *.pyc 3 | *~ 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Documentation is hosted at 2 | `docs.webscraping.com `__. 3 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | Website: 3 | http://code.google.com/p/webscraping/ 4 | 5 | License: 6 | LGPL 7 | """ 8 | 9 | if __name__ == '__main__': 10 | import doctest 11 | for name in ['adt', 'alg', 'common', 'download', 'pdict', 'settings', 'webkit', 'xpath']: 12 | module = __import__(name) 13 | print name 14 | print doctest.testmod(module) 15 | -------------------------------------------------------------------------------- /adt.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'High level abstract datatypes' 2 | 3 | from datetime import datetime, timedelta 4 | from collections import defaultdict, deque 5 | try: 6 | import pybloom 7 | except ImportError: 8 | pass 9 | 10 | 11 | class Bag(dict): 12 | """Dictionary object with attribute like access 13 | 14 | >>> b = Bag() 15 | >>> b.name = 'company' 16 | >>> b.name 17 | 'company' 18 | >>> b.address 19 | """ 20 | def __init__(self, *args, **kwargs): 21 | dict.__init__(self, *args, **kwargs) 22 | 23 | def __getattr__(self, name): 24 | return self.get(name) 25 | 26 | def __setattr__(self, name, value): 27 | self[name] = value 28 | 29 | 30 | class HashDict: 31 | """For storing large quantities of keys where don't need the original value of the key 32 | Instead each key is hashed and hashes are compared for equality 33 | 34 | >>> hd = HashDict() 35 | >>> url = 'http://webscraping.com' 36 | >>> hd[url] = True 37 | >>> url in hd 38 | True 39 | >>> 'other url' in hd 40 | False 41 | >>> len(hd) 42 | 1 43 | """ 44 | def __init__(self, default_factory=str): 45 | self.d = defaultdict(default_factory) 46 | 47 | def __len__(self): 48 | """How many keys are stored in the HashDict 49 | """ 50 | return self.d.__len__() 51 | 52 | def __contains__(self, name): 53 | return self.d.__contains__(self.get_hash(name)) 54 | 55 | def __getitem__(self, name): 56 | return self.d.__getitem__(self.get_hash(name)) 57 | 58 | def __setitem__(self, name, value): 59 | return self.d.__setitem__(self.get_hash(name), value) 60 | 61 | def add(self, name): 62 | self[name] = True 63 | 64 | def get(self, name, default=None): 65 | """Get the value at this key 66 | 67 | Returns default if key does not exist 68 | """ 69 | return self.d.get(self.get_hash(name), default) 70 | 71 | def get_hash(self, value): 72 | """get the hash value of this value 73 | """ 74 | return hash(value) 75 | 76 | 77 | class Bloom: 78 | """A bloom filter is a space efficient way to tell if an element is in a set. 
79 | False positive are possible - set by err rate - but false negatives are not. 80 | """ 81 | def __init__(self, start_items=10000, err_rate=0.0001): 82 | self.bloom = pybloom.ScalableBloomFilter(10000, err, 4) 83 | 84 | def __contains__(self, key): 85 | return key in self.bloom 86 | 87 | def add(self, key): 88 | return self.bloom.add(key) 89 | -------------------------------------------------------------------------------- /alg.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'High level functions for interpreting useful data from input' 2 | 3 | import csv, logging, math, os, random, re 4 | import common, xpath 5 | 6 | 7 | def get_excerpt(html, try_meta=False, max_chars=255): 8 | """Extract excerpt from this HTML by finding the largest text block 9 | 10 | try_meta: 11 | indicates whether to try extracting from meta description tag 12 | max_chars: 13 | the maximum number of characters for the excerpt 14 | """ 15 | # try extracting meta description tag 16 | excerpt = '' 17 | if try_meta: 18 | excerpt = xpath.get(html, '/html/head/meta[@name="description"]/@content') 19 | if not excerpt: 20 | # remove these tags and then find biggest text block 21 | bad_tags = 'hr', 'br', 'script', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' 22 | content = common.remove_tags(xpath.get(html, '/html/body', remove=bad_tags)) 23 | if content: 24 | excerpt = max((len(p.strip()), p) for p in content.splitlines())[1] 25 | return common.unescape(excerpt.strip())[:max_chars] 26 | 27 | 28 | IGNORED_EMAILS = 'username@location.com', 'johndoe@domain.com' 29 | def extract_emails(html, ignored=IGNORED_EMAILS): 30 | """Remove common obfuscations from HTML and then extract all emails 31 | 32 | ignored: 33 | list of dummy emails to ignore 34 | 35 | >>> extract_emails('') 36 | [] 37 | >>> extract_emails('hello contact@webscraping.com world') 38 | ['contact@webscraping.com'] 39 | >>> extract_emails('hello contact@webscraping.com world') 40 | ['contact@webscraping.com'] 41 | >>> extract_emails('hello contact AT webscraping DOT com world') 42 | ['contact@webscraping.com'] 43 | >>> extract_emails(' info+hn@gmail.com ') 44 | ['info+hn@gmail.com'] 45 | >>> extract_emails('Contact') 46 | ['first.last@mail.co.uk'] 47 | """ 48 | emails = [] 49 | if html: 50 | email_re = re.compile('([\w\.\-\+]{1,64})@(\w[\w\.-]{1,255})\.(\w+)') 51 | # remove comments, which can obfuscate emails 52 | html = re.compile('', re.DOTALL).sub('', html).replace('mailto:', '') 53 | for user, domain, ext in email_re.findall(html): 54 | if ext.lower() not in common.MEDIA_EXTENSIONS and len(ext)>=2 and not re.compile('\d').search(ext) and domain.count('.')<=3: 55 | email = '%s@%s.%s' % (user, domain, ext) 56 | if email not in emails: 57 | emails.append(email) 58 | 59 | # look for obfuscated email 60 | for user, domain, ext in re.compile('([\w\.\-\+]{1,64})\s?.?AT.?\s?([\w\.-]{1,255})\s?.?DOT.?\s?(\w+)', re.IGNORECASE).findall(html): 61 | if ext.lower() not in common.MEDIA_EXTENSIONS and len(ext)>=2 and not re.compile('\d').search(ext) and domain.count('.')<=3: 62 | email = '%s@%s.%s' % (user, domain, ext) 63 | if email not in emails: 64 | emails.append(email) 65 | return [email for email in emails if email not in ignored] 66 | 67 | 68 | def extract_phones(html): 69 | """Extract phone numbers from this HTML 70 | 71 | >>> extract_phones('Phone: (123) 456-7890
<br />') 72 | ['(123) 456-7890'] 73 | >>> extract_phones('Phone 123.456.7890 ') 74 | ['123.456.7890'] 75 | >>> extract_phones('+1-123-456-7890<br />
123 456 7890n') 76 | ['123-456-7890', '123 456 7890'] 77 | >>> extract_phones('456-7890') 78 | [] 79 | >>> extract_phones('Contact') 80 | ['0234673460'] 81 | """ 82 | return [match.group() for match in re.finditer('(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}', html)] + re.findall('tel:(\d+)', html) 83 | 84 | 85 | def parse_us_address(address): 86 | """Parse USA address into address, city, state, and zip code 87 | 88 | >>> parse_us_address('6200 20th Street, Vero Beach, FL 32966') 89 | ('6200 20th Street', 'Vero Beach', 'FL', '32966') 90 | """ 91 | city = state = zipcode = '' 92 | addrs = map(lambda x:x.strip(), address.split(',')) 93 | if addrs: 94 | m = re.compile('([A-Z]{2,})\s*(\d[\d\-\s]+\d)').search(addrs[-1]) 95 | if m: 96 | state = m.groups()[0].strip() 97 | zipcode = m.groups()[1].strip() 98 | 99 | if len(addrs)>=3: 100 | city = addrs[-2].strip() 101 | address = ','.join(addrs[:-2]) 102 | else: 103 | address = ','.join(addrs[:-1]) 104 | 105 | return address, city, state, zipcode 106 | 107 | 108 | def get_earth_radius(scale): 109 | if scale is None: 110 | return 1.0 111 | elif scale == 'km': 112 | return 6373.0 113 | elif scale == 'miles': 114 | return 3960.0 115 | else: 116 | raise common.WebScrapingError('Invalid scale: %s' % str(scale)) 117 | 118 | 119 | def distance(p1, p2, scale=None): 120 | """Calculate distance between 2 (latitude, longitude) points. 121 | 122 | scale: 123 | By default the distance will be returned as a ratio of the earth's radius 124 | Use 'km' to return distance in kilometres, 'miles' to return distance in miles 125 | 126 | >>> melbourne = -37.7833, 144.9667 127 | >>> san_francisco = 37.7750, -122.4183 128 | >>> int(distance(melbourne, san_francisco, 'km')) 129 | 12659 130 | """ 131 | if p1 == p2: 132 | return 0 133 | lat1, long1 = p1 134 | lat2, long2 = p2 135 | # Convert latitude and longitude to 136 | # spherical coordinates in radians. 137 | degrees_to_radians = math.pi / 180.0 138 | 139 | # phi = 90 - latitude 140 | phi1 = (90.0 - lat1)*degrees_to_radians 141 | phi2 = (90.0 - lat2)*degrees_to_radians 142 | 143 | # theta = longitude 144 | theta1 = long1*degrees_to_radians 145 | theta2 = long2*degrees_to_radians 146 | 147 | # Compute spherical distance from spherical coordinates. 
148 | 149 | # For two locations in spherical coordinates 150 | # (1, theta, phi) and (1, theta, phi) 151 | # cosine( arc length ) = 152 | # sin phi sin phi' cos(theta-theta') + cos phi cos phi' 153 | # distance = rho * arc length 154 | 155 | cos = (math.sin(phi1)*math.sin(phi2)*math.cos(theta1 - theta2) + math.cos(phi1)*math.cos(phi2)) 156 | arc = math.acos(cos) 157 | return arc * get_earth_radius(scale) 158 | 159 | 160 | def find_coordinates(ch_lat=100, ch_lng=100, ch_scale='miles', min_lat=-90, max_lat=90, min_lng=-180, max_lng=180): 161 | """Find all latitude/longitude coordinates within bounding box, with given increments 162 | """ 163 | cur_lat = min_lat 164 | while cur_lat < max_lat: 165 | cur_lng = min_lng 166 | while cur_lng < max_lng: 167 | yield cur_lat, cur_lng 168 | _, cur_lng = move_coordinate(cur_lat, cur_lng, 0, ch_lng, ch_scale) 169 | cur_lat, _ = move_coordinate(cur_lat, cur_lng, ch_lat, 0, ch_scale) 170 | 171 | 172 | def move_coordinate(lat, lng, ch_lat, ch_lng, ch_scale=None): 173 | """Move latitude/longitude coordinate a given increment 174 | """ 175 | r_earth = get_earth_radius(ch_scale) 176 | new_lat = lat + (ch_lat / r_earth) * (180 / math.pi); 177 | new_lng = lng + (ch_lng / r_earth) * (180 / math.pi) / math.cos(lat * math.pi/180.0) 178 | return new_lat, new_lng 179 | 180 | 181 | def get_zip_codes(filename, min_distance=100, scale='miles', lat_key='Latitude', lng_key='Longitude', zip_key='Zip'): 182 | """Reads CSV file of zip,lat,lng and returns zip codes that aren't within the minimum distance of each other 183 | """ 184 | for zip_code, lat, lng in get_zip_lat_lngs(filename, min_distance, scale, lat_key, lng_key, zip_key): 185 | yield zip_code 186 | 187 | def get_zip_lat_lngs(filename, min_distance=100, scale='miles', lat_key='Latitude', lng_key='Longitude', zip_key='Zip'): 188 | if min_distance > 0: 189 | locations = [] 190 | for record in csv.DictReader(open(filename)): 191 | lat, lng = float(record[lat_key]), float(record[lng_key]) 192 | for other_lat, other_lng in locations: 193 | if distance((lat, lng), (other_lat, other_lng), scale=scale) < min_distance: 194 | break 195 | else: 196 | locations.append((lat, lng)) 197 | yield record[zip_key], record[lat_key], record[lng_key] 198 | else: 199 | for record in csv.DictReader(open(filename)): 200 | yield record[zip_key], record[lat_key], record[lng_key] 201 | 202 | 203 | def find_json_path(e, value, path=''): 204 | """Find the JSON path that points to this value 205 | """ 206 | results = [] 207 | if e == value: 208 | results.append(path) 209 | if isinstance(e, dict): 210 | for k, v in e.items(): 211 | key_path = '{}["{}"]'.format(path, k) 212 | results.extend(find_json_path(v, value, key_path)) 213 | elif isinstance(e, list): 214 | for i, v in enumerate(e): 215 | index_path = '{}[{}]'.format(path, i) 216 | results.extend(find_json_path(v, value, index_path)) 217 | return results 218 | 219 | 220 | # support to generate a random user agent 221 | 222 | # the operating system templates 223 | def linux_os(): 224 | dist = random.choice(['', ' U;', ' Ubuntu;']) 225 | system = random.choice(['', ' x86_64', ' i686']) 226 | return 'X11;%s Linux%s' % (dist, system) 227 | 228 | 229 | def osx_os(): 230 | return 'Macintosh; Intel Mac OS X 10.%d' % random.randint(6, 9) 231 | 232 | 233 | def windows_os(): 234 | system = random.choice(['', '; Win64; x64', '; WOW64']) 235 | return 'Windows NT %d.%d%s' % (random.randint(5, 6), random.randint(0, 2), system) 236 | 237 | 238 | def rand_os(): 239 | return random.choice([linux_os, osx_os, 
windows_os])() 240 | 241 | # the browser templates 242 | def firefox_browser(os_version): 243 | browser_version = random.randint(20, 25) 244 | return 'Mozilla/5.0 (%s; rv:%d.0) Gecko/20100101 Firefox/%d.0' % (os_version, browser_version, browser_version) 245 | 246 | def ie_browser(os_version=None): 247 | os_version = windows_os() # always use windows with IE 248 | return 'Mozilla/5.0 (compatible; MSIE %d.0; %s; Trident/%d.0)' % (random.randint(8, 10), os_version, random.randint(5, 6)) 249 | 250 | def chrome_browser(os_version): 251 | return 'Mozilla/5.0 (%s) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%d.0.%d.%d Safari/537.36' % (os_version, random.randint(28, 32), random.randint(1464, 1667), random.randint(0, 9)) 252 | 253 | 254 | def rand_agent(): 255 | """Returns a random user agent across Firefox, IE, and Chrome on Linux, OSX, and Windows 256 | """ 257 | browser = random.choice([firefox_browser, ie_browser, chrome_browser]) 258 | return browser(rand_os()) 259 | 260 | 261 | -------------------------------------------------------------------------------- /async.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'Helper methods to download and crawl web content using threads' 2 | 3 | import sys 4 | import time 5 | import cookielib 6 | import base64 7 | import signal 8 | import urlparse 9 | import collections 10 | 11 | from twisted.internet import reactor, defer, protocol, endpoints 12 | from twisted.web import client, error, http, http_headers 13 | from twisted.python import failure, log 14 | 15 | import adt, common, download, settings 16 | 17 | 18 | """ 19 | TODO 20 | - support for POST 21 | - efficient get request callback 22 | """ 23 | 24 | 25 | def threaded_get(**kwargs): 26 | """Download using asynchronous single threaded twisted callbacks 27 | """ 28 | tc = TwistedCrawler(**kwargs) 29 | tc.start() 30 | 31 | 32 | class TwistedCrawler: 33 | def __init__(self, url=None, urls=None, url_iter=None, num_threads=20, cb=None, depth=True, max_errors=None, pattern=None, **kwargs): 34 | self.settings = adt.Bag( 35 | read_cache = True, 36 | write_cache = True, 37 | num_redirects = 5, 38 | num_retries = 2, 39 | timeout = 20, 40 | headers = {}, 41 | num_threads = num_threads, 42 | cb = cb, 43 | url_iter = url_iter, 44 | depth = depth, 45 | pattern = pattern 46 | ) 47 | self.settings.update(**kwargs) 48 | self.D = download.Download(**kwargs) 49 | self.kwargs = kwargs 50 | # queue of html to be written to cache 51 | self.cache_queue = [] 52 | # URL's that are waiting to download 53 | self.download_queue = collections.deque() 54 | if urls: 55 | self.download_queue.extend(urls) 56 | if url: 57 | self.download_queue.append(url) # XXX create compressed dict data type for large in memory? 
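        # note: crawl() pops pending URLs from the right of this deque when
        # self.settings.depth is True (LIFO, depth-first crawl) and from the
        # left otherwise (FIFO, breadth-first crawl)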
58 | # URL's currently downloading 59 | self.processing = {} 60 | # defereds that are downloading 61 | self.downloading = [] 62 | # URL's that have been found before 63 | self.found = adt.HashDict() 64 | for url in self.download_queue: 65 | self.found[url] = True 66 | self.state = download.State() 67 | self.max_errors = max_errors 68 | self.num_errors = 0 # counter for the number of subsequent errors 69 | 70 | 71 | def start(self): 72 | """Start the twisted event loop 73 | """ 74 | # catch ctrl-c keyboard event and stop twisted 75 | signal.signal(signal.SIGINT, self.kill) 76 | self.running = True 77 | reactor.callWhenRunning(self.crawl) 78 | reactor.run() 79 | 80 | 81 | def stop(self): 82 | """Stop the twisted event loop 83 | """ 84 | if self.running: 85 | common.logger.info('Twisted eventloop shutting down') 86 | self.running = False 87 | self.state.save() 88 | reactor.stop() 89 | 90 | 91 | def kill(self, *ignore): 92 | """Exit the script 93 | """ 94 | for d in self.downloading: 95 | d.cancel() 96 | self.stop() 97 | sys.exit() 98 | 99 | 100 | def is_finished(self): 101 | """Call finish callback in case more processing to do 102 | """ 103 | for url in self.settings.url_iter or []: 104 | self.download_queue.append(url) 105 | return False 106 | return True 107 | 108 | 109 | def crawl(self): 110 | """Crawl more URLs if available 111 | """ 112 | if self.download_queue or self.processing or self.cache_queue or not self.is_finished(): 113 | #print 'Running: %d, queue: %d, cache: %d, processing: %d, threads: %d' % (self.running, len(self.download_queue), len(self.cache_queue), len(self.processing), self.settings.num_threads) 114 | while self.running and self.download_queue and len(self.processing) < self.settings.num_threads: 115 | url = str(self.download_queue.pop() if self.settings.depth else self.download_queue.popleft()) 116 | self.processing[url] = '' 117 | downloaded = False 118 | if self.D.cache and self.settings.read_cache: 119 | key = self.D.get_key(url, self.settings.data) 120 | try: 121 | html = self.D.cache[key] 122 | except KeyError: 123 | pass 124 | else: 125 | # html is available so scrape this directly 126 | if self.D.invalid_response(html, self.settings.pattern): 127 | # invalid result from download 128 | html = '' 129 | if html or self.settings.num_retries == 0: 130 | reactor.callLater(0, self.scrape, url, html) 131 | downloaded = True 132 | 133 | if downloaded: 134 | # record cache load 135 | self.state.update(num_caches=1) 136 | else: 137 | # need to download this new URL 138 | self.download_start(url) 139 | self.state.update(queue_size=len(self.download_queue)) 140 | 141 | # XXX test inactive 142 | try: 143 | self.inactive_call.cancel() 144 | except AttributeError: 145 | pass # not defined yet 146 | self.inactive_call = reactor.callLater(5*60, self.inactive) 147 | # XXX 148 | 149 | if self.running: 150 | reactor.callLater(0, self.cache_downloads) 151 | reactor.callLater(0, self.crawl) 152 | else: 153 | # save the final state and exit 154 | self.stop() 155 | 156 | 157 | def inactive(self): 158 | common.logger.error('crawler inactive') 159 | common.logger.error('queue (%d): %s' % (len(self.download_queue), ', '.join(self.download_queue))) 160 | common.logger.error('processing (%d): %s' % (len(self.processing), ', '.join(self.processing))) 161 | self.stop() 162 | 163 | 164 | def download_start(self, url, num_retries=0, redirects=None, proxy=None): 165 | """Start URL download 166 | """ 167 | redirects = redirects or [] 168 | redirects.append(url) 169 | if not proxy: 170 | proxy = 
self.D.get_proxy() 171 | self.processing[redirects[0]] = proxy 172 | 173 | headers = {} 174 | headers['User-Agent'] = [self.settings.get('user_agent', self.D.get_user_agent(proxy))] 175 | for name, value in self.settings.headers.items() + settings.default_headers.items(): 176 | if name not in headers: 177 | if not value: 178 | if name == 'Referer': 179 | value = url 180 | headers[name] = [value] 181 | agent = self.build_agent(proxy, headers) 182 | data = None 183 | d = agent.request('GET', url, http_headers.Headers(headers), data) 184 | d.addCallback(self.download_headers, url, num_retries, redirects) 185 | d.addErrback(self.download_error, redirects[0]) 186 | d.addErrback(log.err) 187 | 188 | # timeout to stop download if hangs 189 | timeout_call = reactor.callLater(self.settings.timeout, self.download_timeout, d, url) 190 | self.downloading.append(d) 191 | 192 | def completed(ignore): 193 | # remove timeout callback on completion 194 | if timeout_call.active(): 195 | timeout_call.cancel() 196 | self.downloading.remove(d) 197 | d.addBoth(completed) 198 | 199 | 200 | def download_headers(self, response, url, num_retries, redirects): 201 | """Headers have been returned from download 202 | """ 203 | common.logger.info('Downloading ' + url) 204 | finished = defer.Deferred() 205 | # XXX how to ignore processing body for errors? 206 | response.deliverBody(DownloadPrinter(finished)) 207 | if self.handle_redirect(url, response, num_retries, redirects): 208 | # redirect handled 209 | pass 210 | elif 400 <= response.code < 500: 211 | raise TwistedError(response.phrase) 212 | elif 500 <= response.code < 600: 213 | # server error so try again 214 | message = '%s (%d)' % (response.phrase, response.code) 215 | self.handle_retry(url, message, num_retries, redirects) 216 | elif self.running: 217 | # handle download 218 | finished.addCallbacks(self.download_complete, self.download_error, 219 | callbackArgs=[num_retries, redirects], errbackArgs=[redirects[0]] 220 | ) 221 | finished.addErrback(self.download_error, redirects[0]) 222 | 223 | 224 | def download_complete(self, html, num_retries, redirects): 225 | """Body has completed downloading 226 | """ 227 | redirect_url = download.get_redirect(redirects[0], html) 228 | if redirect_url: 229 | # meta redirect 230 | proxy = self.processing[redirects[0]] 231 | reactor.callLater(0, self.download_start, redirect_url, 0, redirects, proxy) 232 | elif self.D.invalid_response(html, self.settings.pattern): 233 | # invalid result from download 234 | message = 'Content did not match expected pattern' 235 | self.handle_retry(redirects[0], message, num_retries, redirects) 236 | 237 | else: 238 | # successful download 239 | self.num_errors = 0 240 | self.state.update(num_downloads=1) 241 | if self.D.cache and self.settings.write_cache: 242 | self.cache_queue.append((redirects, html)) 243 | reactor.callLater(0, self.scrape, redirects[0], html) 244 | 245 | 246 | def download_timeout(self, d, url): 247 | """Catch timeout error and cancel request 248 | """ 249 | self.downloading.remove(d) 250 | d.cancel() 251 | 252 | 253 | def download_error(self, reason, url): 254 | """Error received during download 255 | """ 256 | # XXX how to properly pass error from download timeout cancel 257 | error = reason.getErrorMessage() or 'Download timeout' 258 | common.logger.warning('Download error: %s: %s' % (error, url)) 259 | self.state.update(num_errors=1) 260 | if self.D.cache and self.settings.write_cache: 261 | self.cache_queue.append((url, '')) 262 | del self.processing[url] 263 | # 
check whether to give up the crawl 264 | self.num_errors += 1 265 | if self.max_errors is not None: 266 | common.logger.debug('Errors: %d / %d' % (self.num_errors, self.max_errors)) 267 | if self.num_errors > self.max_errors: 268 | common.logger.error('Too many download errors, shutting down') 269 | self.stop() 270 | 271 | 272 | def handle_retry(self, url, message, num_retries, redirects): 273 | """Handle retrying a download error 274 | """ 275 | if num_retries < self.settings.num_retries: 276 | # retry the download 277 | common.logger.info('Download retry: %d: %s' % (num_retries, url)) 278 | reactor.callLater(0, self.download_start, url, num_retries+1, redirects) 279 | else: 280 | # out of retries 281 | raise TwistedError('Retry failure: %s' % message) 282 | 283 | 284 | def handle_redirect(self, url, response, num_retries, redirects): 285 | """Handle redirects - the builtin RedirectAgent does not handle relative redirects 286 | """ 287 | if response.code in (301, 302, 303, 307): 288 | # redirect HTTP code 289 | locations = response.headers.getRawHeaders('location', []) 290 | if locations: 291 | # a new redirect url 292 | if len(redirects) < self.settings.num_redirects: 293 | # can still redirect 294 | redirect_url = urlparse.urljoin(url, locations[0]) 295 | if redirect_url != url: 296 | # new redirect URL 297 | redirects.append(url) 298 | reactor.callLater(0, self.download_start, redirect_url, num_retries, redirects) 299 | return True 300 | return False 301 | 302 | 303 | def scrape(self, url, html): 304 | """Pass completed body to callback for scraping 305 | """ 306 | del self.processing[url] 307 | if self.settings.cb and self.running: 308 | try: 309 | # get links crawled from webpage 310 | links = self.settings.cb(self.D, url, html) or [] 311 | except download.StopCrawl: 312 | common.logger.info('Stopping crawl signal') 313 | self.stop() 314 | except Exception as e: 315 | common.logger.exception('\nIn callback for: ' + str(url)) 316 | else: 317 | # add new links to queue 318 | for link in links: 319 | cb_url = urlparse.urljoin(url, link) 320 | if cb_url not in self.found: 321 | self.found[cb_url] = True 322 | self.download_queue.append(cb_url) 323 | 324 | 325 | def build_pool(self): 326 | """Create connection pool 327 | """ 328 | # XXX create limited number of instances 329 | pool = client.HTTPConnectionPool(reactor, persistent=True) 330 | # 1 connection for each proxy or thread 331 | # XXX will this take too much memory? 332 | pool.maxPersistentPerHost = len(self.D.settings.proxies) or self.settings.num_threads 333 | pool.cachedConnectionTimeout = 240 334 | return pool 335 | 336 | 337 | #agents = {} 338 | cookiejars = {} 339 | def build_agent(self, proxy, headers): 340 | """Build an agent for this request 341 | """ 342 | fragments = common.parse_proxy(proxy) 343 | pool = self.build_pool() 344 | if fragments.host: 345 | # add proxy authentication header 346 | auth = base64.b64encode("%s:%s" % (fragments.username, fragments.password)) 347 | headers['Proxy-Authorization'] = ["Basic " + auth.strip()] 348 | # generate the agent 349 | endpoint = endpoints.TCP4ClientEndpoint(reactor, fragments.host, int(fragments.port), timeout=self.settings.timeout) 350 | agent = client.ProxyAgent(endpoint, reactor=reactor, pool=pool) 351 | else: 352 | agent = client.Agent(reactor, connectTimeout=self.settings.timeout, pool=pool) 353 | 354 | agent = client.ContentDecoderAgent(agent, [('gzip', client.GzipDecoder)]) 355 | # XXX if use same cookie for all then works... 
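        # note: a separate CookieJar is kept per proxy below, so cookies collected
        # through one proxy are only reused for later requests through that proxy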
356 | # cookies usually empty 357 | if proxy in self.cookiejars: 358 | cj = self.cookiejars[proxy] 359 | else: 360 | cj = cookielib.CookieJar() 361 | self.cookiejars[proxy] = cj 362 | agent = client.CookieAgent(agent, cj) 363 | return agent 364 | 365 | 366 | def cache_downloads(self): 367 | """Cache the downloaded HTML 368 | """ 369 | if self.cache_queue: 370 | while self.cache_queue: 371 | redirects, html = self.cache_queue.pop() 372 | common.logger.debug('Cached: %d' % len(self.cache_queue)) 373 | url = redirects[0] 374 | self.D[url] = html 375 | final_url = redirects[-1] 376 | if url != final_url: 377 | # store the redirect map 378 | self.D.cache.meta(start_url, dict(url=final_url)) 379 | 380 | 381 | class TwistedError(Exception): 382 | pass 383 | 384 | 385 | class DownloadPrinter(protocol.Protocol): 386 | """Collect together body requests 387 | """ 388 | def __init__(self, finished): 389 | self.finished = finished 390 | self.data = [] 391 | 392 | def dataReceived(self, page): 393 | self.data.append(page) 394 | 395 | def connectionLost(self, reason): 396 | if str(reason.value) not in ('', 'Response body fully received'): 397 | common.logger.info('Download body error: ' + str(reason.value)) 398 | html = ''.join(self.data) 399 | self.finished.callback(html) 400 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __doc__ = 'Common web scraping related functions' 3 | 4 | import os 5 | import re 6 | import sys 7 | import csv 8 | csv.field_size_limit(sys.maxint) 9 | import time 10 | import glob 11 | import json 12 | import string 13 | import urllib 14 | import string 15 | import urllib2 16 | import urlparse 17 | import cookielib 18 | import itertools 19 | import htmlentitydefs 20 | import logging 21 | import logging.handlers 22 | import threading 23 | import collections 24 | from datetime import datetime, timedelta 25 | try: 26 | # should use pysqlite2 to read the cookies.sqlite on Windows 27 | # otherwise will raise the "sqlite3.DatabaseError: file is encrypted or is not a database" exception 28 | from pysqlite2 import dbapi2 as sqlite3 29 | except ImportError: 30 | import sqlite3 31 | import adt 32 | import settings 33 | 34 | try: 35 | import json 36 | except ImportError: 37 | import simplejson as json 38 | 39 | 40 | class WebScrapingError(Exception): 41 | pass 42 | 43 | 44 | # known media file extensions 45 | MEDIA_EXTENSIONS = ['ai', 'aif', 'aifc', 'aiff', 'asc', 'avi', 'bcpio', 'bin', 'c', 'cc', 'ccad', 'cdf', 'class', 'cpio', 'cpt', 'csh', 'css', 'csv', 'dcr', 'dir', 'dms', 'doc', 'drw', 'dvi', 'dwg', 'dxf', 'dxr', 'eps', 'etx', 'exe', 'ez', 'f', 'f90', 'fli', 'flv', 'gif', 'gtar', 'gz', 'h', 'hdf', 'hh', 'hqx', 'ice', 'ico', 'ief', 'iges', 'igs', 'imq', 'ips', 'ipx', 'jpe', 'jpeg', 'jpg', 'js', 'kar', 'latex', 'lha', 'lsp', 'lzh', 'm', 'man', 'me', 'mesh', 'mid', 'midi', 'mif', 'mime', 'mov', 'movie', 'mp2', 'mp3', 'mpe', 'mpeg', 'mpg', 'mpga', 'ms', 'msh', 'nc', 'oda', 'pbm', 'pdb', 'pdf', 'pgm', 'pgn', 'png', 'pnm', 'pot', 'ppm', 'pps', 'ppt', 'ppz', 'pre', 'prt', 'ps', 'qt', 'ra', 'ram', 'ras', 'raw', 'rgb', 'rm', 'roff', 'rpm', 'rtf', 'rtx', 'scm', 'set', 'sgm', 'sgml', 'sh', 'shar', 'silo', 'sit', 'skd', 'skm', 'skp', 'skt', 'smi', 'smil', 'snd', 'sol', 'spl', 'src', 'step', 'stl', 'stp', 'sv4cpio', 'sv4crc', 'swf', 't', 'tar', 'tcl', 'tex', 'texi', 'tif', 'tiff', 'tr', 'tsi', 'tsp', 'tsv', 'unv', 'ustar', 'vcd', 'vda', 'viv', 
'vivo', 'vrml', 'w2p', 'wav', 'wmv', 'wrl', 'xbm', 'xlc', 'xll', 'xlm', 'xls', 'xlw', 'xml', 'xpm', 'xsl', 'xwd', 'xyz', 'zip'] 46 | 47 | # tags that do not contain content 48 | EMPTY_TAGS = 'br', 'hr', 'meta', 'link', 'base', 'img', 'embed', 'param', 'area', 'col', 'input' 49 | 50 | 51 | def to_ascii(html): 52 | """Return ascii part of html 53 | """ 54 | return ''.join(c for c in (html or '') if ord(c) < 128) 55 | 56 | def to_int(s, default=0): 57 | """Return integer from this string 58 | 59 | >>> to_int('90') 60 | 90 61 | >>> to_int('-90.2432') 62 | -90 63 | >>> to_int('a90a') 64 | 90 65 | >>> to_int('a') 66 | 0 67 | >>> to_int('a', 90) 68 | 90 69 | """ 70 | return int(to_float(s, default)) 71 | 72 | def to_float(s, default=0.0): 73 | """Return float from this string 74 | 75 | >>> to_float('90.45') 76 | 90.45 77 | >>> to_float('') 78 | 0.0 79 | >>> to_float('90') 80 | 90.0 81 | >>> to_float('..9') 82 | 0.0 83 | >>> to_float('.9') 84 | 0.9 85 | >>> to_float(None) 86 | 0.0 87 | >>> to_float(1) 88 | 1.0 89 | """ 90 | result = default 91 | if s: 92 | valid = string.digits + '.-' 93 | try: 94 | result = float(''.join(c for c in str(s) if c in valid)) 95 | except ValueError: 96 | pass # input does not contain a number 97 | return result 98 | 99 | 100 | def to_unicode(obj, encoding=settings.default_encoding): 101 | """Convert obj to unicode 102 | """ 103 | if isinstance(obj, basestring): 104 | if not isinstance(obj, unicode): 105 | obj = obj.decode(encoding, 'ignore') 106 | return obj 107 | 108 | 109 | def html_to_unicode(html, charset=settings.default_encoding): 110 | """Convert html to unicode, decoding by specified charset when available 111 | """ 112 | m = re.compile(r']*charset=\s*([a-z\d\-]+)', re.IGNORECASE).search(html) 113 | if m: 114 | charset = m.groups()[0].strip().lower() 115 | 116 | return to_unicode(html, charset) 117 | 118 | 119 | def is_html(html): 120 | """Returns whether content is likely HTML based on search for common tags 121 | """ 122 | try: 123 | result = re.search('html|head|body', html) is not None 124 | except TypeError: 125 | result = False 126 | return result 127 | 128 | 129 | def is_url(text): 130 | """Returns whether passed text is a URL 131 | 132 | >>> is_url('abc') 133 | False 134 | >>> is_url('webscraping.com') 135 | False 136 | >>> is_url('http://webscraping.com/blog') 137 | True 138 | """ 139 | return re.match('https?://', text) is not None 140 | 141 | 142 | def unique(l): 143 | """Remove duplicates from list, while maintaining order 144 | 145 | >>> unique([3,6,4,4,6]) 146 | [3, 6, 4] 147 | >>> unique([]) 148 | [] 149 | >>> unique([3,6,4]) 150 | [3, 6, 4] 151 | """ 152 | checked = [] 153 | for e in l: 154 | if e not in checked: 155 | checked.append(e) 156 | return checked 157 | 158 | 159 | def flatten(l): 160 | """Flatten a list of lists into a single list 161 | 162 | >>> flatten([[1,2,3], [4,5,6]]) 163 | [1, 2, 3, 4, 5, 6] 164 | """ 165 | return [item for sublist in l for item in sublist] 166 | 167 | 168 | def nth(l, i, default=''): 169 | """Return nth item from list or default value if out of range 170 | """ 171 | try: 172 | return l[i] 173 | except IndexError: 174 | return default 175 | 176 | def first(l, default=''): 177 | """Return first element from list or default value if out of range 178 | 179 | >>> first([1,2,3]) 180 | 1 181 | >>> first([], None) 182 | 183 | """ 184 | return nth(l, i=0, default=default) 185 | 186 | def last(l, default=''): 187 | """Return last element from list or default value if out of range 188 | """ 189 | return nth(l, i=-1, 
default=default) 190 | 191 | 192 | def pad(l, size, default=None, end=True): 193 | """Return list of given size 194 | Insert elements of default value if too small 195 | Remove elements if too large 196 | Manipulate end of list if end is True, else start 197 | 198 | >>> pad(range(5), 5) 199 | [0, 1, 2, 3, 4] 200 | >>> pad(range(5), 3) 201 | [0, 1, 2] 202 | >>> pad(range(5), 7, -1) 203 | [0, 1, 2, 3, 4, -1, -1] 204 | >>> pad(range(5), 7, end=False) 205 | [None, None, 0, 1, 2, 3, 4] 206 | """ 207 | while len(l) < size: 208 | if end: 209 | l.append(default) 210 | else: 211 | l.insert(0, default) 212 | while len(l) > size: 213 | if end: 214 | l.pop() 215 | else: 216 | l.pop(0) 217 | return l 218 | 219 | 220 | def remove_tags(html, keep_children=True): 221 | """Remove HTML tags leaving just text 222 | If keep children is True then keep text within child tags 223 | 224 | >>> remove_tags('hello world!') 225 | 'hello world!' 226 | >>> remove_tags('hello world!', False) 227 | 'hello !' 228 | >>> remove_tags('hello
<br>world<br />
!', False) 229 | 'hello world!' 230 | >>> remove_tags('test', False) 231 | 'test' 232 | """ 233 | html = re.sub('<(%s)[^>]*>' % '|'.join(EMPTY_TAGS), '', html) 234 | if not keep_children: 235 | for tag in unique(re.findall('<(\w+?)\W', html)): 236 | if tag not in EMPTY_TAGS: 237 | html = re.compile('<\s*%s.*?>.*?' % (tag, tag), re.DOTALL).sub('', html) 238 | return re.compile('<[^<]*?>').sub('', html) 239 | 240 | 241 | def unescape(text, encoding=settings.default_encoding, keep_unicode=False): 242 | """Interpret escape characters 243 | 244 | >>> unescape('<hello &%20world>') 245 | '' 246 | """ 247 | if not text: 248 | return '' 249 | try: 250 | text = to_unicode(text, encoding) 251 | except UnicodeError: 252 | pass 253 | 254 | def fixup(m): 255 | text = m.group(0) 256 | if text[:2] == '&#': 257 | # character reference 258 | try: 259 | if text[:3] == '&#x': 260 | return unichr(int(text[3:-1], 16)) 261 | else: 262 | return unichr(int(text[2:-1])) 263 | except ValueError: 264 | pass 265 | else: 266 | # named entity 267 | try: 268 | text = unichr(htmlentitydefs.name2codepoint[text[1:-1].lower()]) 269 | except KeyError: 270 | pass 271 | return text # leave as is 272 | text = re.sub('&#?\w+;', fixup, text) 273 | text = urllib.unquote(text) 274 | if keep_unicode: 275 | return text 276 | try: 277 | text = text.encode(encoding, 'ignore') 278 | except UnicodeError: 279 | pass 280 | 281 | if encoding != 'utf-8': 282 | return text 283 | 284 | # remove annoying characters 285 | chars = { 286 | '\xc2\x82' : ',', # High code comma 287 | '\xc2\x84' : ',,', # High code double comma 288 | '\xc2\x85' : '...', # Tripple dot 289 | '\xc2\x88' : '^', # High carat 290 | '\xc2\x91' : '\x27', # Forward single quote 291 | '\xc2\x92' : '\x27', # Reverse single quote 292 | '\xc2\x93' : '\x22', # Forward double quote 293 | '\xc2\x94' : '\x22', # Reverse double quote 294 | '\xc2\x95' : ' ', 295 | '\xc2\x96' : '-', # High hyphen 296 | '\xc2\x97' : '--', # Double hyphen 297 | '\xc2\x99' : ' ', 298 | '\xc2\xa0' : ' ', 299 | '\xc2\xa6' : '|', # Split vertical bar 300 | '\xc2\xab' : '<<', # Double less than 301 | '\xc2\xae' : '®', 302 | '\xc2\xbb' : '>>', # Double greater than 303 | '\xc2\xbc' : '1/4', # one quarter 304 | '\xc2\xbd' : '1/2', # one half 305 | '\xc2\xbe' : '3/4', # three quarters 306 | '\xca\xbf' : '\x27', # c-single quote 307 | '\xcc\xa8' : '', # modifier - under curve 308 | '\xcc\xb1' : '' # modifier - under line 309 | } 310 | def replace_chars(match): 311 | char = match.group(0) 312 | return chars[char] 313 | 314 | return re.sub('(' + '|'.join(chars.keys()) + ')', replace_chars, text) 315 | 316 | 317 | def normalize(s, encoding=settings.default_encoding, newlines=False): 318 | """Normalize the string by removing tags, unescaping, and removing surrounding whitespace 319 | 320 | >>> normalize('Tel.: 029 - 12345678 ') 321 | 'Tel.: 029 - 12345678' 322 | """ 323 | if isinstance(s, basestring): 324 | # remove tags and set encoding 325 | s = unescape(remove_tags(s), encoding=encoding, keep_unicode=isinstance(s, unicode)) 326 | if newlines: 327 | # keep multiple newlines 328 | s = re.sub('[\n\r]+', '\n', s) 329 | s = re.sub('[ \t\f\v]+', ' ', s) 330 | else: 331 | # replace all subsequent whitespace with single space 332 | s = re.sub('[\s]+', ' ', s) 333 | s = re.compile('', re.DOTALL).sub('', s).strip() 334 | return s 335 | 336 | 337 | def regex_get(html, pattern, index=None, normalized=True, flag=re.DOTALL|re.IGNORECASE, default='', one=False): 338 | """Helper method to extract content from regular expression 339 
| 340 | >>> regex_get('<div>Phone: 029 01054609</div>', r'Phone:([^<>]+)') 341 | '029 01054609' 342 | >>> regex_get('<div>Phone: 029 01054609</div>
', r'Phone:\s*(\d+) (\d+)') 343 | ['029', '01054609'] 344 | """ 345 | m = re.compile(pattern, flag).search(html) 346 | if m: 347 | if len(m.groups()) == 1: 348 | return normalize(m.groups()[0]) if normalized else m.groups()[0] 349 | elif index != None: 350 | return normalize(m.groups()[index]) if normalized else m.groups()[index] 351 | else: 352 | return [normalize(item) if normalized else item for item in m.groups()] 353 | return default 354 | 355 | 356 | def parse_jsonp(s): 357 | try: 358 | rindex = s.index('(') 359 | lindex = s.rindex(')') 360 | except IndexError: 361 | pass 362 | else: 363 | return json.loads(s[rindex+1 : lindex]) 364 | 365 | 366 | def safe(s): 367 | """Return characters in string that are safe for URLs 368 | 369 | >>> safe('U@#$_#^&*-2') 370 | 'U_-2' 371 | """ 372 | safe_chars = string.letters + string.digits + '-_ ' 373 | return ''.join(c for c in s if c in safe_chars).replace(' ', '-') 374 | 375 | 376 | def pretty(s): 377 | """Return pretty version of string for display 378 | 379 | >>> pretty('hello_world') 380 | 'Hello World' 381 | """ 382 | return re.sub('[-_]', ' ', s.title()) 383 | 384 | 385 | def pretty_paragraph(s): 386 | """Return pretty version of text in paragraph for display 387 | """ 388 | s = re.sub('<(br|hr|/li)[^>]*>', '\n', s, re.IGNORECASE) 389 | s = unescape(remove_tags(s)) 390 | def fixup(m): 391 | text = m.group(0) 392 | if '\r' in text or '\n' in text: return '\n' 393 | return ' ' 394 | return re.sub('\s+', fixup, s).strip() 395 | 396 | 397 | def get_extension(url): 398 | """Return extension from given URL 399 | 400 | >>> get_extension('hello_world.JPG') 401 | 'jpg' 402 | >>> get_extension('http://www.google-analytics.com/__utm.gif?utmwv=1.3&utmn=420639071') 403 | 'gif' 404 | """ 405 | return os.path.splitext(urlparse.urlsplit(url).path)[-1].lower().replace('.', '') 406 | 407 | 408 | def get_domain(url): 409 | """Extract the domain from the given URL 410 | 411 | >>> get_domain('http://www.google.com.au/tos.html') 412 | 'google.com.au' 413 | >>> get_domain('www.google.com') 414 | 'google.com' 415 | """ 416 | m = re.compile(r"^.*://(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})").search(url) 417 | if m: 418 | # an IP address 419 | return m.groups()[0] 420 | 421 | suffixes = 'ac', 'ad', 'ae', 'aero', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ao', 'aq', 'ar', 'arpa', 'as', 'asia', 'at', 'au', 'aw', 'ax', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bg', 'bh', 'bi', 'biz', 'bj', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bv', 'bw', 'by', 'bz', 'ca', 'cat', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci', 'ck', 'cl', 'cm', 'cn', 'co', 'com', 'coop', 'cr', 'cu', 'cv', 'cx', 'cy', 'cz', 'de', 'dj', 'dk', 'dm', 'do', 'dz', 'ec', 'edu', 'ee', 'eg', 'er', 'es', 'et', 'eu', 'fi', 'fj', 'fk', 'fm', 'fo', 'fr', 'ga', 'gb', 'gd', 'ge', 'gf', 'gg', 'gh', 'gi', 'gl', 'gm', 'gn', 'gov', 'gp', 'gq', 'gr', 'gs', 'gt', 'gu', 'gw', 'gy', 'hk', 'hm', 'hn', 'hr', 'ht', 'hu', 'id', 'ie', 'il', 'im', 'in', 'info', 'int', 'io', 'iq', 'ir', 'is', 'it', 'je', 'jm', 'jo', 'jobs', 'jp', 'ke', 'kg', 'kh', 'ki', 'km', 'kn', 'kp', 'kr', 'kw', 'ky', 'kz', 'la', 'lb', 'lc', 'li', 'lk', 'lr', 'ls', 'lt', 'lu', 'lv', 'ly', 'ma', 'mc', 'md', 'me', 'mg', 'mh', 'mil', 'mk', 'ml', 'mm', 'mn', 'mo', 'mobi', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'mv', 'mw', 'mx', 'my', 'mz', 'na', 'name', 'nc', 'ne', 'net', 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 'nu', 'nz', 'om', 'org', 'pa', 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr', 'pro', 'ps', 'pt', 'pw', 'py', 'qa', 're', 'ro', 'rs', 'ru', 'rw', 'sa', 'sb', 'sc', 'sd', 'se', 'sg', 
'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn', 'so', 'sr', 'st', 'su', 'sv', 'sy', 'sz', 'tc', 'td', 'tel', 'tf', 'tg', 'th', 'tj', 'tk', 'tl', 'tm', 'tn', 'to', 'tp', 'tr', 'tt', 'tv', 'tw', 'tz', 'ua', 'ug', 'uk', 'us', 'uy', 'uz', 'va', 'vc', 've', 'vg', 'vi', 'vn', 'vu', 'wf', 'ws', 'xn', 'ye', 'yt', 'za', 'zm', 'zw' 422 | url = re.sub('^.*://', '', url).partition('/')[0].lower() 423 | domain = [] 424 | for section in url.split('.'): 425 | if section in suffixes: 426 | domain.append(section) 427 | else: 428 | domain = [section] 429 | return '.'.join(domain) 430 | 431 | 432 | def same_domain(url1, url2): 433 | """Return whether URLs belong to same domain 434 | 435 | >>> same_domain('http://www.google.com.au', 'code.google.com') 436 | True 437 | >>> same_domain('http://www.facebook.com', 'http://www.myspace.com') 438 | False 439 | """ 440 | server1 = get_domain(url1) 441 | server2 = get_domain(url2) 442 | return server1 and server2 and (server1 in server2 or server2 in server1) 443 | 444 | 445 | def pretty_duration(dt): 446 | """Return english description of this time difference 447 | 448 | >>> from datetime import timedelta 449 | >>> pretty_duration(timedelta(seconds=1)) 450 | '1 second' 451 | >>> pretty_duration(timedelta(hours=1)) 452 | '1 hour' 453 | >>> pretty_duration(timedelta(days=2)) 454 | '2 days' 455 | """ 456 | if isinstance(dt, datetime): 457 | # convert datetime to timedelta 458 | dt = datetime.now() - dt 459 | if not isinstance(dt, timedelta): 460 | return '' 461 | if dt.days >= 2*365: 462 | return '%d years' % int(dt.days / 365) 463 | elif dt.days >= 365: 464 | return '1 year' 465 | elif dt.days >= 60: 466 | return '%d months' % int(dt.days / 30) 467 | elif dt.days > 21: 468 | return '1 month' 469 | elif dt.days >= 14: 470 | return '%d weeks' % int(dt.days / 7) 471 | elif dt.days >= 7: 472 | return '1 week' 473 | elif dt.days > 1: 474 | return '%d days' % dt.days 475 | elif dt.days == 1: 476 | return '1 day' 477 | elif dt.seconds >= 2*60*60: 478 | return '%d hours' % int(dt.seconds / 3600) 479 | elif dt.seconds >= 60*60: 480 | return '1 hour' 481 | elif dt.seconds >= 2*60: 482 | return '%d minutes' % int(dt.seconds / 60) 483 | elif dt.seconds >= 60: 484 | return '1 minute' 485 | elif dt.seconds > 1: 486 | return '%d seconds' % dt.seconds 487 | elif dt.seconds == 1: 488 | return '1 second' 489 | else: 490 | return '' 491 | 492 | 493 | def parse_proxy(proxy): 494 | """Parse a proxy into its fragments 495 | Returns a dict with username, password, host, and port 496 | 497 | >>> f = parse_proxy('login:pw@66.197.208.200:8080') 498 | >>> f.username 499 | 'login' 500 | >>> f.password 501 | 'pw' 502 | >>> f.host 503 | '66.197.208.200' 504 | >>> f.port 505 | '8080' 506 | >>> f = parse_proxy('66.197.208.200') 507 | >>> f.username == f.password == f.port == '' 508 | True 509 | >>> f.host 510 | '66.197.208.200' 511 | """ 512 | fragments = adt.Bag() 513 | if isinstance(proxy, basestring): 514 | match = re.match('((?P\w+):(?P\w+)@)?(?P\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})(:(?P\d+))?', proxy) 515 | if match: 516 | groups = match.groupdict() 517 | fragments.username = groups.get('username') or '' 518 | fragments.password = groups.get('password') or '' 519 | fragments.host = groups.get('host') 520 | fragments.port = groups.get('port') or '' 521 | return fragments 522 | 523 | 524 | def read_list(file): 525 | """Return file as list if exists 526 | """ 527 | l = [] 528 | if os.path.exists(file): 529 | l.extend(open(file).read().splitlines()) 530 | else: 531 | logger.debug('%s not found' % file) 532 | 
return l 533 | 534 | 535 | class UnicodeWriter: 536 | """A CSV writer that produces Excel-compatible CSV files from unicode data. 537 | 538 | file: 539 | can either be a filename or a file object 540 | encoding: 541 | the encoding to use for output 542 | mode: 543 | the mode for writing to file 544 | unique: 545 | if True then will only write unique rows to output 546 | unique_by: 547 | make the rows unique by these columns(the value is a list of indexs), default by all columns 548 | quoting: 549 | csv module quoting style to use 550 | utf8_bom: 551 | whether need to add the BOM 552 | auto_repair: 553 | whether need to remove the invalid rows automatically 554 | 555 | >>> from StringIO import StringIO 556 | >>> fp = StringIO() 557 | >>> writer = UnicodeWriter(fp, quoting=csv.QUOTE_MINIMAL) 558 | >>> writer.writerow(['a', '1']) 559 | >>> writer.flush() 560 | >>> fp.seek(0) 561 | >>> fp.read().strip() 562 | 'a,1' 563 | """ 564 | def __init__(self, file, encoding=settings.default_encoding, mode='wb', unique=False, unique_by=None, quoting=csv.QUOTE_ALL, utf8_bom=False, auto_repair=False, **argv): 565 | self.encoding = encoding 566 | self.unique = unique 567 | self.unique_by = unique_by 568 | if hasattr(file, 'write'): 569 | self.fp = file 570 | else: 571 | if auto_repair: 572 | self._remove_invalid_rows(file=file, quoting=quoting, **argv) 573 | if utf8_bom: 574 | self.fp = open(file, 'wb') 575 | self.fp.write('\xef\xbb\xbf') 576 | self.fp.close() 577 | self.fp = open(file, mode=mode.replace('w', 'a')) 578 | else: 579 | self.fp = open(file, mode) 580 | if self.unique: 581 | self.rows = adt.HashDict() # cache the rows that have already been written 582 | for row in csv.reader(open(self.fp.name)): 583 | self.rows[self._unique_key(row)] = True 584 | self.writer = csv.writer(self.fp, quoting=quoting, **argv) 585 | 586 | def _unique_key(self, row): 587 | """Generate the unique key 588 | """ 589 | return '_'.join([str(row[i]) for i in self.unique_by]) if self.unique_by else str(row) 590 | 591 | def _remove_invalid_rows(self, file, **argv): 592 | """Remove invalid csv rows e.g. 
newline inside string 593 | """ 594 | if os.path.exists(file): 595 | file_obj = open(file) 596 | tmp_file = file + '.tmp' 597 | tmp_file_obj = open(tmp_file, 'wb') 598 | writer = csv.writer(tmp_file_obj, **argv) 599 | try: 600 | for row in csv.reader(file_obj): 601 | writer.writerow(row) 602 | except Exception, e: 603 | pass 604 | file_obj.close() 605 | tmp_file_obj.close() 606 | os.remove(file) 607 | os.rename(tmp_file, file) 608 | 609 | def _cell(self, s): 610 | """Normalize the content for this cell 611 | """ 612 | if isinstance(s, basestring): 613 | if isinstance(s, unicode): 614 | s = s.encode(self.encoding, 'ignore') 615 | elif s is None: 616 | s = '' 617 | else: 618 | s = str(s) 619 | return s 620 | 621 | def writerow(self, row): 622 | """Write row to output 623 | """ 624 | row = [self._cell(col) for col in row] 625 | if self.unique: 626 | if self._unique_key(row) not in self.rows: 627 | self.writer.writerow(row) 628 | self.rows[self._unique_key(row)] = True 629 | else: 630 | self.writer.writerow(row) 631 | 632 | def writerows(self, rows): 633 | """Write multiple rows to output 634 | """ 635 | for row in rows: 636 | self.writerow(row) 637 | 638 | def flush(self): 639 | """Flush output to disk 640 | """ 641 | self.fp.flush() 642 | if hasattr(self.fp, 'fileno'): 643 | # this is a real file 644 | os.fsync(self.fp.fileno()) 645 | 646 | def close(self): 647 | """Close the output file pointer 648 | """ 649 | self.fp.close() 650 | 651 | 652 | def csv_to_xls(filename): 653 | from xlsxwriter.workbook import Workbook 654 | workbook = Workbook(filename[:-4] + '.xlsx') 655 | worksheet = workbook.add_worksheet() 656 | with open(filename, 'rt') as f: 657 | reader = csv.reader(f) 658 | for r, row in enumerate(reader): 659 | for c, col in enumerate(row): 660 | worksheet.write(r, c, col.decode('utf-8')) 661 | workbook.close() 662 | 663 | 664 | 665 | # decrypt chrome cookies 666 | class Chrome: 667 | def __init__(self): 668 | import keyring 669 | from Crypto.Protocol.KDF import PBKDF2 670 | salt = b'saltysalt' 671 | length = 16 672 | # If running Chrome on OSX 673 | if sys.platform == 'darwin': 674 | my_pass = keyring.get_password('Chrome Safe Storage', 'Chrome') 675 | my_pass = my_pass.encode('utf8') 676 | iterations = 1003 677 | self.cookie_file = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default/Cookies') 678 | 679 | # If running Chromium on Linux 680 | elif 'linux' in sys.platform: 681 | my_pass = 'peanuts'.encode('utf8') 682 | iterations = 1 683 | self.cookie_file = os.path.expanduser('~/.config/chromium/Default/Cookies') 684 | else: 685 | raise Exception("This script only works on OSX or Linux.") 686 | self.key = PBKDF2(my_pass, salt, length, iterations) 687 | 688 | def decrypt(self, value, encrypted_value): 689 | if value or (encrypted_value[:3] != b'v10'): 690 | return value 691 | 692 | from Crypto.Cipher import AES 693 | 694 | # Encrypted cookies should be prefixed with 'v10' according to the 695 | # Chromium code. Strip it off. 696 | encrypted_value = encrypted_value[3:] 697 | 698 | # Strip padding by taking off number indicated by padding 699 | # eg if last is '\x0e' then ord('\x0e') == 14, so take off 14. 700 | # You'll need to change this function to use ord() for python2. 
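        # worked example (hypothetical plaintext): AES-CBC decryption of a cookie
        # might yield b'chocolate\x07\x07\x07\x07\x07\x07\x07'; the last byte is 7,
        # so clean() drops the final 7 padding bytes and returns 'chocolate'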
701 | def clean(x): 702 | return x[:-ord(x[-1])].decode('utf8') 703 | 704 | iv = b' ' * 16 705 | cipher = AES.new(self.key, AES.MODE_CBC, IV=iv) 706 | decrypted = cipher.decrypt(encrypted_value) 707 | return clean(decrypted) 708 | 709 | 710 | # XXX merge common parts with firefox 711 | def chrome_cookie(filename=None, tmp_sqlite_file='cookies.sqlite', tmp_cookie_file='cookies.txt'): 712 | if filename is None: 713 | filename = os.path.expanduser("~/.config/google-chrome/Default/Cookies") 714 | if not os.path.exists(filename): 715 | raise WebScrapingError('Can not find chrome cookie file') 716 | 717 | open(tmp_sqlite_file, 'wb').write(open(filename, 'rb').read()) 718 | con = sqlite3.connect(tmp_sqlite_file) 719 | cur = con.cursor() 720 | cur.execute('SELECT host_key, path, secure, expires_utc, name, value, encrypted_value FROM cookies;') 721 | # create standard cookies file that can be interpreted by cookie jar 722 | # XXX change to create directly without temp file 723 | fp = open(tmp_cookie_file, 'w') 724 | fp.write('# Netscape HTTP Cookie File\n') 725 | fp.write('# http://www.netscape.com/newsref/std/cookie_spec.html\n') 726 | fp.write('# This is a generated file! Do not edit.\n') 727 | ftstr = ['FALSE', 'TRUE'] 728 | chrome = Chrome() 729 | for item in cur.fetchall(): 730 | value = chrome.decrypt(item[5], item[6]) 731 | row = u'%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (item[0], ftstr[item[0].startswith('.')], item[1], ftstr[item[2]], item[3], item[4], value) 732 | fp.write(row) 733 | 734 | fp.close() 735 | # close the connection before delete the sqlite file 736 | con.close() 737 | os.remove(tmp_sqlite_file) 738 | 739 | cookie_jar = cookielib.MozillaCookieJar() 740 | cookie_jar.load(tmp_cookie_file) 741 | os.remove(tmp_cookie_file) 742 | 743 | return cookie_jar 744 | 745 | 746 | 747 | def firefox_cookie(file=None, tmp_sqlite_file='cookies.sqlite', tmp_cookie_file='cookies.txt'): 748 | """Create a cookie jar from this FireFox 3 sqlite cookie database 749 | 750 | >>> cj = firefox_cookie() 751 | >>> opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) 752 | >>> url = 'http://code.google.com/p/webscraping' 753 | >>> html = opener.open(url).read() 754 | """ 755 | if file is None: 756 | try: 757 | # add Windows version support 758 | file = (glob.glob(os.path.join(os.environ.get('PROGRAMFILES', ''), 'Mozilla Firefox/profile/cookies.sqlite')) or \ 759 | glob.glob(os.path.join(os.environ.get('PROGRAMFILES(X86)', ''), 'Mozilla Firefox/profile/cookies.sqlite')) or \ 760 | glob.glob(os.path.expanduser('~/.mozilla/firefox/*.default/cookies.sqlite')) or \ 761 | glob.glob(os.path.expanduser(r'~\AppData\Roaming\Mozilla\Firefox\Profiles\*.default\cookies.sqlite')))[0] 762 | except IndexError: 763 | raise WebScrapingError('Can not find filefox cookie file') 764 | 765 | # copy firefox cookie file locally to avoid locking problems 766 | open(tmp_sqlite_file, 'wb').write(open(file, 'rb').read()) 767 | con = sqlite3.connect(tmp_sqlite_file) 768 | cur = con.cursor() 769 | cur.execute('select host, path, isSecure, expiry, name, value from moz_cookies') 770 | 771 | # create standard cookies file that can be interpreted by cookie jar 772 | fp = open(tmp_cookie_file, 'w') 773 | fp.write('# Netscape HTTP Cookie File\n') 774 | fp.write('# http://www.netscape.com/newsref/std/cookie_spec.html\n') 775 | fp.write('# This is a generated file! 
Do not edit.\n') 776 | ftstr = ['FALSE', 'TRUE'] 777 | for item in cur.fetchall(): 778 | row = '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (item[0], ftstr[item[0].startswith('.')], item[1], ftstr[item[2]], item[3], item[4], item[5]) 779 | fp.write(row) 780 | 781 | # session cookies are saved into sessionstore.js 782 | session_cookie_path = os.path.join(os.path.dirname(file), 'sessionstore.js') 783 | if os.path.exists(session_cookie_path): 784 | try: 785 | json_data = json.loads(open(session_cookie_path, 'rb').read().strip('()')) 786 | except Exception, e: 787 | print str(e) 788 | else: 789 | ftstr = ['FALSE', 'TRUE'] 790 | if 'windows' in json_data: 791 | for window in json_data['windows']: 792 | if 'cookies' in window: 793 | for cookie in window['cookies']: 794 | row = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (cookie.get('host', ''), ftstr[cookie.get('host', '').startswith('.')], \ 795 | cookie.get('path', ''), False, str(int(time.time()) + 3600 * 24 * 7), \ 796 | cookie.get('name', ''), cookie.get('value', '')) 797 | fp.write(row) 798 | 799 | fp.close() 800 | # close the connection before delete the sqlite file 801 | con.close() 802 | 803 | cookie_jar = cookielib.MozillaCookieJar() 804 | cookie_jar.load(tmp_cookie_file) 805 | 806 | # remove temporary files 807 | os.remove(tmp_sqlite_file) 808 | os.remove(tmp_cookie_file) 809 | return cookie_jar 810 | 811 | 812 | def build_opener(cj=None): 813 | if cj is None: 814 | cj = cookielib.CookieJar() 815 | return urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) 816 | 817 | 818 | def start_threads(fn, num_threads=20, args=(), wait=True): 819 | """Shortcut to start these threads with given args and wait for all to finish 820 | """ 821 | threads = [threading.Thread(target=fn, args=args) for i in range(num_threads)] 822 | # Start threads one by one 823 | for thread in threads: 824 | thread.start() 825 | # Wait for all threads to finish 826 | if wait: 827 | for thread in threads: 828 | thread.join() 829 | 830 | 831 | class ConsoleHandler(logging.StreamHandler): 832 | """Log to stderr for errors else stdout 833 | """ 834 | def __init__(self): 835 | logging.StreamHandler.__init__(self) 836 | self.stream = None 837 | 838 | def emit(self, record): 839 | if record.levelno >= logging.ERROR: 840 | self.stream = sys.stderr 841 | else: 842 | self.stream = sys.stdout 843 | logging.StreamHandler.emit(self, record) 844 | 845 | 846 | def get_logger(output_file, level=settings.log_level, maxbytes=0): 847 | """Create a logger instance 848 | 849 | output_file: 850 | file where to save the log 851 | level: 852 | the minimum logging level to save 853 | maxbytes: 854 | the maxbytes allowed for the log file size. 0 means no limit. 
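    Illustrative usage (the log file name is just an example):

        logger = get_logger('crawl.log', maxbytes=2*1024*1024)
        logger.info('saved to crawl.log')
        logger.error('errors also go to stderr via the console handler')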
855 | """ 856 | logger = logging.getLogger(output_file) 857 | # avoid duplicate handlers 858 | if not logger.handlers: 859 | logger.setLevel(logging.DEBUG) 860 | try: 861 | if not maxbytes: 862 | file_handler = logging.FileHandler(output_file) 863 | else: 864 | file_handler = logging.handlers.RotatingFileHandler(output_file, maxBytes=maxbytes) 865 | except IOError: 866 | pass # can not write file 867 | else: 868 | file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) 869 | logger.addHandler(file_handler) 870 | 871 | console_handler = ConsoleHandler() 872 | console_handler.setLevel(level) 873 | logger.addHandler(console_handler) 874 | return logger 875 | logger = get_logger(settings.log_file, maxbytes=2*1024*1024*1024) 876 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 
63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Webscraping.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Webscraping.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Webscraping" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Webscraping" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Webscraping documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Dec 28 09:34:47 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.viewcode'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'webscraping' 44 | copyright = u'2012, Richard Penman' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 
87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'webscrapingdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 
180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 'Webscraping.tex', u'Webscraping Documentation', 187 | u'Richard Penman', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 215 | man_pages = [ 216 | ('index', 'webscraping', u'Webscraping Documentation', 217 | [u'Richard Penman'], 1) 218 | ] 219 | 220 | # If true, show URL addresses after external links. 221 | #man_show_urls = False 222 | 223 | 224 | # -- Options for Texinfo output ------------------------------------------------ 225 | 226 | # Grouping the document tree into Texinfo files. List of tuples 227 | # (source start file, target name, title, author, 228 | # dir menu entry, description, category) 229 | texinfo_documents = [ 230 | ('index', 'Webscraping', u'Webscraping Documentation', 231 | u'Richard Penman', 'Webscraping', 'One line description of project.', 232 | 'Miscellaneous'), 233 | ] 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #texinfo_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #texinfo_domain_indices = True 240 | 241 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 242 | #texinfo_show_urls = 'footnote' 243 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | =========== 3 | 4 | 5 | Simple extraction 6 | ----------------- 7 | 8 | Except project title from the Google Code page: 9 | 10 | .. code-block:: python 11 | 12 | from webscraping import download, xpath 13 | D = download.Download() 14 | # download and cache the Google Code webpage 15 | html = D.get('http://code.google.com/p/webscraping') 16 | # use xpath to extract the project title 17 | project_title = xpath.get(html, '//div[@id="pname"]/a/span') 18 | 19 | 20 | Blog scraper 21 | ------------ 22 | 23 | Scrape all articles from a blog 24 | 25 | .. code-block:: python 26 | 27 | import itertools 28 | import urlparse 29 | from webscraping import common, download, xpath 30 | 31 | DOMAIN = ... 
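    # NOTE: DOMAIN is deliberately elided in this example - set it to the blog's base URL before running.
    # The loop below pages through each category with itertools.count() and stops a category
    # once a page yields no article links that have not already been seen.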
32 | writer = common.UnicodeWriter('articles.csv') 33 | writer.writerow(['Title', 'Num reads', 'URL']) 34 | seen_urls = set() # track which articles URL's already seen, to prevent duplicates 35 | D = download.Download() 36 | 37 | # iterate each of the categories 38 | for category_link in ('/developer/knowledge-base?page=%d', '/developer/articles?page=%d'): 39 | # iterate the pages of a category 40 | for page in itertools.count(): 41 | category_html = D.get(urlparse.urljoin(DOMAIN, category_link % page)) 42 | article_links = xpath.search(category_html, '//div[@class="morelink"]/a/@href') 43 | num_new_articles = 0 44 | for article_link in article_links: 45 | # scrape each article 46 | url = urlparse.urljoin(DOMAIN, article_link) 47 | if url not in seen_urls: 48 | num_new_articles += 1 49 | seen_urls.add(url) 50 | html = D.get(url) 51 | title = xpath.get(html, '//div[@class="feed-header-wrap"]/h2') 52 | num_reads = xpath.get(html, '//li[@class="statistics_counter last"]/span').replace(' reads', '') 53 | row = title, num_reads, url 54 | writer.writerow(row) 55 | if num_new_articles == 0: 56 | break # have found all articles for this category 57 | 58 | 59 | Business directory threaded scraper 60 | ----------------------------------- 61 | 62 | Scrape all businesses from this popular directory 63 | 64 | .. code-block:: python 65 | 66 | import csv 67 | import re 68 | import string 69 | from webscraping import common, download, xpath 70 | 71 | DOMAIN = ... 72 | 73 | class BusinessDirectory: 74 | def __init__(self, output_file='businesses.csv'): 75 | self.writer = common.UnicodeWriter(output_file) 76 | self.writer.writerow(['Name', 'Address']) 77 | 78 | def __call__(self, D, url, html): 79 | urls = [] 80 | if url == DOMAIN: 81 | # crawl the index pages 82 | urls = [DOMAIN + '/atoz/%s.html' % letter for letter in string.uppercase + '#'] 83 | elif re.search('/atoz/\w\.html', url): 84 | # crawl the categories 85 | urls = [DOMAIN + link for link in xpath.search(html, '//div[@id="partitionContainer"]//a/@href')] 86 | elif re.search('/atoz/\w/\d+\.html', url): 87 | # crawl the businesses 88 | urls = [DOMAIN + link for link in xpath.search(html, '//div[@id="listingsContainer"]//a/@href')] 89 | else: 90 | # scrape business details 91 | name = xpath.get(html, '//h1[@class="listingName"]') 92 | address = xpath.get(html, '//span[@class="listingAddressText"]') 93 | row = name, address 94 | self.writer.writerow(row) 95 | return urls 96 | 97 | download.threaded_get(url=DOMAIN, proxies=proxies, cb=BusinessDirectory()) 98 | 99 | 100 | Daily deal threaded scraper 101 | --------------------------- 102 | 103 | Scrape all deals from a popular daily deal website: 104 | 105 | .. code-block:: python 106 | 107 | import re 108 | import csv 109 | import urlparse 110 | from webscraping import common, download, xpath 111 | 112 | 113 | DOMAIN = ... 
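    # NOTE: DOMAIN is deliberately elided in this example - set it to the deal site's home page before running.
    # threaded_get() calls the daily_deal() callback below after every download: the home page yields
    # the city deal pages, each city page yields deal links, and each deal page is written to the CSV.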
114 | writer = csv.writer(open('daily_deals.csv', 'w')) 115 | writer.writerow(['Company', 'Address', 'Website', 'Email']) 116 | 117 | def daily_deal(D, url, html): 118 | """This callback is called after each download 119 | """ 120 | if url == DOMAIN: 121 | # first download - get all the city deal pages 122 | links = [link.replace('/deals/', '/all-deals/') for link in xpath.search(html, '//a[@class="jCityLink"]/@href')] 123 | elif '/all-deals/' in url: 124 | # city page downloaded - get all the deals 125 | links = re.findall('"dealPermaLink":"(.*?)"', html) 126 | else: 127 | # deal page downloaded - extract the details 128 | company = xpath.get(html, '//div[@class="merchantContact"]/h2') 129 | website = xpath.get(html, '//div[@class="merchantContact"]/a/@href') 130 | address = common.unescape(xpath.get(html, '//div[@class="merchantContact"]/text()')).replace('Returns:', '').strip() 131 | if website: 132 | # crawl website for contact email 133 | email = '\n'.join(D.get_emails(website)) 134 | else: 135 | email = None 136 | row = company, address, website, email 137 | # write deal details to CSV 138 | writer.writerow(row) 139 | links = [] 140 | 141 | return [urlparse.urljoin(DOMAIN, link) for link in links] 142 | 143 | # start the crawler 144 | download.threaded_get(url=DOMAIN, proxy_file='proxies.txt', cb=daily_deal, num_retries=1) 145 | 146 | 147 | Navigate a website 148 | ------------------ 149 | 150 | Use webkit to navigate and interact with a website: 151 | 152 | .. code-block:: python 153 | 154 | from webscraping import webkit 155 | w = webkit.WebkitBrowser(gui=True) 156 | # load webpage 157 | w.get('http://duckduckgo.com') 158 | # fill search textbox 159 | w.fill('input[id=search_form_input_homepage]', 'webscraping') 160 | # take screenshot of browser 161 | w.screenshot('duckduckgo_search.jpg') 162 | # click search button 163 | w.click('input[id=search_button_homepage]') 164 | # wait on results page 165 | w.wait(10) 166 | # take another screenshot 167 | w.screenshot('duckduckgo_results.jpg') 168 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Webscraping documentation master file, created by 2 | sphinx-quickstart on Fri Dec 28 09:34:47 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | webscraping documentation 7 | ======================================= 8 | 9 | .. toctree:: 10 | 11 | introduction 12 | examples 13 | reference 14 | 15 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | Background 5 | ---------- 6 | 7 | For the last few years I have been specializing at web scraping and collected what I found useful into this library. 8 | 9 | All code is pure Python and has been run across multiple Linux servers, Windows machines, as well as `Google App Engine `_. 10 | 11 | 12 | Install 13 | ------- 14 | 15 | Some options to install the webscraping package: 16 | 17 | #. Checkout the repository: *hg clone https://code.google.com/p/webscraping/* 18 | #. Download the zip: https://pypi.python.org/pypi/webscraping/ 19 | #. Install with pypi: *pip install webscraping* 20 | 21 | The only dependency is python 2.5 or higher. 
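A quick check that the install worked (illustrative; constructing a Download object will also create the default cache file in the working directory):

.. code-block:: python

    from webscraping import download, xpath
    D = download.Download()
    print D.settings.delay  # default delay between requests to the same domain, in seconds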
22 | 23 | 24 | License 25 | ------- 26 | 27 | This code is licensed under the `LGPL license `_. 28 | 29 | 30 | Contact 31 | ------- 32 | 33 | richard@webscraping.com 34 | -------------------------------------------------------------------------------- /docs/reference.rst: -------------------------------------------------------------------------------- 1 | .. Webscraping documentation master file, created by 2 | sphinx-quickstart on Fri Dec 28 09:34:47 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Reference 7 | ======================================= 8 | 9 | 10 | :mod:`adt` Module 11 | ----------------- 12 | .. automodule:: webscraping.adt 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | 18 | :mod:`alg` Module 19 | ----------------- 20 | .. automodule:: webscraping.alg 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | :mod:`common` Module 26 | -------------------- 27 | .. automodule:: webscraping.common 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | :mod:`download` Module 33 | ---------------------- 34 | .. automodule:: webscraping.download 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | :mod:`pdict` Module 40 | ------------------- 41 | .. automodule:: webscraping.pdict 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | :mod:`webkit` Module 47 | -------------------- 48 | .. automodule:: webscraping.webkit 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | :mod:`xpath` Module 54 | ------------------- 55 | .. automodule:: webscraping.xpath 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'Helper methods to download and crawl web content using threads' 2 | 3 | import os 4 | import re 5 | import sys 6 | import copy 7 | import collections 8 | import random 9 | import urllib 10 | import urllib2 11 | import urlparse 12 | import StringIO 13 | import time 14 | import datetime 15 | import subprocess 16 | import socket 17 | import gzip 18 | import zlib 19 | import thread 20 | import threading 21 | import contextlib 22 | import tempfile 23 | try: 24 | import hashlib 25 | except ImportError: 26 | import md5 as hashlib 27 | try: 28 | import cPickle as pickle 29 | except: 30 | import pickle 31 | try: 32 | import json 33 | except ImportError: 34 | import simplejson as json 35 | 36 | import adt 37 | import alg 38 | import common 39 | import settings 40 | try: 41 | import pdict 42 | except ImportError: 43 | # sqlite not installed 44 | pdict = None 45 | 46 | SLEEP_TIME = 0.1 # how long to sleep when waiting for network activity 47 | DEFAULT_PRIORITY = 1 # default queue priority 48 | 49 | 50 | 51 | class ProxyPerformance: 52 | """Track performance of proxies 53 | If 10 errors in a row that other proxies could handle then need to remove 54 | """ 55 | def __init__(self): 56 | self.proxy_errors = collections.defaultdict(int) 57 | 58 | def success(self, proxy): 59 | """Successful download - so clear error count 60 | """ 61 | self.proxy_errors[proxy] = 0 62 | 63 | def error(self, proxy): 64 | """Add to error count and returns number of consecutive errors for this proxy 65 | """ 66 | if proxy: 67 | self.proxy_errors[proxy] += 1 68 | return self.proxy_errors[proxy] 69 | 70 | 71 | 72 | class Download: 73 | 
""" 74 | cache: 75 | a pdict object to use for the cache 76 | cache_file: 77 | filename to store cached data 78 | read_cache: 79 | whether to read from the cache 80 | write_cache: 81 | whether to write to the cache 82 | use_network: 83 | whether to download content not in the cache 84 | user_agent 85 | the User Agent to download content with 86 | timeout: 87 | the maximum amount of time to wait for http response 88 | delay: 89 | the minimum amount of time (in seconds) to wait after downloading content from a domain per proxy 90 | proxy_file: 91 | a filename to read proxies from 92 | max_proxy_errors: 93 | the maximum number of consecutive errors allowed per proxy before discarding 94 | an error is only counted if another proxy is able to successfully download the URL 95 | set to None to disable 96 | proxies: 97 | a list of proxies to cycle through when downloading content 98 | opener: 99 | an optional opener to use instead of using urllib2 directly 100 | headers: 101 | the headers to include in the request 102 | data: 103 | what to post at the URL 104 | if None (default) then a GET request will be made 105 | num_retries: 106 | how many times to try downloading a URL when get an error 107 | num_redirects: 108 | how many times the URL is allowed to be redirected, to avoid infinite loop 109 | force_html: 110 | whether to download non-text data 111 | force_ascii: 112 | whether to only return ascii characters 113 | max_size: 114 | maximum number of bytes that will be downloaded, or None to disable 115 | default: 116 | what to return when no content can be downloaded 117 | pattern: 118 | a regular expression that the downloaded HTML has to match to be considered a valid download 119 | acceptable_errors: 120 | a list contains all acceptable HTTP codes, don't try downloading for them e.g. no need to retry for 404 error 121 | throttle_additional_key: 122 | Sometimes the website limits the request only by session(rather than IP), we can use this parameter to keep each thread delaying independently 123 | keep_ip_ua: 124 | If it's True, one proxy IP will keep using the same User-agent, otherwise will use a random User-agent for each request. 
125 | ssl_context: 126 | provide ssl context argument to urlopen 127 | """ 128 | 129 | def __init__(self, cache=None, cache_file=None, read_cache=True, write_cache=True, cache_expires=None, use_network=True, 130 | user_agent=None, timeout=30, delay=5, proxies=None, proxy_file=None, max_proxy_errors=5, 131 | opener=None, headers=None, data=None, num_retries=0, num_redirects=0, 132 | force_html=False, force_ascii=False, max_size=None, default='', pattern=None, acceptable_errors=None, 133 | throttle_additional_key=None, keep_ip_ua=True, ssl_context=None, **kwargs): 134 | socket.setdefaulttimeout(timeout) 135 | need_cache = read_cache or write_cache 136 | if pdict and need_cache: 137 | cache_file = cache_file or settings.cache_file 138 | self.cache = cache or pdict.PersistentDict(cache_file, expires=cache_expires) 139 | else: 140 | self.cache = None 141 | if need_cache: 142 | common.logger.warning('Cache disabled because could not import pdict') 143 | 144 | self.settings = adt.Bag( 145 | read_cache = read_cache, 146 | write_cache = write_cache, 147 | use_network = use_network, 148 | delay = delay, 149 | proxies = (common.read_list(proxy_file) if proxy_file else []) or proxies or [], 150 | proxy_file = proxy_file, 151 | max_proxy_errors = max_proxy_errors, 152 | user_agent = user_agent, 153 | opener = opener, 154 | headers = headers, 155 | data = data, 156 | num_retries = num_retries, 157 | num_redirects = num_redirects, 158 | force_html = force_html, 159 | force_ascii = force_ascii, 160 | max_size = max_size, 161 | default = default, 162 | pattern = pattern, 163 | keep_ip_ua = keep_ip_ua, 164 | acceptable_errors = acceptable_errors, 165 | ssl_context = ssl_context 166 | ) 167 | self.last_load_time = self.last_mtime = time.time() 168 | self.num_downloads = self.num_errors = 0 169 | self.throttle_additional_key = throttle_additional_key 170 | 171 | 172 | proxy_performance = ProxyPerformance() 173 | def get(self, url, **kwargs): 174 | """Download this URL and return the HTML. 175 | By default HTML is cached so only have to download once. 
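For example (illustrative; the URL is a placeholder):

    D = Download(delay=3, num_retries=2)
    html = D.get('http://example.com')  # downloaded and written to the cache
    html = D.get('http://example.com')  # second call is served from the cache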
176 | 177 | url: 178 | what to download 179 | kwargs: 180 | override any of the arguments passed to constructor 181 | """ 182 | self.reload_proxies() 183 | self.proxy = None # the current proxy 184 | self.final_url = None # for tracking redirects 185 | self.response_code = '' # keep response code 186 | self.response_headers = {} # keep response headers 187 | self.downloading_error = None # keep downloading error 188 | self.num_downloads = self.num_errors = 0 # track the number of downloads made 189 | 190 | # update settings with any local overrides 191 | settings = adt.Bag(self.settings) 192 | settings.update(kwargs) 193 | # check cache for whether this content is already downloaded 194 | key = self.get_key(url, settings.data) 195 | if self.cache and settings.read_cache: 196 | try: 197 | html = self.cache[key] 198 | if self.invalid_response(html, settings.pattern): 199 | # invalid result from download 200 | html = None 201 | except KeyError: 202 | pass # have not downloaded yet 203 | else: 204 | if not html and settings.num_retries > 0: 205 | try: 206 | meta = self.cache.meta(key) 207 | except KeyError: 208 | pass 209 | else: 210 | if meta.get('status', '').startswith('404'): 211 | # don't retry 4XX errors 212 | common.logger.debug('Ignoring URL with previous status {}'.format(meta['status'])) 213 | return settings.default 214 | # try downloading again 215 | common.logger.debug('Redownloading') 216 | settings.num_retries -= 1 217 | else: 218 | # return previously downloaded content 219 | return html or settings.default 220 | if not settings.use_network: 221 | # only want previously cached content 222 | return settings.default 223 | 224 | html = None 225 | failed_proxies = set() # record which proxies failed to download for this URL 226 | # attempt downloading content at URL 227 | while settings.num_retries >= 0 and html is None: 228 | settings.num_retries -= 1 229 | if settings.proxy: 230 | self.proxy = settings.proxy 231 | else: 232 | self.proxy = self.get_proxy(settings.proxies) 233 | # crawl slowly for each domain to reduce risk of being blocked 234 | self.throttle(url, delay=settings.delay, proxy=self.proxy) 235 | html = self.fetch(url, headers=settings.headers, data=settings.data, proxy=self.proxy, user_agent=settings.user_agent, opener=settings.opener, pattern=settings.pattern, max_size=settings.max_size, ssl_context=settings.ssl_context) 236 | 237 | if html: 238 | # successfully downloaded 239 | self.num_downloads += 1 240 | if settings.max_proxy_errors is not None: 241 | Download.proxy_performance.success(self.proxy) 242 | # record which proxies failed for this download 243 | for proxy in failed_proxies: 244 | if Download.proxy_performance.error(self.proxy) > settings.max_proxy_errors: 245 | # this proxy has had too many errors so remove 246 | common.logger.warning('Removing unstable proxy from list after %d consecutive errors: %s' % (settings.max_proxy_errors, self.proxy)) 247 | settings.proxies.remove(self.proxy) 248 | else: 249 | # download failed - try again 250 | self.num_errors += 1 251 | failed_proxies.add(self.proxy) 252 | 253 | 254 | if html: 255 | if settings.num_redirects > 0: 256 | # allowed to redirect 257 | redirect_url = get_redirect(url=url, html=html) 258 | if redirect_url: 259 | # found a redirection 260 | common.logger.debug('%s redirecting to %s' % (url, redirect_url)) 261 | settings.num_redirects -= 1 262 | html = self.get(redirect_url, **settings) or '' 263 | # make relative links absolute so will still work after redirect 264 | relative_re = 
re.compile('(<\s*a[^>]+href\s*=\s*["\']?)(?!http)([^"\'>]+)', re.IGNORECASE) 265 | try: 266 | html = relative_re.sub(lambda m: m.group(1) + urlparse.urljoin(url, m.group(2)), html) 267 | except UnicodeDecodeError: 268 | pass 269 | html = self._clean_content(html=html, max_size=settings.max_size, force_html=settings.force_html, force_ascii=settings.force_ascii) 270 | 271 | if self.cache and settings.write_cache: 272 | # cache results 273 | self.cache[key] = html 274 | meta = {} 275 | if self.final_url and url != self.final_url: 276 | # cache what URL was redirected to 277 | meta['url'] = self.final_url 278 | if self.response_code and self.response_code != '200': 279 | meta['status'] = self.response_code 280 | if meta: 281 | self.cache.meta(key, meta) 282 | 283 | # return default if no content 284 | return html or settings.default 285 | 286 | 287 | def exists(self, url): 288 | """Do a HEAD request to check whether webpage exists 289 | """ 290 | success = False 291 | key = self.get_key(url, 'head') 292 | try: 293 | if self.cache and self.settings.read_cache: 294 | success = self.cache[key] 295 | else: 296 | raise KeyError('No cache') 297 | except KeyError: 298 | # have not downloaded yet 299 | request = urllib2.Request(url) 300 | request.get_method = lambda : 'HEAD' 301 | try: 302 | response = urllib2.urlopen(request, context=self.settings.ssl_context) 303 | except Exception, e: 304 | common.logger.warning('HEAD check miss: %s %s' % (url, e)) 305 | else: 306 | success = True 307 | common.logger.info('HEAD check hit: %s' % url) 308 | if self.cache: 309 | self.cache[key] = success 310 | return success 311 | 312 | 313 | def get_key(self, url, data=None): 314 | """Create key for caching this request 315 | """ 316 | key = url 317 | if data: 318 | key += ' ' + str(data) 319 | return key 320 | 321 | 322 | def _clean_content(self, html, max_size, force_html, force_ascii): 323 | """Clean up downloaded content 324 | 325 | html: 326 | the input to clean 327 | max_size: 328 | the maximum size of data allowed 329 | force_html: 330 | content must be HTML 331 | force_ascii: 332 | content must be ASCII 333 | """ 334 | if max_size is not None and len(html) > max_size: 335 | common.logger.info('Webpage is too big: %s' % len(html)) 336 | html = '' # too big to store 337 | elif force_html and not common.is_html(html): 338 | common.logger.info('Webpage is not html') 339 | html = '' # non-html content 340 | elif force_ascii: 341 | html = common.to_ascii(html) # remove non-ascii characters 342 | return html 343 | 344 | 345 | def get_proxy(self, proxies=None): 346 | """Return random proxy if available 347 | """ 348 | if proxies: 349 | proxy = random.choice(proxies) 350 | elif self.settings.proxies: 351 | # select next available proxy 352 | proxy = random.choice(self.settings.proxies) 353 | else: 354 | proxy = None 355 | return proxy 356 | 357 | 358 | # cache the user agent used for each proxy 359 | proxy_agents = {} 360 | def get_user_agent(self, proxy): 361 | """Get user agent for this proxy 362 | """ 363 | if self.settings.keep_ip_ua and proxy in Download.proxy_agents: 364 | # have used this proxy before so return same user agent 365 | user_agent = Download.proxy_agents[proxy] 366 | else: 367 | # assign random user agent to this proxy 368 | user_agent = alg.rand_agent() 369 | Download.proxy_agents[proxy] = user_agent 370 | return user_agent 371 | 372 | 373 | def invalid_response(self, html, pattern): 374 | """Return whether the response contains a regex error pattern 375 | """ 376 | return html is None or (pattern 
and not re.compile(pattern, re.DOTALL | re.IGNORECASE).search(html)) 377 | 378 | 379 | def fetch(self, url, headers=None, data=None, proxy=None, user_agent=None, opener=None, pattern=None, max_size=None, ssl_context=None): 380 | """Simply download the url and return the content 381 | """ 382 | self.error_content = None 383 | # create opener with headers 384 | if not opener: 385 | opener = common.build_opener() 386 | if proxy: 387 | # avoid duplicate ProxyHandler 388 | opener.add_handler(urllib2.ProxyHandler({urlparse.urlparse(url).scheme : proxy})) 389 | if ssl_context is not None: 390 | # add ssl context XXX does not work 391 | https_handler = urllib2.HTTPSHandler(context=ssl_context) 392 | opener.add_handler(https_handler) 393 | 394 | headers = headers or {} 395 | default_headers = settings.default_headers.copy() 396 | default_headers['User-Agent'] = user_agent or self.get_user_agent(proxy) 397 | if not max_size: 398 | default_headers['Accept-Encoding'] = 'gzip, deflate' 399 | lowercase_headers = [name.lower() for name in headers.keys()] 400 | for name, value in default_headers.items(): 401 | if name.lower() not in lowercase_headers: 402 | if name == 'Referer': 403 | value = url 404 | headers[name] = value 405 | if 'Host' in headers and not headers['Host']: 406 | del headers['Host'] # some websites raise an error when host is included 407 | 408 | if isinstance(data, dict): 409 | # encode data for POST 410 | data = urllib.urlencode(sorted(data.items())) 411 | common.logger.info('Downloading %s %s' % (url, data or '')) 412 | try: 413 | request = urllib2.Request(urllib.quote(url, safe='/:?&+=%()'), data, headers) 414 | with contextlib.closing(opener.open(request)) as response: 415 | if max_size is not None: 416 | content = response.read(max_size) 417 | else: 418 | content = response.read() 419 | if response.headers.get('Content-Encoding') == 'gzip': 420 | # data came back gzip-compressed so decompress it 421 | content = gzip.GzipFile(fileobj=StringIO.StringIO(content)).read() 422 | elif response.headers.get('Content-Encoding') == 'deflate': 423 | content = zlib.decompress(content) 424 | self.final_url = response.url # store where redirected to 425 | if self.invalid_response(content, pattern): 426 | # invalid result from download 427 | content = None 428 | common.logger.warning('Content did not match expected pattern: %s' % url) 429 | self.response_code = str(response.code) 430 | self.response_headers = dict(response.headers) 431 | except Exception, e: 432 | self.downloading_error = str(e) 433 | if hasattr(e, 'code'): 434 | self.response_code = str(e.code) 435 | else: 436 | m = re.search('\D(\d\d\d)\D', str(e)) 437 | if m: 438 | self.response_code = m.groups()[0] 439 | 440 | if hasattr(e, 'read'): 441 | try: 442 | self.error_content = e.read() 443 | except Exception, e: 444 | self.error_content = '' 445 | # so many kinds of errors are possible here so just catch them all 446 | common.logger.warning(u'Download error: {} {}'.format(url, self.response_code)) 447 | if self.settings.acceptable_errors and self.response_code in self.settings.acceptable_errors: 448 | content, self.final_url = self.settings.default, url 449 | else: 450 | content, self.final_url = None, url 451 | return content 452 | 453 | 454 | _domains = adt.HashDict() 455 | def throttle(self, url, delay, proxy=None, variance=0.5): 456 | """Delay a minimum time for each domain per proxy by storing last access time 457 | 458 | url 459 | what intend to download 460 | delay 461 | the minimum amount of time (in seconds) to wait after 
downloading content from this domain 462 | proxy 463 | the proxy to download through 464 | variance 465 | the amount of randomness in delay, 0-1 466 | """ 467 | if delay > 0: 468 | key = ':'.join([str(proxy), self.throttle_additional_key or '', common.get_domain(url)]) 469 | if key in Download._domains: 470 | while datetime.datetime.now() < Download._domains.get(key): 471 | time.sleep(SLEEP_TIME) 472 | # update domain timestamp to when can query next 473 | Download._domains[key] = datetime.datetime.now() + datetime.timedelta(seconds=delay * (1 + variance * (random.random() - 0.5))) 474 | 475 | 476 | def reload_proxies(self, timeout=600): 477 | """Check periodically for updated proxy file 478 | 479 | timeout: 480 | the number of seconds before check for updated proxies 481 | """ 482 | if self.settings.proxy_file and time.time() - self.last_load_time > timeout: 483 | self.last_load_time = time.time() 484 | if os.path.exists(self.settings.proxy_file): 485 | if os.stat(self.settings.proxy_file).st_mtime != self.last_mtime: 486 | self.last_mtime = os.stat(self.settings.proxy_file).st_mtime 487 | self.settings.proxies = common.read_list(self.settings.proxy_file) 488 | common.logger.debug('Reloaded proxies from updated file.') 489 | 490 | 491 | def geocode(self, address, delay=5, read_cache=True, num_retries=1, language=None, api_key=None): 492 | gm = GoogleMaps(self) 493 | return gm.geocode(address, delay, read_cache, num_retries, language, api_key=api_key) 494 | 495 | def places(self, api_key, keyword, latitude, longitude, radius=10000, delay=5, num_retries=1, language='en'): 496 | gm = GoogleMaps(self) 497 | return gm.places(api_key, keyword, latitude, longitude, radius, delay, num_retries, language) 498 | 499 | 500 | def get_emails(self, website, max_depth=1, max_urls=10, max_emails=1): 501 | return DataCrawler(self, alg.extract_emails).find(website, max_depth, max_urls, max_emails) 502 | 503 | def get_phones(self, website, max_depth=1, max_urls=10, max_phones=1): 504 | return DataCrawler(self, alg.extract_phones).find(website, max_depth, max_urls, max_phones) 505 | 506 | 507 | def gcache_get(self, url, **kwargs): 508 | """Download webpage via google cache 509 | """ 510 | return self.get('http://www.google.com/search?&q=cache%3A' + urllib.quote(url), **kwargs) 511 | 512 | 513 | def gtrans_get(self, url, **kwargs): 514 | """Download webpage via Google Translation 515 | """ 516 | url = 'http://translate.google.com/translate?sl=nl&anno=2&u=%s' % urllib.quote(url) 517 | html = self.get(url, **kwargs) 518 | if html: 519 | m = re.compile(r']*src="([^"]+)"[^<>]*name=c', re.DOTALL|re.IGNORECASE).search(html) 520 | if m: 521 | frame_src = urlparse.urljoin(url, common.unescape(m.groups()[0].strip())) 522 | # force to check redirect here 523 | html = self.get(frame_src, **kwargs) 524 | if html: 525 | # remove google translations content 526 | return re.compile(r'', re.DOTALL|re.IGNORECASE).sub('', html) 527 | return self.settings.default 528 | 529 | 530 | def archive_get(self, url, timestamp=None, **kwargs): 531 | """Download webpage via the archive.org cache 532 | 533 | url: 534 | The webpage to download 535 | timestamp: 536 | When passed a datetime object will download the cached webpage closest to this date, 537 | If passed a string will use this as timestamp 538 | Else when None (default) will download the most recent archived page. 
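Example (illustrative):

    import datetime
    html = D.archive_get('http://example.com', timestamp=datetime.datetime(2012, 1, 1))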
539 | """ 540 | if hasattr(timestamp, 'strftime'): 541 | formatted_ts = timestamp.strftime('%Y%m%d%H%M%S') 542 | elif isinstance(timestamp, basestring): 543 | formatted_ts = timestamp 544 | else: 545 | formatted_ts = '2' # will return most recent archive 546 | html = self.get('https://web.archive.org/web/%s/%s' % (formatted_ts, url), **kwargs) 547 | if not html and timestamp is None: 548 | # not cached, so get live version 549 | html = self.get('http://liveweb.archive.org/' + url) 550 | match = re.search('

.*?', re.DOTALL).sub('', html) 558 | html = re.compile('', re.DOTALL).sub('', html) 559 | html = re.sub('/web/\d+/', '', html) 560 | return html 561 | 562 | 563 | def whois(self, url, timeout=10): 564 | """Return text of this whois query 565 | """ 566 | domain = common.get_domain(url) 567 | if domain: 568 | text = '' 569 | key = 'whois_%s' % domain 570 | try: 571 | if self.cache: 572 | text = self.cache[key] 573 | else: 574 | raise KeyError() 575 | except KeyError: 576 | # try local whois command 577 | r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE) 578 | start = time.time() 579 | while r.poll() is None: 580 | time.sleep(0.5) 581 | if time.time() - start > timeout: 582 | try: 583 | r.kill() 584 | except Exception, e: 585 | pass 586 | break 587 | if r.poll() != 1: 588 | text = r.communicate()[0] 589 | 590 | if '@' in text: 591 | if self.cache: 592 | self.cache[key] = text 593 | return text 594 | 595 | 596 | def save_as(self, url, filename=None, save_dir='images', override=False): 597 | """Download url and save to disk if does not already exist 598 | 599 | url: 600 | the webpage to download 601 | filename: 602 | output file to save to if not set then will save to file based on URL 603 | override: 604 | whether to download if output file already exists 605 | """ 606 | save_path = os.path.join(save_dir, filename or '%s.%s' % (hashlib.md5(url).hexdigest(), common.get_extension(url))) 607 | if not os.path.exists(save_path) or override: 608 | # need to download 609 | _bytes = self.get(url, num_redirects=0, write_cache=False) 610 | if _bytes: 611 | if not os.path.exists(save_dir): 612 | os.makedirs(save_dir) 613 | open(save_path, 'wb').write(_bytes) 614 | else: 615 | return None 616 | return save_path 617 | 618 | 619 | def get_redirect(url, html): 620 | """Check for meta redirects and return redirect URL if found 621 | """ 622 | match = re.compile(']*?url=(.*?)["\']', re.IGNORECASE).search(html) 623 | if match: 624 | return urlparse.urljoin(url, common.unescape(match.groups()[0].strip())) 625 | 626 | 627 | class GoogleMaps: 628 | def __init__(self, D): 629 | self.D = D 630 | 631 | def geocode(self, address, delay=5, read_cache=True, num_retries=1, language=None, api_key=None): 632 | """Geocode address using Google's API and return dictionary of useful fields 633 | 634 | address: 635 | what to pass to geocode API 636 | delay: 637 | how long to delay between API requests 638 | read_cache: 639 | whether to load content from cache when exists 640 | num_retries: 641 | the number of times to try downloading 642 | language: 643 | the language to set 644 | """ 645 | try: 646 | address = address.encode('utf-8') 647 | except UnicodeDecodeError: 648 | common.logger.debug('Geocode failed to parse address and needed to cast to ascii: ' + address) 649 | address = common.to_ascii(address) 650 | address = re.sub('%C2%9\d', '', urllib.quote_plus(address)) 651 | geocode_url = 'http://maps.google.com/maps/api/geocode/json?address=%s&sensor=false%s' % (address, '&language=' + language if language else '') 652 | try: 653 | # legacy data without api key 654 | geocode_html = self.D.cache[geocode_url] 655 | if geocode_html: 656 | self.D.response_code = '200' 657 | else: 658 | raise KeyError() 659 | except KeyError: 660 | geocode_url = 'https://maps.google.com/maps/api/geocode/json?address=%s&key=%s&sensor=false%s' % (address, api_key or '', '&language=' + language if language else '') 661 | geocode_html = self.D.get(geocode_url, delay=delay, read_cache=read_cache, num_retries=num_retries) 662 | 
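        # parse the JSON response; the lines below return the first result flattened by
        # parse_location(), or an empty defaultdict(str) when geocoding found nothing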
geocode_data = self.load_result(geocode_url, geocode_html) 663 | for result in geocode_data.get('results', []): 664 | return self.parse_location(result) 665 | return collections.defaultdict(str) 666 | 667 | 668 | def places(self, api_key, keyword, latitude, longitude, radius=10000, delay=5, num_retries=1, language='en'): 669 | """Search the Google Place API for this keyword and location 670 | 671 | api_key is the Google Places API key: https://developers.google.com/places/documentation/#Authentication 672 | radius around the location can be a maximum 50000 673 | 674 | Returns a list of up to 200 matching places 675 | """ 676 | search_url_template = 'https://maps.googleapis.com/maps/api/place/radarsearch/json?key={0}&location={1},{2}&radius={3}&keyword={4}&sensor=false' 677 | place_url_template = 'https://maps.googleapis.com/maps/api/place/details/json?key={0}&reference={1}&language={2}&sensor=false' 678 | 679 | search_url = search_url_template.format(api_key, latitude, longitude, radius, keyword).replace(' ', '+') 680 | search_html = self.D.get(search_url, delay=delay, num_retries=num_retries) 681 | search_results = self.load_result(search_url, search_html) 682 | 683 | place_results = [] 684 | # iterate search results 685 | for search_result in search_results.get('results', []): 686 | reference = search_result['reference'] 687 | # found a new place 688 | place_url = place_url_template.format(api_key, reference, language) 689 | place_html = self.D.get(place_url, delay=delay, num_retries=num_retries) 690 | 691 | place = self.load_result(place_url, place_html) 692 | if place: 693 | place = place['result'] 694 | result = self.parse_location(place) 695 | result['name'] = place['name'] 696 | result['categories'] = place['types'] 697 | result['phone'] = place.get('formatted_phone_number', '') 698 | result['website'] = place.get('website', '') 699 | place_results.append(result) 700 | return place_results 701 | 702 | 703 | def load_result(self, url, html): 704 | """Parse the result from API 705 | 706 | If JSON is well formed and status is OK then will return result 707 | Else will return an empty dict 708 | """ 709 | if html: 710 | try: 711 | search_data = json.loads(html) 712 | except ValueError as e: 713 | common.logger.debug(str(e)) 714 | else: 715 | status = search_data['status'] 716 | if status == 'OK': 717 | return search_data 718 | elif status == 'ZERO_RESULTS': 719 | pass 720 | elif status == 'OVER_QUERY_LIMIT': 721 | # error geocoding - try again later 722 | common.logger.info('Over query limit') 723 | self.D.cache[url] = '' 724 | elif status in ('REQUEST_DENIED', 'INVALID_REQUEST'): 725 | common.logger.info('{0}: {1}'.format(status, url)) 726 | return {} 727 | 728 | 729 | def parse_location(self, result): 730 | """Parse address data from Google's geocoding response into a more usable flat structure 731 | 732 | Example: https://developers.google.com/maps/documentation/geocoding/#JSON 733 | """ 734 | results = collections.defaultdict(str) 735 | for e in result['address_components']: 736 | # parse address compenents into flat layer 737 | types, value, abbrev = e['types'], e['long_name'], e['short_name'] 738 | if 'street_number' in types: 739 | results['number'] = value 740 | elif 'route' in types: 741 | results['street'] = value 742 | elif 'postal_code' in types: 743 | results['postcode'] = value 744 | elif 'locality' in types: 745 | results['suburb'] = value 746 | elif 'administrative_area_level_1' in types: 747 | results['state'] = value 748 | results['state_code'] = abbrev 749 | elif 
'administrative_area_level_2' in types: 750 | results['county'] = value 751 | elif 'administrative_area_level_3' in types: 752 | results['district'] = value 753 | elif 'country' in types: 754 | results['country'] = value 755 | results['country_code'] = abbrev 756 | 757 | # extract addresses 758 | results['full_address'] = result['formatted_address'] 759 | if 'street' in results: 760 | results['address'] = (results['number'] + ' ' + results['street']).strip() 761 | 762 | results['lat'] = result['geometry']['location']['lat'] 763 | results['lng'] = result['geometry']['location']['lng'] 764 | results['types'] = result['types'] 765 | return results 766 | 767 | 768 | 769 | class StopCrawl(Exception): 770 | """Raise this exception to interrupt crawl 771 | """ 772 | pass 773 | 774 | 775 | def threaded_get(url=None, urls=None, url_iter=None, num_threads=10, dl=None, cb=None, depth=True, **kwargs): 776 | """Download these urls in parallel 777 | 778 | url: 779 | the webpage to download 780 | urls: 781 | the webpages to download 782 | num_threads: 783 | the number of threads to download urls with 784 | cb: 785 | Called after each download with the HTML of the download. 786 | The arguments are the download object, the url and the downloaded html. 787 | Whatever URLs are returned are added to the crawl queue. 788 | dl: 789 | A callback for customizing the download. 790 | Takes the download object and url and should return the HTML. 791 | depth: 792 | True for depth first search 793 | """ 794 | running = True 795 | lock = threading.Lock() 796 | def add_iter_urls(): 797 | if lock.acquire(False): 798 | for url in url_iter or []: 799 | download_queue.append(url) 800 | break 801 | lock.release() 802 | 803 | 804 | def process_queue(): 805 | """Thread for downloading webpages 806 | """ 807 | D = Download(**kwargs) 808 | 809 | while True: 810 | try: 811 | url = download_queue.pop() if depth else download_queue.popleft() 812 | 813 | except IndexError: 814 | add_iter_urls() 815 | break 816 | 817 | else: 818 | # download this url 819 | html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs) 820 | if cb: 821 | try: 822 | # use callback to process downloaded HTML 823 | result = cb(D, url, html) 824 | 825 | except StopCrawl: 826 | common.logger.info('Stopping crawl signal') 827 | download_queue.clear() # drain the queue so every worker finishes its current URL and exits 828 | 829 | except Exception: 830 | # catch any callback error to avoid losing thread 831 | common.logger.exception('\nIn callback for: ' + str(url)) 832 | 833 | else: 834 | # add these URLs to crawl queue 835 | for link in result or []: 836 | download_queue.append(link) 837 | 838 | # update the crawler state 839 | # no download or error so must have read from cache 840 | num_caches = 0 if D.num_downloads or D.num_errors else 1 841 | state.update(num_downloads=D.num_downloads, num_errors=D.num_errors, num_caches=num_caches, queue_size=len(download_queue)) 842 | 843 | download_queue = collections.deque() 844 | if urls: 845 | download_queue.extend(urls) 846 | if url: 847 | download_queue.append(url) 848 | add_iter_urls() 849 | common.logger.debug('Start new crawl') 850 | 851 | # initiate the state file with the number of URLs already in the queue 852 | state = State() 853 | state.update(queue_size=len(download_queue)) 854 | 855 | # wait for all download threads to finish 856 | threads = [] 857 | while running and (threads or download_queue): 858 | for thread in threads: 859 | if not thread.is_alive(): 860 | threads.remove(thread) 861 | while len(threads) < num_threads and download_queue: 862 | # can start more threads 863 | thread = 
threading.Thread(target=process_queue) 864 | thread.setDaemon(True) # set daemon so main thread can exit when receives ctrl-c 865 | thread.start() 866 | threads.append(thread) 867 | time.sleep(SLEEP_TIME) 868 | # save the final state after threads finish 869 | state.save() 870 | 871 | 872 | 873 | class State: 874 | """Save state of crawl to disk 875 | 876 | output_file: 877 | where to save the state 878 | timeout: 879 | how many seconds to wait between saving the state 880 | """ 881 | def __init__(self, output_file=None, timeout=10): 882 | # where to save state to 883 | self.output_file = output_file or settings.status_file 884 | # how long to wait between saving state 885 | self.timeout = timeout 886 | # track the number of downloads and errors 887 | self.num_downloads = self.num_errors = self.num_caches = self.queue_size = 0 888 | # data to save to disk 889 | self.data = {} 890 | # whether data needs to be saved to dosk 891 | self.flush = False 892 | # track time duration of crawl 893 | self.start_time = time.time() 894 | self.last_time = 0 895 | # a lock to prevent multiple threads writing at once 896 | self.lock = threading.Lock() 897 | 898 | def update(self, num_downloads=0, num_errors=0, num_caches=0, queue_size=0): 899 | """Update the state with these values 900 | 901 | num_downloads: 902 | the number of downloads completed successfully 903 | num_errors: 904 | the number of errors encountered while downloading 905 | num_caches: 906 | the number of webpages read from cache instead of downloading 907 | queue_size: 908 | the number of URL's in the queue 909 | """ 910 | self.num_downloads += num_downloads 911 | self.num_errors += num_errors 912 | self.num_caches += num_caches 913 | self.queue_size = queue_size 914 | self.data['num_downloads'] = self.num_downloads 915 | self.data['num_errors'] = self.num_errors 916 | self.data['num_caches'] = self.num_caches 917 | self.data['queue_size'] = self.queue_size 918 | 919 | if time.time() - self.last_time > self.timeout: 920 | self.lock.acquire() 921 | self.save() 922 | self.lock.release() 923 | 924 | def save(self): 925 | """Save state to disk 926 | """ 927 | self.last_time = time.time() 928 | self.data['duration_secs'] = int(self.last_time - self.start_time) 929 | self.flush = False 930 | text = json.dumps(self.data) 931 | tmp_file = '%s.%d' % (self.output_file, os.getpid()) 932 | fp = open(tmp_file, 'wb') 933 | fp.write(text) 934 | # ensure all content is written to disk 935 | fp.flush() 936 | fp.close() 937 | try: 938 | if os.name == 'nt': 939 | # on windows can not rename if file exists 940 | if os.path.exists(self.output_file): 941 | os.remove(self.output_file) 942 | # atomic copy to new location so state file is never partially written 943 | os.rename(tmp_file, self.output_file) 944 | except OSError: 945 | pass 946 | 947 | 948 | 949 | class CrawlerCallback: 950 | """Example callback to crawl a website 951 | """ 952 | def __init__(self, output_file=None, max_links=100, max_depth=1, allowed_urls='', banned_urls='^$', robots=None, crawl_existing=True): 953 | """ 954 | output_file: 955 | where to save scraped data 956 | max_links: 957 | the maximum number of links to follow per page 958 | max_depth: 959 | the maximum depth to follow links into website (use None for no limit) 960 | allowed_urls: 961 | a regex for allowed urls, defaults to all urls 962 | banned_urls: 963 | a regex for banned urls, defaults to no urls 964 | robots: 965 | RobotFileParser object to determine which urls allowed to crawl 966 | crawl_existing: 967 | sets whether to 
crawl content already downloaded previously in the cache 968 | """ 969 | self.found = adt.HashDict(int) # track depth of found URLs 970 | if output_file: 971 | self.writer = common.UnicodeWriter(output_file) 972 | else: 973 | self.writer = None 974 | self.max_links = max_links 975 | self.max_depth = max_depth 976 | self.allowed_urls = re.compile(allowed_urls) 977 | self.banned_urls = re.compile(banned_urls) 978 | self.robots = robots 979 | self.crawl_existing = crawl_existing 980 | 981 | 982 | def __call__(self, D, url, html): 983 | # override this method to add scraping code ... 984 | return self.crawl(D, url, html) 985 | 986 | 987 | def normalize(self, url, link): 988 | """Normalize the link to avoid duplicates 989 | 990 | >>> cb = CrawlerCallback() 991 | >>> cb.normalize('http://example.com', '../abc.html') 992 | 'http://example.com/abc.html' 993 | >>> cb.normalize('http://example.com', 'abc.html#link') 994 | 'http://example.com/abc.html' 995 | >>> cb.normalize('http://example.com', 'abc.html?a=1&b=2') 996 | 'http://example.com/abc.html?a=1&b=2' 997 | """ 998 | link, _ = urlparse.urldefrag(link) # remove hash to avoid duplicates 999 | link = common.unescape(link) # parse escaped characters such as & 1000 | link = urlparse.urljoin(url, link) # support relative links 1001 | while urlparse.urlsplit(link).path.startswith('/..'): 1002 | # remove invalid parent directory 1003 | link = link.replace('/..', '', 1) 1004 | return link 1005 | 1006 | 1007 | def crawl(self, D, url, html): 1008 | """Crawl website html and return list of URLs crawled 1009 | """ 1010 | def valid(link): 1011 | """Check if should crawl this link 1012 | """ 1013 | # check if a media file 1014 | if common.get_extension(link) not in common.MEDIA_EXTENSIONS: 1015 | # check if a proper HTTP link 1016 | if link.lower().startswith('http'): 1017 | # only crawl within website 1018 | if common.same_domain(domain, link): 1019 | # passes regex 1020 | if self.allowed_urls.match(link) and not self.banned_urls.match(link): 1021 | # not blocked by robots.txt 1022 | if not self.robots or self.robots.can_fetch(settings.user_agent, link): 1023 | # allowed to recrawl 1024 | if self.crawl_existing or (D.cache and link not in D.cache): 1025 | return True 1026 | return False 1027 | 1028 | domain = common.get_domain(url) 1029 | depth = self.found[url] 1030 | outstanding = [] 1031 | if depth != self.max_depth: 1032 | # extract links to continue crawling 1033 | links_re = re.compile(']+href=["\'](.*?)["\']', re.IGNORECASE) 1034 | for link in links_re.findall(html): 1035 | try: 1036 | link = self.normalize(url, link) 1037 | except UnicodeDecodeError as e: 1038 | # unicode error when joining url 1039 | common.logger.info(e) 1040 | else: 1041 | if link not in self.found: 1042 | self.found[link] = depth + 1 1043 | if valid(link): 1044 | # is a new link 1045 | outstanding.append(link) 1046 | if len(outstanding) == self.max_links: 1047 | break 1048 | return outstanding 1049 | 1050 | 1051 | class DataCrawler: 1052 | """Crawl a website and return all matches extracted using a given function 1053 | """ 1054 | def __init__(self, D, extract_fn): 1055 | """ 1056 | extract_fn: 1057 | a function to parse given HTML and return a list of matches 1058 | """ 1059 | self.D = D 1060 | self.extract_fn = extract_fn 1061 | 1062 | def link_score(self, link): 1063 | """Return how valuable this link is for ordering crawling 1064 | The lower the better""" 1065 | link = link.lower() 1066 | total = 0 1067 | if 'contact' in link: 1068 | pass # this page is top priority 1069 | 
elif 'about' in link: 1070 | total += 10 1071 | elif 'help' in link: 1072 | total += 20 1073 | else: 1074 | # generic page 1075 | total += 100 1076 | # bias towards shorter links 1077 | total += len(link) 1078 | return total 1079 | 1080 | def find(self, website, max_depth, max_urls, max_results): 1081 | """ 1082 | website: 1083 | the URL of website to crawl 1084 | max_depth: 1085 | how many links deep to follow before stop crawl 1086 | max_urls: 1087 | how many URL's to download before stop crawl 1088 | max_results: 1089 | The maximum number of results to extract before stop crawl. 1090 | If None then extract all results found in crawl. 1091 | """ 1092 | # check for redirect URL 1093 | self.D.get(website) 1094 | redirect_url = self.D.cache.meta(website).get('url') if self.D.cache else self.final_url 1095 | website = redirect_url or website 1096 | 1097 | domain = urlparse.urlparse(website).netloc 1098 | scraped = adt.HashDict() 1099 | c = CrawlerCallback(max_depth=max_depth) 1100 | outstanding = [(0, website)] # list of URLs and their score 1101 | results = [] 1102 | while outstanding and (max_urls is None or len(scraped) < max_urls) \ 1103 | and (max_results is None or len(results) < max_results): 1104 | _, url = outstanding.pop(0) 1105 | scraped[url] = True 1106 | html = self.D.get(url, num_retries=0) 1107 | 1108 | if html: 1109 | for result in self.extract_fn(html): 1110 | if result not in results: 1111 | results.append(result) 1112 | if len(results) == max_results: 1113 | break 1114 | # crawl the linked URLs 1115 | for link in c.crawl(self, url, html): 1116 | if urlparse.urlparse(link).netloc == domain: 1117 | if link not in scraped: 1118 | # insert sort this new record so crawl most promising first 1119 | score = self.link_score(link) 1120 | for i, (other_score, other_link) in enumerate(outstanding): 1121 | if score < other_score: 1122 | outstanding.insert(i, ((score, link))) 1123 | break 1124 | else: 1125 | outstanding.append((score, link)) 1126 | return results 1127 | -------------------------------------------------------------------------------- /pdict.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | pdict has a dictionary like interface and a sqlite backend 3 | It uses pickle to store Python objects and strings, which are then compressed 4 | Multithreading is supported 5 | """ 6 | 7 | import os 8 | import sys 9 | import datetime 10 | import time 11 | import sqlite3 12 | import zlib 13 | import itertools 14 | import threading 15 | import md5 16 | import shutil 17 | import glob 18 | try: 19 | import cPickle as pickle 20 | except ImportError: 21 | import pickle 22 | try: 23 | # gdbm produces best performance 24 | import gdbm as dbm 25 | except ImportError: 26 | import anydbm as dbm 27 | 28 | DEFAULT_LIMIT = 1000 29 | DEFAULT_TIMEOUT = 10000 30 | 31 | 32 | 33 | def opendb(*argv, **kwargs): 34 | try: 35 | db = PersistentDict(*argv, **kwargs) 36 | except sqlite3.DatabaseError: 37 | db = DbmDict(*argv, **kwargs) 38 | #except dbm.error: 39 | return db 40 | 41 | 42 | class PersistentDict: 43 | """Stores and retrieves persistent data through a dict-like interface 44 | Data is stored compressed on disk using sqlite3 45 | 46 | filename: 47 | where to store sqlite database. Uses in memory by default. 48 | compress_level: 49 | between 1-9 (in my test levels 1-3 produced a 1300kb file in ~7 seconds while 4-9 a 288kb file in ~9 seconds) 50 | expires: 51 | a timedelta object of how old data can be before expires. 
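For example, passing expires=datetime.timedelta(hours=1) makes entries older than one hour behave as if they were never stored.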
By default is set to None to disable. 52 | timeout: 53 | how long should a thread wait for sqlite to be ready (in ms) 54 | isolation_level: 55 | None for autocommit or else 'DEFERRED' / 'IMMEDIATE' / 'EXCLUSIVE' 56 | 57 | >>> cache = PersistentDict() 58 | >>> url = 'http://google.com/abc' 59 | >>> html = 'abc' 60 | >>> 61 | >>> url in cache 62 | False 63 | >>> len(cache) 64 | 0 65 | >>> cache[url] = html 66 | >>> url in cache 67 | True 68 | >>> len(cache) 69 | 1 70 | >>> cache[url] == html 71 | True 72 | >>> cache.get(url)['value'] == html 73 | True 74 | >>> cache.meta(url) 75 | {} 76 | >>> cache.meta(url, 'meta') 77 | >>> cache.meta(url) 78 | 'meta' 79 | >>> del cache[url] 80 | >>> url in cache 81 | False 82 | >>> os.remove(cache.filename) 83 | """ 84 | def __init__(self, filename='cache.db', compress_level=6, expires=None, timeout=DEFAULT_TIMEOUT, isolation_level=None): 85 | """initialize a new PersistentDict with the specified database file. 86 | """ 87 | self.filename = filename 88 | self.compress_level, self.expires, self.timeout, self.isolation_level = \ 89 | compress_level, expires, timeout, isolation_level 90 | self.conn = sqlite3.connect(filename, timeout=timeout, isolation_level=isolation_level, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) 91 | self.conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace') 92 | sql = """ 93 | CREATE TABLE IF NOT EXISTS config ( 94 | key TEXT NOT NULL PRIMARY KEY UNIQUE, 95 | value BLOB, 96 | meta BLOB, 97 | status INTEGER, 98 | updated timestamp DEFAULT (datetime('now', 'localtime')) 99 | ); 100 | """ 101 | self.conn.execute(sql) 102 | self.conn.execute("CREATE INDEX IF NOT EXISTS keys ON config (key);") 103 | 104 | 105 | def __copy__(self): 106 | """make a copy of current cache settings 107 | """ 108 | return PersistentDict(filename=self.filename, compress_level=self.compress_level, expires=self.expires, 109 | timeout=self.timeout, isolation_level=self.isolation_level) 110 | 111 | 112 | def __contains__(self, key): 113 | """check the database to see if a key exists 114 | """ 115 | row = self.conn.execute("SELECT updated FROM config WHERE key=?;", (key,)).fetchone() 116 | return row and self.is_fresh(row[0]) 117 | 118 | 119 | def contains(self, keys, ignore_expires=False): 120 | """check if a list of keys exist 121 | 122 | >>> # try 0 second expiration so expires immediately 123 | >>> cache = PersistentDict(expires=datetime.timedelta(seconds=0)) 124 | >>> cache['a'] = 1; 125 | >>> cache.contains(['a', 'b']) 126 | [] 127 | >>> cache.contains(['a', 'b'], ignore_expires=True) 128 | [u'a'] 129 | >>> os.remove(cache.filename) 130 | """ 131 | results = [] 132 | c = self.conn.cursor() 133 | c.execute("SELECT key, updated FROM config WHERE key IN (%s);" % ','.join(len(keys)*'?'), keys) 134 | for row in c: 135 | if ignore_expires or self.is_fresh(row[1]): 136 | results.append(row[0]) 137 | return results 138 | 139 | 140 | def __iter__(self): 141 | """iterate each key in the database 142 | """ 143 | c = self.conn.cursor() 144 | c.execute("SELECT key FROM config;") 145 | for row in c: 146 | yield row[0] 147 | 148 | 149 | def __nonzero__(self): 150 | return True 151 | 152 | 153 | def __len__(self): 154 | """Return the number of entries in the cache 155 | """ 156 | c = self.conn.cursor() 157 | c.execute("SELECT count(*) FROM config;") 158 | return c.fetchone()[0] 159 | 160 | 161 | def __getitem__(self, key): 162 | """return the value of the specified key or raise KeyError if not found 163 | """ 164 | row = self.conn.execute("SELECT value, 
updated FROM config WHERE key=?;", (key,)).fetchone() 165 | if row: 166 | if self.is_fresh(row[1]): 167 | value = row[0] 168 | return self.deserialize(value) 169 | else: 170 | raise KeyError("Key `%s' is stale" % key) 171 | else: 172 | raise KeyError("Key `%s' does not exist" % key) 173 | 174 | 175 | def __delitem__(self, key): 176 | """remove the specifed value from the database 177 | """ 178 | self.conn.execute("DELETE FROM config WHERE key=?;", (key,)) 179 | 180 | 181 | def __setitem__(self, key, value): 182 | """set the value of the specified key 183 | """ 184 | updated = datetime.datetime.now() 185 | self.conn.execute("INSERT OR REPLACE INTO config (key, value, meta, updated) VALUES(?, ?, ?, ?);", ( 186 | key, self.serialize(value), self.serialize({}), updated) 187 | ) 188 | 189 | 190 | def serialize(self, value): 191 | """convert object to a compressed pickled string to save in the db 192 | """ 193 | return sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), self.compress_level)) 194 | 195 | def deserialize(self, value): 196 | """convert compressed pickled string from database back into an object 197 | """ 198 | if value: 199 | return pickle.loads(zlib.decompress(value)) 200 | 201 | 202 | def is_fresh(self, t): 203 | """returns whether this datetime has expired 204 | """ 205 | return self.expires is None or datetime.datetime.now() - t < self.expires 206 | 207 | 208 | def get(self, key, default=None): 209 | """Get data at key and return default if not defined 210 | """ 211 | data = default 212 | if key: 213 | row = self.conn.execute("SELECT value, meta, updated FROM config WHERE key=?;", (key,)).fetchone() 214 | if row: 215 | if self.is_fresh(row[2]): 216 | value = row[0] 217 | data = dict( 218 | value=self.deserialize(value), 219 | meta=self.deserialize(row[1]), 220 | updated=row[2] 221 | ) 222 | return data 223 | 224 | 225 | def meta(self, key, value=None): 226 | """Get / set meta for this value 227 | 228 | if value is passed then set the meta attribute for this key 229 | if not then get the existing meta data for this key 230 | """ 231 | if value is None: 232 | # want to get meta 233 | row = self.conn.execute("SELECT meta FROM config WHERE key=?;", (key,)).fetchone() 234 | if row: 235 | return self.deserialize(row[0]) 236 | else: 237 | raise KeyError("Key `%s' does not exist" % key) 238 | else: 239 | # want to set meta 240 | self.conn.execute("UPDATE config SET meta=?, updated=? WHERE key=?;", (self.serialize(value), datetime.datetime.now(), key)) 241 | 242 | 243 | def clear(self): 244 | """Clear all cached data 245 | """ 246 | self.conn.execute("DELETE FROM config;") 247 | 248 | 249 | def merge(self, db, override=False): 250 | """Merge this databases content 251 | override determines whether to override existing keys 252 | """ 253 | for key in db.keys(): 254 | if override or key not in self: 255 | self[key] = db[key] 256 | 257 | 258 | def vacuum(self): 259 | self.conn.execute('VACUUM') 260 | 261 | 262 | class DbmDict: 263 | """Experimental new version of PersistentDict that uses the dbm modules instead 264 | This allows lazy writes instead of a transaction for each write 265 | 266 | filename: 267 | where to store sqlite database. Uses in memory by default. 
268 | compress_level: 269 | between 1-9 (in my test levels 1-3 produced a 1300kb file in ~7 seconds while 4-9 a 288kb file in ~9 seconds) 270 | 271 | >>> filename = 'dbm.db' 272 | >>> cache = DbmDict(filename) 273 | >>> url = 'http://google.com/abc' 274 | >>> html = 'abc' 275 | >>> 276 | >>> url in cache 277 | False 278 | >>> cache[url] = html 279 | >>> url in cache 280 | True 281 | >>> cache[url] == html 282 | True 283 | >>> cache.meta(url) 284 | {} 285 | >>> cache.meta(url, 'meta') 286 | >>> cache.meta(url) 287 | 'meta' 288 | >>> urls = list(cache) 289 | >>> del cache[url] 290 | >>> url in cache 291 | False 292 | >>> os.remove(filename) 293 | """ 294 | def __init__(self, filename='dbm.db', compress_level=6): 295 | """initialize a new PersistentDict with the specified database file. 296 | """ 297 | self.filename, self.compress_level = filename, compress_level 298 | self.db = dbm.open(filename, 'c') 299 | self.lock = threading.Lock() 300 | 301 | 302 | def __copy__(self): 303 | """make a copy of current cache settings 304 | """ 305 | return PersistentDict(filename=self.filename, compress_level=self.compress_level) 306 | 307 | 308 | def __contains__(self, key): 309 | """check the database to see if a key exists 310 | """ 311 | with self.lock: 312 | return self.db.has_key(key) 313 | 314 | 315 | def __iter__(self): 316 | """iterate each key in the database 317 | """ 318 | with self.lock: 319 | key = self.db.firstkey() 320 | while key != None: 321 | yield key 322 | with self.lock: 323 | key = self.db.nextkey(key) 324 | 325 | 326 | def __getitem__(self, key): 327 | """return the value of the specified key or raise KeyError if not found 328 | """ 329 | with self.lock: 330 | value = self.db[key] 331 | return self.deserialize(value) 332 | 333 | 334 | def __delitem__(self, key): 335 | """remove the specifed value from the database 336 | """ 337 | with self.lock: 338 | del self.db[key] 339 | 340 | 341 | def __setitem__(self, key, value): 342 | """set the value of the specified key 343 | """ 344 | value = self.serialize(value) 345 | with self.lock: 346 | self.db[key] = value 347 | 348 | 349 | def serialize(self, value): 350 | """convert object to a compressed pickled string to save in the db 351 | """ 352 | return zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), self.compress_level) 353 | 354 | 355 | def deserialize(self, value): 356 | """convert compressed pickled string from database back into an object 357 | """ 358 | if value: 359 | return pickle.loads(zlib.decompress(value)) 360 | 361 | 362 | def get(self, key, default=None): 363 | """Get data at key and return default if not defined 364 | """ 365 | try: 366 | value = self[key] 367 | except KeyError: 368 | value = default 369 | return value 370 | 371 | 372 | def meta(self, key, value=None, prefix='__meta__'): 373 | """Get / set meta for this value 374 | 375 | if value is passed then set the meta attribute for this key 376 | if not then get the existing meta data for this key 377 | """ 378 | key = prefix + key 379 | if value is None: 380 | # get the meta data 381 | return self.get(key, {}) 382 | else: 383 | # set the meta data 384 | self[key] = value 385 | 386 | 387 | def clear(self): 388 | """Clear all cached data 389 | """ 390 | for key in self: 391 | del self[key] 392 | 393 | 394 | def merge(self, db, override=False): 395 | """Merge this databases content 396 | override determines whether to override existing keys 397 | """ 398 | for key in db: 399 | if override or key not in self: 400 | self[key] = db[key] 401 | 402 | class 
Queue: 403 | """Stores queue of outstanding URL's on disk 404 | 405 | >>> filename = 'queue.db' 406 | >>> queue = Queue(filename) 407 | >>> keys = [('a', 1), ('b', 2), ('c', 1)] 408 | >>> queue.push(keys) # add new keys 409 | >>> len(queue) 410 | 3 411 | >>> queue.push(keys) # trying adding duplicate keys 412 | >>> len(queue) 413 | 3 414 | >>> queue.clear(keys=['a']) 415 | 1 416 | >>> queue.pull(limit=1) 417 | [u'b'] 418 | >>> queue.clear() # remove all queue 419 | 1 420 | >>> os.remove(filename) 421 | """ 422 | size = None # track the size of the queue 423 | counter = itertools.count(1).next # counter gives a unique status for each pull() 424 | 425 | def __init__(self, filename, timeout=DEFAULT_TIMEOUT, isolation_level=None): 426 | self._conn = sqlite3.connect(filename, timeout=timeout, isolation_level=isolation_level, detect_types=sqlite3.PARSE_DECLTYPES|sqlite3.PARSE_COLNAMES) 427 | self._conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace') 428 | sql = """ 429 | CREATE TABLE IF NOT EXISTS queue ( 430 | key TEXT NOT NULL PRIMARY KEY UNIQUE, 431 | status INTEGER, 432 | priority INTEGER 433 | ); 434 | """ 435 | self._conn.execute(sql) 436 | self._conn.execute("CREATE INDEX IF NOT EXISTS priorities ON queue (priority);") 437 | if Queue.size is None: 438 | self._update_size() 439 | 440 | 441 | def __len__(self): 442 | """Get number of records queued 443 | """ 444 | return Queue.size 445 | 446 | def _update_size(self): 447 | """Calculate the number of records queued 448 | """ 449 | row = self._conn.execute("SELECT count(*) FROM queue WHERE status=?;", (False,)).fetchone() 450 | Queue.size = row[0] 451 | 452 | 453 | def push(self, key_map): 454 | """Add these keys to the queue 455 | Will not insert if key already exists. 456 | 457 | key_map: 458 | a list of (key, priority) tuples 459 | """ 460 | if key_map: 461 | c = self._conn.cursor() 462 | c.execute("BEGIN TRANSACTION") 463 | c.executemany("INSERT OR IGNORE INTO queue (key, priority, status) VALUES(?, ?, ?);", [(key, priority, False) for key, priority in key_map]) 464 | c.execute("END TRANSACTION") 465 | self._update_size() 466 | 467 | 468 | def pull(self, limit=DEFAULT_LIMIT): 469 | """Get queued keys up to limit 470 | """ 471 | status = Queue.counter() 472 | self._conn.execute('UPDATE queue SET status=? WHERE key in (SELECT key FROM queue WHERE status=? ORDER BY priority DESC LIMIT ?);', (status, False, limit)) 473 | rows = self._conn.execute('SELECT key FROM queue WHERE status=? LIMIT ?', (status, limit)) 474 | keys = [row[0] for row in rows] 475 | Queue.size -= len(keys) 476 | if Queue.size < 0: 477 | Queue.size = 0 478 | return keys 479 | 480 | 481 | def clear(self, keys=None): 482 | """Remove keys from queue. 483 | If keys is None remove all. 484 | 485 | Returns the number of keys removed 486 | """ 487 | prev_size = len(self) 488 | c = self._conn.cursor() 489 | if keys: 490 | c.execute("BEGIN TRANSACTION") 491 | c.executemany("DELETE FROM queue WHERE key=?;", [(key,) for key in keys]) 492 | c.execute("END TRANSACTION") 493 | self._update_size() 494 | else: 495 | c.execute("DELETE FROM queue;") 496 | Queue.size = 0 497 | return prev_size - len(self) 498 | 499 | 500 | 501 | class FSCache: 502 | """ 503 | Dictionary interface that stores cached 504 | values in the file system rather than in memory. 505 | The file path is formed from an md5 hash of the key. 
506 | 507 | folder: 508 | the root level folder for the cache 509 | 510 | >>> fscache = FSCache('.') 511 | >>> url = 'http://google.com/abc' 512 | >>> html = 'abc' 513 | >>> url in fscache 514 | False 515 | >>> fscache[url] = html 516 | >>> url in fscache 517 | True 518 | >>> fscache.get(url) == html 519 | True 520 | >>> fscache.get(html) == '' 521 | True 522 | >>> fscache.clear() 523 | """ 524 | PARENT_DIR = 'fscache' 525 | FILE_NAME = 'index.html' 526 | 527 | def __init__(self, folder): 528 | self.folder = os.path.join(folder, FSCache.PARENT_DIR) 529 | 530 | 531 | def __contains__(self, key): 532 | """Does data for this key exist 533 | """ 534 | return os.path.exists(self._key_path(key)) 535 | 536 | 537 | def __getitem__(self, key): 538 | path = self._key_path(key) 539 | try: 540 | fp = open(path, 'rb') 541 | except IOError: 542 | # key does not exist 543 | raise KeyError('%s does not exist' % key) 544 | else: 545 | # get value in key 546 | return fp.read() 547 | 548 | 549 | def __setitem__(self, key, value): 550 | """Save value at this key to this value 551 | """ 552 | path = self._key_path(key) 553 | folder = os.path.dirname(path) 554 | if not os.path.exists(folder): 555 | os.makedirs(folder) 556 | open(path, 'wb').write(value) 557 | 558 | 559 | def __delitem__(self, key): 560 | """Remove the value at this key and any empty parent sub-directories 561 | """ 562 | path = self._key_path(key) 563 | try: 564 | os.remove(path) 565 | os.removedirs(os.path.dirname(path)) 566 | except OSError: 567 | pass 568 | 569 | def _key_path(self, key): 570 | """The fils system path for this key 571 | """ 572 | # create unique hash for this key 573 | try: 574 | key = key.encode('utf-8') 575 | except UnicodeDecodeError: 576 | pass 577 | h = md5.md5(key).hexdigest() 578 | # create file system path 579 | path = os.path.join(self.folder, os.path.sep.join(h), FSCache.FILE_NAME) 580 | return path 581 | 582 | 583 | def get(self, key, default=''): 584 | """Get data at this key and return default if does not exist 585 | """ 586 | try: 587 | value = self[key] 588 | except KeyError: 589 | value = default 590 | return value 591 | 592 | 593 | def clear(self): 594 | """Remove all the cached values 595 | """ 596 | if os.path.exists(self.folder): 597 | shutil.rmtree(self.folder) 598 | 599 | 600 | 601 | if __name__ == '__main__': 602 | import tempfile 603 | import webbrowser 604 | from optparse import OptionParser 605 | parser = OptionParser(usage='usage: %prog [options] ') 606 | parser.add_option('-k', '--key', dest='key', help='The key to use') 607 | parser.add_option('-v', '--value', dest='value', help='The value to store') 608 | parser.add_option('-b', '--browser', action='store_true', dest='browser', default=False, help='View content of this key in a web browser') 609 | parser.add_option('-c', '--clear', action='store_true', dest='clear', default=False, help='Clear all data for this cache') 610 | parser.add_option('-s', '--size', action='store_true', dest='size', default=False, help='Display size of database') 611 | options, args = parser.parse_args() 612 | if not args: 613 | parser.error('Must specify the cache file') 614 | cache = PersistentDict(args[0]) 615 | 616 | if options.value: 617 | # store thie value 618 | if options.key: 619 | cache[options.key] = options.value 620 | else: 621 | parser.error('Must specify the key') 622 | elif options.browser: 623 | if options.key: 624 | value = cache[options.key] 625 | filename = tempfile.NamedTemporaryFile().name 626 | fp = open(filename, 'w') 627 | fp.write(str(value)) 
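# flush now so the browser opens the complete content rather than a partially written temp file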
628 | fp.flush() 629 | webbrowser.open(filename) 630 | else: 631 | parser.error('Must specify the key') 632 | elif options.key: 633 | print cache[options.key] 634 | elif options.clear: 635 | if raw_input('Really? Clear the cache? (y/n) ') == 'y': 636 | cache.clear() 637 | print 'cleared' 638 | elif options.size: 639 | print len(cache) 640 | else: 641 | parser.error('No options selected') 642 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'default application wide settings' 2 | 3 | import sys 4 | import os 5 | import logging 6 | 7 | 8 | # default location to store output state files 9 | dirname, filename = os.path.split(sys.argv[0]) 10 | state_dir = os.path.join(dirname, '.' + filename.replace('.py', '')) 11 | if not os.path.exists(state_dir): 12 | try: 13 | os.mkdir(state_dir) 14 | except OSError as e: 15 | state_dir = '' 16 | #print 'Unable to create state directory:', e 17 | cache_file = os.path.relpath(os.path.join(state_dir, 'cache.db')) # file to use for pdict cache 18 | queue_file = os.path.relpath(os.path.join(state_dir, 'queue.db')) # file to use for pdict queue 19 | status_file = os.path.join(state_dir, 'status.js') # where to store state of crawl 20 | log_file = os.path.join(state_dir, 'webscraping.log') # default logging file 21 | 22 | log_level = logging.INFO # logging level 23 | default_encoding = 'utf-8' 24 | default_headers = { 25 | 'Referer': '', 26 | 'Accept-Language': 'en-us,en;q=0.5', 27 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' 28 | } 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from distutils.core import setup 3 | 4 | def read(filename): 5 | return open(os.path.join(os.path.dirname(__file__), filename)).read() 6 | 7 | setup( 8 | name='webscraping', 9 | version='1.7.1', 10 | packages=['webscraping'], 11 | package_dir={'webscraping':'.'}, # look for package contents in current directory 12 | author='Richard Penman', 13 | author_email='richard@webscraping.com', 14 | description='Pure python library aimed to make web scraping easier', 15 | long_description=read('README.rst'), 16 | url='https://github.com/richardpenman/webscraping', 17 | classifiers = [ 18 | 'Environment :: Web Environment', 19 | 'Intended Audience :: Developers', 20 | 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', 21 | 'Operating System :: OS Independent', 22 | 'Programming Language :: Python', 23 | 'Topic :: Internet :: WWW/HTTP' 24 | ], 25 | license='lgpl' 26 | ) 27 | -------------------------------------------------------------------------------- /webkit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __doc__ = 'Interface to qt webkit for loading and interacting with JavaScript dependent webpages' 4 | 5 | import sys, os, re, urllib2, random, itertools, json 6 | from time import time, sleep 7 | from datetime import datetime 8 | 9 | # for using native Python strings 10 | import sip 11 | sip.setapi('QString', 2) 12 | from PyQt4.QtGui import QApplication, QDesktopServices, QImage, QPainter, QMouseEvent, QKeyEvent, QKeySequence 13 | from PyQt4.QtCore import Qt, QByteArray, QUrl, QTimer, QEventLoop, QIODevice, QObject, QPoint, QEvent 14 | from PyQt4.QtWebKit import QWebFrame, 
QWebView, QWebElement, QWebPage, QWebSettings, QWebInspector 15 | from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkProxy, QNetworkRequest, QNetworkReply, QNetworkDiskCache 16 | 17 | # maximum number of bytes to read from a POST request 18 | MAX_POST_SIZE = 2 ** 25 19 | 20 | import alg, common, pdict, settings 21 | 22 | 23 | class NetworkAccessManager(QNetworkAccessManager): 24 | def __init__(self, proxy, use_cache): 25 | """Subclass QNetworkAccessManager for finer control network operations 26 | 27 | proxy: the string of a proxy to download through 28 | use_cache: whether to cache replies so that can load faster with the same content subsequent times 29 | """ 30 | super(NetworkAccessManager, self).__init__() 31 | self.setProxy(proxy) 32 | self.sslErrors.connect(self.sslErrorHandler) 33 | # the requests that are still active 34 | self.active_requests = [] 35 | self.cache = pdict.PersistentDict(settings.cache_file) if use_cache else None 36 | 37 | 38 | def shutdown(self): 39 | """Network is shutting down event 40 | """ 41 | # prevent new requests 42 | self.setNetworkAccessible(QNetworkAccessManager.NotAccessible) 43 | # abort existing requests 44 | for request in self.active_requests: 45 | request.abort() 46 | request.deleteLater() 47 | 48 | 49 | def setProxy(self, proxy): 50 | """Parse proxy components from proxy 51 | """ 52 | if proxy: 53 | fragments = common.parse_proxy(proxy) 54 | if fragments['host']: 55 | QNetworkAccessManager.setProxy(self, 56 | QNetworkProxy(QNetworkProxy.HttpProxy, 57 | fragments['host'], int(fragments['port']), 58 | fragments['username'], fragments['password'] 59 | ) 60 | ) 61 | else: 62 | common.logger.info('Invalid proxy: ' + str(proxy)) 63 | 64 | 65 | def createRequest(self, operation, request, post): 66 | """Override creating a network request 67 | """ 68 | url = request.url().toString() 69 | if str(request.url().path()).endswith('.ttf'): 70 | # block fonts, which can cause webkit to crash 71 | common.logger.debug(u'Blocking: {}'.format(url)) 72 | request.setUrl(QUrl()) 73 | 74 | data = post if post is None else post.peek(MAX_POST_SIZE) 75 | key = u'{} {}'.format(url, data) 76 | use_cache = not url.startswith('file') 77 | if self.cache is not None and use_cache and key in self.cache: 78 | common.logger.debug(u'Load from cache: {}'.format(key)) 79 | content, headers, attributes = self.cache[key] 80 | reply = CachedNetworkReply(self, request.url(), content, headers, attributes) 81 | else: 82 | common.logger.debug(u'Request: {} {}'.format(url, post or '')) 83 | reply = QNetworkAccessManager.createRequest(self, operation, request, post) 84 | reply.error.connect(self.catch_error) 85 | self.active_requests.append(reply) 86 | reply.destroyed.connect(self.active_requests.remove) 87 | # save reference to original request 88 | reply.content = QByteArray() 89 | reply.readyRead.connect(self._save_content(reply)) 90 | if self.cache is not None and use_cache: 91 | reply.finished.connect(self._cache_content(reply, key)) 92 | reply.orig_request = request 93 | reply.data = self.parse_data(data) 94 | return reply 95 | 96 | 97 | def _save_content(self, r): 98 | """Save copy of reply content before is lost 99 | """ 100 | def save_content(): 101 | r.content.append(r.peek(r.size())) 102 | return save_content 103 | 104 | def _cache_content(self, r, key): 105 | """Cache downloaded content 106 | """ 107 | def cache_content(): 108 | headers = [(header, r.rawHeader(header)) for header in r.rawHeaderList()] 109 | attributes = [] 110 | 
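# only a small, picklable subset of reply attributes is cached below (status code,
# reason phrase, connection-encrypted flag); each value is unwrapped from its
# QVariant (toInt / toByteArray / toBool) before being stored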
attributes.append((QNetworkRequest.HttpStatusCodeAttribute, r.attribute(QNetworkRequest.HttpStatusCodeAttribute).toInt())) 111 | attributes.append((QNetworkRequest.HttpReasonPhraseAttribute, r.attribute(QNetworkRequest.HttpReasonPhraseAttribute).toByteArray())) 112 | #attributes.append((QNetworkRequest.RedirectionTargetAttribute, r.attribute(QNetworkRequest.RedirectionTargetAttribute).toUrl())) 113 | attributes.append((QNetworkRequest.ConnectionEncryptedAttribute, r.attribute(QNetworkRequest.ConnectionEncryptedAttribute).toBool())) 114 | #attributes.append((QNetworkRequest.CacheLoadControlAttribute, r.attribute(QNetworkRequest.CacheLoadControlAttribute).toInt())) 115 | #attributes.append((QNetworkRequest.CacheSaveControlAttribute, r.attribute(QNetworkRequest.CacheSaveControlAttribute).toBool())) 116 | #attributes.append((QNetworkRequest.SourceIsFromCacheAttribute, r.attribute(QNetworkRequest.SourceIsFromCacheAttribute).toBool())) 117 | #print 'save cache:', key, len(r.content), len(headers), attributes 118 | self.cache[key] = r.content, headers, attributes 119 | return cache_content 120 | 121 | 122 | def parse_data(self, data): 123 | """Parse this posted data into a list of key/value pairs 124 | """ 125 | if data is None: 126 | result = [] 127 | else: 128 | try: 129 | result = json.loads(unicode(data)) 130 | if isinstance(result, dict): 131 | result = result.items() 132 | if not isinstance(result, list): 133 | common.logger.info(u'Unexpected data format: {}'.format(result)) 134 | result = [] 135 | except ValueError: 136 | url = QUrl('') 137 | url.setEncodedQuery(data) 138 | result = url.queryItems() 139 | return result 140 | 141 | 142 | def catch_error(self, eid): 143 | """Interpret the HTTP error ID received 144 | """ 145 | if eid not in (5, 301): 146 | errors = { 147 | 0 : 'no error condition. Note: When the HTTP protocol returns a redirect no error will be reported. You can check if there is a redirect with the QNetworkRequest::RedirectionTargetAttribute attribute.', 148 | 1 : 'the remote server refused the connection (the server is not accepting requests)', 149 | 2 : 'the remote server closed the connection prematurely, before the entire reply was received and processed', 150 | 3 : 'the remote host name was not found (invalid hostname)', 151 | 4 : 'the connection to the remote server timed out', 152 | 5 : 'the operation was canceled via calls to abort() or close() before it was finished.', 153 | 6 : 'the SSL/TLS handshake failed and the encrypted channel could not be established. The sslErrors() signal should have been emitted.', 154 | 7 : 'the connection was broken due to disconnection from the network, however the system has initiated roaming to another access point. 
The request should be resubmitted and will be processed as soon as the connection is re-established.', 155 | 101 : 'the connection to the proxy server was refused (the proxy server is not accepting requests)', 156 | 102 : 'the proxy server closed the connection prematurely, before the entire reply was received and processed', 157 | 103 : 'the proxy host name was not found (invalid proxy hostname)', 158 | 104 : 'the connection to the proxy timed out or the proxy did not reply in time to the request sent', 159 | 105 : 'the proxy requires authentication in order to honour the request but did not accept any credentials offered (if any)', 160 | 201 : 'the access to the remote content was denied (similar to HTTP error 401)', 161 | 202 : 'the operation requested on the remote content is not permitted', 162 | 203 : 'the remote content was not found at the server (similar to HTTP error 404)', 163 | 204 : 'the remote server requires authentication to serve the content but the credentials provided were not accepted (if any)', 164 | 205 : 'the request needed to be sent again, but this failed for example because the upload data could not be read a second time.', 165 | 301 : 'the Network Access API cannot honor the request because the protocol is not known', 166 | 302 : 'the requested operation is invalid for this protocol', 167 | 99 : 'an unknown network-related error was detected', 168 | 199 : 'an unknown proxy-related error was detected', 169 | 299 : 'an unknown error related to the remote content was detected', 170 | 399 : 'a breakdown in protocol was detected (parsing error, invalid or unexpected responses, etc.)', 171 | } 172 | common.logger.debug('Error %d: %s (%s)' % (eid, errors.get(eid, 'unknown error'), self.sender().url().toString())) 173 | 174 | 175 | def sslErrorHandler(self, reply, errors): 176 | common.logger.info('SSL errors: {}'.format(errors)) 177 | reply.ignoreSslErrors() 178 | 179 | 180 | 181 | class CachedNetworkReply(QNetworkReply): 182 | def __init__(self, parent, url, content, headers, attributes): 183 | super(CachedNetworkReply, self).__init__(parent) 184 | self.setUrl(url) 185 | self.content = content 186 | self.offset = 0 187 | for header, value in headers: 188 | self.setRawHeader(header, value) 189 | #self.setHeader(QNetworkRequest.ContentLengthHeader, len(content)) 190 | for attribute, value in attributes: 191 | self.setAttribute(attribute, value) 192 | self.setOpenMode(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered) 193 | # trigger signals that content is ready 194 | QTimer.singleShot(0, self.readyRead) 195 | QTimer.singleShot(0, self.finished) 196 | 197 | def bytesAvailable(self): 198 | return len(self.content) - self.offset 199 | 200 | def isSequential(self): 201 | return True 202 | 203 | def abort(self): 204 | pass # qt requires that this be defined 205 | 206 | def readData(self, size): 207 | """Return up to size bytes from buffer 208 | """ 209 | if self.offset >= len(self.content): 210 | return '' 211 | number = min(size, len(self.content) - self.offset) 212 | data = self.content[self.offset : self.offset + number] 213 | self.offset += number 214 | return str(data) 215 | 216 | 217 | 218 | class WebPage(QWebPage): 219 | def __init__(self, user_agent, confirm=True): 220 | """Override QWebPage to set User-Agent and JavaScript messages 221 | 222 | user_agent: the User Agent to submit 223 | confirm: default response to confirm dialog boxes 224 | """ 225 | super(WebPage, self).__init__() 226 | self.user_agent = user_agent 227 | self.confirm = confirm 228 | 
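# forward content types webkit cannot render (e.g. file downloads) through the
# unsupportedContent signal instead of silently dropping them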
self.setForwardUnsupportedContent(True) 229 | 230 | def userAgentForUrl(self, url): 231 | """Use same user agent for all URL's 232 | """ 233 | return self.user_agent 234 | 235 | def javaScriptAlert(self, frame, message): 236 | """Override default JavaScript alert popup and send to log 237 | """ 238 | common.logger.debug('Alert: ' + message) 239 | 240 | 241 | def javaScriptConfirm(self, frame, message): 242 | """Override default JavaScript confirm popup and send to log 243 | """ 244 | common.logger.debug('Confirm: ' + message) 245 | return self.confirm 246 | 247 | 248 | def javaScriptPrompt(self, frame, message, default): 249 | """Override default JavaScript prompt popup and send to log 250 | """ 251 | common.logger.debug('Prompt: {} {}'.format(message, default)) 252 | 253 | 254 | def javaScriptConsoleMessage(self, message, line_number, source_id): 255 | """Override default JavaScript console and send to log 256 | """ 257 | common.logger.debug('Console: {} {} {}'.format(message, line_number, source_id)) 258 | 259 | 260 | def shouldInterruptJavaScript(self): 261 | """Disable javascript interruption dialog box 262 | """ 263 | return True 264 | 265 | 266 | 267 | class Browser(QWebView): 268 | def __init__(self, gui=False, user_agent=None, proxy=None, load_images=True, load_javascript=True, load_java=True, load_plugins=True, timeout=20, delay=5, app=None, use_cache=False): 269 | """Widget class that contains the address bar, webview for rendering webpages, and a table for displaying results 270 | 271 | user_agent: the user-agent when downloading content 272 | proxy: a QNetworkProxy to download through 273 | load_images: whether to download images 274 | load_javascript: whether to enable javascript 275 | load_java: whether to enable java 276 | load_plugins: whether to enable browser plugins 277 | timeout: the maximum amount of seconds to wait for a request 278 | delay: the minimum amount of seconds to wait between requests 279 | app: QApplication object so that can instantiate multiple browser objects 280 | use_cache: whether to cache all replies 281 | """ 282 | # must instantiate the QApplication object before any other Qt objects 283 | self.app = app or QApplication(sys.argv) 284 | super(Browser, self).__init__() 285 | 286 | page = WebPage(user_agent or alg.rand_agent()) 287 | manager = NetworkAccessManager(proxy, use_cache) 288 | page.setNetworkAccessManager(manager) 289 | self.setPage(page) 290 | page.networkAccessManager().finished.connect(self.finished) 291 | # set whether to enable plugins, images, and java 292 | self.settings().setAttribute(QWebSettings.AutoLoadImages, load_images) 293 | self.settings().setAttribute(QWebSettings.JavascriptEnabled, load_javascript) 294 | self.settings().setAttribute(QWebSettings.JavaEnabled, load_java) 295 | self.settings().setAttribute(QWebSettings.PluginsEnabled, load_plugins) 296 | self.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True) 297 | self.timeout = timeout 298 | self.delay = delay 299 | if gui: 300 | self.showNormal() 301 | self.raise_() 302 | 303 | 304 | def __del__(self): 305 | self.setPage(None) 306 | 307 | 308 | def home(self): 309 | """Go back to initial page in history 310 | """ 311 | history = self.history() 312 | history.goToItem(history.itemAt(0)) 313 | 314 | 315 | def save(self): 316 | """Save the current HTML state to disk 317 | """ 318 | for i in itertools.count(1): 319 | filename = os.path.join(settings.state_dir, 'state{}.html'.format(i)) 320 | if not os.path.exists(filename): 321 | html = self.current_html() 322 
| open(filename, 'w').write(common.to_unicode(html)) 323 | print 'save', filename 324 | break 325 | 326 | 327 | def set_proxy(self, proxy): 328 | """Shortcut to set the proxy 329 | """ 330 | self.page().networkAccessManager().setProxy(proxy) 331 | 332 | 333 | def current_url(self): 334 | """Return current URL 335 | """ 336 | return str(self.url().toString()) 337 | 338 | 339 | def current_html(self): 340 | """Return current rendered HTML 341 | """ 342 | return common.to_unicode(unicode(self.page().mainFrame().toHtml())) 343 | 344 | 345 | def current_text(self): 346 | """Return text from the current rendered HTML 347 | """ 348 | return common.to_unicode(unicode(self.page().mainFrame().toPlainText())) 349 | 350 | 351 | def get(self, url, html=None, headers=None, data=None): 352 | """Load given url in webkit and return html when loaded 353 | 354 | url: the URL to load 355 | html: optional HTML to set instead of downloading 356 | headers: the headers to attach to the request 357 | data: the data to POST 358 | """ 359 | if isinstance(url, basestring): 360 | # convert string to Qt's URL object 361 | url = QUrl(url) 362 | if html: 363 | # load pre downloaded HTML 364 | self.setContent(html, baseUrl=url) 365 | return html 366 | 367 | t1 = time() 368 | loop = QEventLoop() 369 | self.loadFinished.connect(loop.quit) 370 | # need to make network request 371 | request = QNetworkRequest(url) 372 | if headers: 373 | # add headers to request when defined 374 | for header, value in headers: 375 | request.setRawHeader(header, value) 376 | fn = super(Browser, self) 377 | if data: 378 | # POST request 379 | fn.load(request, QNetworkAccessManager.PostOperation, data) 380 | else: 381 | # GET request 382 | fn.load(request) 383 | 384 | # set a timeout on the download loop 385 | timer = QTimer() 386 | timer.setSingleShot(True) 387 | timer.timeout.connect(loop.quit) 388 | timer.start(self.timeout * 1000) 389 | loop.exec_() # delay here until download finished or timeout 390 | 391 | if timer.isActive(): 392 | # downloaded successfully 393 | timer.stop() 394 | parsed_html = self.current_html() 395 | self.wait(self.delay - (time() - t1)) 396 | else: 397 | # did not download in time 398 | common.logger.debug('Timed out: {}'.format(url.toString())) 399 | parsed_html = '' 400 | return parsed_html 401 | 402 | 403 | def wait(self, timeout=1): 404 | """Wait for delay time 405 | """ 406 | deadline = time() + timeout 407 | while time() < deadline: 408 | sleep(0) 409 | self.app.processEvents() 410 | 411 | 412 | def wait_quiet(self, timeout=20): 413 | """Wait until all requests have completed up to a maximum timeout. 414 | Returns True if all requests complete before the timeout. 415 | """ 416 | self.wait() 417 | deadline = time() + timeout 418 | manager = self.page().networkAccessManager() 419 | while time() < deadline and manager.active_requests: 420 | sleep(0) 421 | self.app.processEvents() 422 | self.app.processEvents() 423 | return manager.active_requests == [] 424 | 425 | 426 | def wait_load(self, pattern, timeout=60): 427 | """Wait for this content to be loaded up to maximum timeout. 428 | Returns True if pattern was loaded before the timeout. 
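        A minimal sketch of typical use (the URL and selectors are hypothetical):

            browser = Browser(gui=False)
            browser.get('http://example.com/search')
            browser.fill('input[name=q]', 'web scraping')
            browser.click('button[type=submit]')
            if browser.wait_load('#results'):
                print browser.current_text()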
429 | """ 430 | deadline = time() + timeout 431 | while time() < deadline: 432 | sleep(0) 433 | self.app.processEvents() 434 | if self.find(pattern): 435 | return True 436 | return False 437 | 438 | 439 | def wait_steady(self, timeout=60): 440 | """Wait for the DOM to be steady, defined as no changes over a 1 second period 441 | Returns True if DOM is steady before timeout, else False 442 | """ 443 | deadline = time() + timeout 444 | while time() < deadline: 445 | orig_html = self.current_html() 446 | self.wait(1) 447 | cur_html = self.current_html() 448 | if orig_html == cur_html: 449 | return True # DOM is steady 450 | return False 451 | 452 | 453 | def js(self, script): 454 | """Shortcut to execute javascript on current document and return result 455 | """ 456 | self.app.processEvents() 457 | return self.page().mainFrame().evaluateJavaScript(script).toString() 458 | 459 | 460 | def click(self, pattern='input', native=False): 461 | """Click all elements that match the pattern. 462 | 463 | Uses standard CSS pattern matching: http://www.w3.org/TR/CSS2/selector.html 464 | Returns the number of elements clicked 465 | """ 466 | es = self.find(pattern) 467 | for e in es: 468 | if native: 469 | # get position of element 470 | e_pos = e.geometry().center() 471 | # scroll to element position 472 | self.page().mainFrame().setScrollPosition(e_pos) 473 | scr_pos = self.page().mainFrame().scrollPosition() 474 | point_to_click = e_pos - scr_pos 475 | # create click on absolute coordinates 476 | press = QMouseEvent(QMouseEvent.MouseButtonPress, point_to_click, Qt.LeftButton, Qt.LeftButton, Qt.NoModifier) 477 | release = QMouseEvent(QMouseEvent.MouseButtonRelease, point_to_click, Qt.LeftButton, Qt.LeftButton, Qt.NoModifier) 478 | QApplication.postEvent(self, press) 479 | QApplication.postEvent(self, release) 480 | else: 481 | self.click_by_user_event_simulation(e) 482 | return len(es) 483 | 484 | 485 | def keys(self, pattern, text, native=False, blur=False): 486 | """Simulate typing by focusing on elements that match the pattern and triggering key events. 487 | If native is True then will use GUI key event simulation, else JavaScript. 488 | If blur is True then will blur focus at the end of typing. 489 | Returns the number of elements matched. 490 | """ 491 | es = self.find(pattern) 492 | for e in es: 493 | if native: 494 | key_map = {'\t': Qt.Key_Tab, '\n': Qt.Key_Enter, 'DOWN': Qt.Key_Down, 'UP': Qt.Key_Up} 495 | self.click_by_gui_simulation(e) 496 | self.wait(0.1) 497 | for c in text: 498 | key = key_map.get(c, QKeySequence(c)[0]) 499 | press = QKeyEvent(QEvent.KeyPress, key, Qt.NoModifier) 500 | release = QKeyEvent(QEvent.KeyRelease, key, Qt.NoModifier) 501 | QApplication.postEvent(self, press) 502 | QApplication.postEvent(self, release) 503 | else: 504 | #e.evaluateJavaScript("this.focus()") 505 | #self.click_by_user_event_simulation(e) 506 | self.fill(pattern, text, es=[e]) 507 | for event_name in ('focus', 'keydown', 'change', 'keyup', 'keypress'): 508 | self.trigger_js_event(e, event_name) 509 | if blur: 510 | e.evaluateJavaScript("this.blur()") 511 | return len(es) 512 | 513 | 514 | def attr(self, pattern, name, value=None): 515 | """For the elements that match this pattern, set attribute if value is defined, else return the value. 
516 | """ 517 | if value is None: 518 | # want to get attribute 519 | return str(self.page().mainFrame().findFirstElement(pattern).attribute(name)) 520 | else: 521 | es = self.find(pattern) 522 | for e in es: 523 | e.setAttribute(name, value) 524 | return len(es) 525 | 526 | 527 | def fill(self, pattern, value, es=None): 528 | """Set text of the matching form elements to value, and return the number of elements matched. 529 | """ 530 | es = es or self.find(pattern) 531 | for e in es: 532 | tag = str(e.tagName()).lower() 533 | if tag == 'input' or tag == "select": 534 | e.evaluateJavaScript('this.value = "{}"'.format(value)) 535 | e.setAttribute('value', value) 536 | else: 537 | e.setPlainText(value) 538 | return len(es) 539 | 540 | 541 | def find(self, pattern): 542 | """Returns the elements matching this CSS pattern. 543 | """ 544 | if isinstance(pattern, basestring): 545 | matches = self.page().mainFrame().findAllElements(pattern).toList() 546 | elif isinstance(pattern, list): 547 | matches = pattern 548 | elif isinstance(pattern, QWebElement): 549 | matches = [pattern] 550 | else: 551 | common.logger.warning('Unknown pattern: ' + str(pattern)) 552 | matches = [] 553 | return matches 554 | 555 | 556 | def screenshot(self, output_file): 557 | """Take screenshot of current webpage and save results 558 | """ 559 | frame = self.page().mainFrame() 560 | self.page().setViewportSize(frame.contentsSize()) 561 | image = QImage(self.page().viewportSize(), QImage.Format_ARGB32) 562 | painter = QPainter(image) 563 | frame.render(painter) 564 | painter.end() 565 | common.logger.debug('saving: ' + output_file) 566 | image.save(output_file) 567 | 568 | 569 | def trigger_js_event(self, element, event_name): 570 | """Triggers a JavaScript level event on an element. 571 | 572 | Takes a QWebElement as input, and a string name of the event (e.g. "click"). 573 | 574 | Implementation is taken from Artemis: 575 | https://github.com/cs-au-dk/Artemis/blob/720f051c4afb4cd69e560f8658ebe29465c59362/artemis-code/src/runtime/input/forms/formfieldinjector.cpp#L294 576 | """ 577 | # TODO: Strictly we should create an appropriate event type as listed in: 578 | # https://developer.mozilla.org/en-US/docs/Web/Events 579 | # https://developer.mozilla.org/en-US/docs/Web/API/Document/createEvent#Notes 580 | # For now we use generic "Event". 581 | event_type = "Event"; 582 | event_init_method = "initEvent"; 583 | bubbles = "true"; 584 | cancellable = "true"; 585 | injection = "var event = document.createEvent('{}'); event.{}('{}', {}, {}); this.dispatchEvent(event);".format(event_type, event_init_method, event_name, bubbles, cancellable); 586 | element.evaluateJavaScript(injection); 587 | 588 | 589 | def click_by_user_event_simulation(self, element): 590 | """Uses JS-level events to simulate a full user click. 591 | 592 | Takes a QWebElement as input. 
593 | 594 | Implementation is taken from Artemis: 595 | https://github.com/cs-au-dk/Artemis/blob/720f051c4afb4cd69e560f8658ebe29465c59362/artemis-code/src/runtime/input/clicksimulator.cpp#L42 596 | """ 597 | self.trigger_js_event(element, "mouseover"); 598 | self.trigger_js_event(element, "mousemove"); 599 | self.trigger_js_event(element, "mousedown"); 600 | self.trigger_js_event(element, "focus"); 601 | self.trigger_js_event(element, "mouseup"); 602 | self.trigger_js_event(element, "click"); 603 | self.trigger_js_event(element, "mousemove"); 604 | self.trigger_js_event(element, "mouseout"); 605 | self.trigger_js_event(element, "blur"); 606 | 607 | 608 | def finished(self, reply): 609 | """Override the reply finished signal to check the result of each request 610 | """ 611 | pass 612 | 613 | 614 | 615 | if __name__ == '__main__': 616 | # initiate webkit and show gui 617 | # once script is working you can disable the gui 618 | w = Browser(gui=True) 619 | # load webpage 620 | w.get('http://duckduckgo.com') 621 | # fill search textbox 622 | w.fill('input[id=search_form_input_homepage]', 'web scraping') 623 | # take screenshot of webpage 624 | w.screenshot('duckduckgo.jpg') 625 | # click search button 626 | w.click('input[id=search_button_homepage]') 627 | # show webpage for 10 seconds 628 | w.wait(10) 629 | -------------------------------------------------------------------------------- /xpath.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module implements a subset of the XPath standard: 3 | - tags 4 | - indices 5 | - attributes 6 | - descendants 7 | 8 | This was created because I needed a pure Python XPath parser. 9 | 10 | Generally XPath solutions will normalize the HTML into XHTML before selecting nodes. 11 | However this module tries to navigate the HTML structure directly without normalizing by searching for the next closing tag. 12 | """ 13 | 14 | #TODO: 15 | # - parent 16 | # - search by text: text() == '...' 17 | # - return xpath for most similar to text 18 | # - multiple filters for a tag 19 | 20 | import itertools, re, sys, urllib, urllib2, urlparse 21 | from optparse import OptionParser 22 | import adt, common, settings 23 | 24 | 25 | class Doc: 26 | """Wrapper around a parsed webpage 27 | 28 | html: 29 | The content of webpage to parse 30 | remove: 31 | A list of tags to remove 32 | 33 | >>> doc = Doc('

<div><a class="link">LINK 1</a><div><a>LINK 2</a></div></div><div><a>LINK 3</a></div>ghi') 34 | >>> doc.search('/div/a') 35 | ['LINK 1', 'LINK 3'] 36 | >>> doc.search('/div/a[@class="link"]') 37 | ['LINK 1'] 38 | >>> doc.search('/div[1]//a') 39 | ['LINK 1', 'LINK 2'] 40 | >>> doc.search('/div/a/@class') 41 | ['link', ''] 42 | >>> doc.search('/div[-1]/a') 43 | ['LINK 3'] 44 | 45 | >>> # test searching unicode 46 | >>> doc = Doc(u'<a class="flink">google</a>') 47 | >>> doc.get('//a[@class="flink"]') 48 | u'google' 49 | 50 | >>> # test finding just the first instance for a large amount of content 51 | >>> doc = Doc('
<div><span>content</span></div>
' * 10000) 52 | >>> doc.get('//span') 53 | 'content' 54 | 55 | >>> # test extracting attribute of self closing tag 56 | >>> Doc('
<div><img src="img.png"/></div>').get('/div/img/@src') 57 | 'img.png' 58 | 59 | >>> # test extracting attribute after self closing tag 60 | >>> Doc('

<div><img src="img.png"/><p>content</p></div>


').get('/div/p') 61 | 'content' 62 | """ 63 | 64 | # regex to find a tag 65 | _tag_regex = re.compile('<([\w\:]+)') 66 | # regex to find an attribute 67 | _attributes_regex = re.compile('([\w\:-]+)\s*=\s*(".*?"|\'.*?\'|\S+)', re.DOTALL) 68 | # regex to find content of a tag 69 | _content_regex = re.compile('<.*?>(.*)$', re.DOTALL) 70 | 71 | 72 | def __init__(self, html, remove=None): 73 | #self.html = self._clean(html, remove) 74 | self.html = html 75 | self.num_searches = 0 76 | 77 | def get(self, xpath): 78 | """Return the first result from this XPath selection 79 | """ 80 | results = self._xpath(self.parse(xpath), self.html, limit=1) 81 | return common.first(results) 82 | 83 | def search(self, xpath): 84 | """Return all results from this XPath selection 85 | """ 86 | return self._xpath(self.parse(xpath), self.html, limit=sys.maxint) 87 | 88 | 89 | def _xpath(self, path, html, limit): 90 | """Recursively search HTML for content at XPath 91 | """ 92 | counter, separator, tag, index, attributes = path.pop(0) 93 | if counter == 0: 94 | self.num_searches += 1 95 | 96 | results = [] 97 | if tag == '..': 98 | # parent 99 | raise common.WebScrapingError('.. not yet supported') 100 | results.append(self.get_parent(html)) 101 | elif tag == 'text()': 102 | # extract child text 103 | text = self._get_content(self._get_html(html)) 104 | results.append(common.remove_tags(text, keep_children=False)) 105 | # check if next tag is selecting attribute 106 | elif tag.startswith('@'): 107 | attr = tag[1:].lower() 108 | #parent = self.get_parent(context) 109 | value = self._get_attributes(html).get(attr, '') 110 | results.append(value) 111 | else: 112 | # have tag 113 | if counter > 0: 114 | # get child html when not at root 115 | html = self._get_content(html) 116 | 117 | # search direct children if / and all descendants if // 118 | search_fn = self._find_children if separator == '' else self._find_descendants 119 | matches = search_fn(html, tag) 120 | 121 | # support negative indices 122 | if index is not None and index < 0: 123 | matches = list(matches) 124 | index += len(matches) + 1 125 | 126 | for child_i, child in enumerate(matches): 127 | # check if matches index 128 | if index is None or index == child_i + 1: 129 | # check if matches attributes 130 | if not attributes or self._match_attributes(attributes, self._get_attributes(child)): 131 | if path: 132 | results.extend(self._xpath(path[:], child, limit)) 133 | else: 134 | # final node 135 | results.append(self._get_content(child)) 136 | if len(results) > limit: 137 | break 138 | 139 | #if not children: 140 | # attributes_s = attributes and ''.join('[@%s="%s"]' % a for a in attributes) or '' 141 | # common.logger.debug('No matches for <%s%s%s> (tag %d)' % (tag, index and '[%d]' % index or '', attributes_s, tag_i + 1)) 142 | return results 143 | 144 | 145 | 146 | def _clean(self, html, remove): 147 | """Remove specified unhelpful tags and comments 148 | """ 149 | self.remove = remove 150 | html = re.compile('', re.DOTALL).sub('', html) # remove comments 151 | if remove: 152 | # XXX combine tag list into single regex, if can match same at start and end 153 | for tag in remove: 154 | html = re.compile('<' + tag + '[^>]*?/>', re.DOTALL | re.IGNORECASE).sub('', html) 155 | html = re.compile('<' + tag + '[^>]*?>.*?', re.DOTALL | re.IGNORECASE).sub('', html) 156 | html = re.compile('<' + tag + '[^>]*?>', re.DOTALL | re.IGNORECASE).sub('', html) 157 | return html 158 | 159 | 160 | def parse(self, xpath): 161 | """Parse the xpath into: counter, separator, 
tag, index, and attributes 162 | 163 | >>> doc = Doc('') 164 | >>> doc.parse('/div[1]//span[@class="text"]') 165 | [(0, '', 'div', 1, []), (1, '/', 'span', None, [('class', 'text')])] 166 | >>> doc.parse('//li[-2]') 167 | [(0, '/', 'li', -2, [])] 168 | >>> doc.parse('//option[@selected]') 169 | [(0, '/', 'option', None, [('selected', None)])] 170 | >>> doc.parse('/div[@id="content"]//span[1][@class="text"][@title=""]/a') 171 | [(0, '', 'div', None, [('id', 'content')]), (1, '/', 'span', 1, [('class', 'text'), ('title', '')]), (2, '', 'a', None, [])] 172 | """ 173 | tokens = [] 174 | counter = 0 175 | for separator, token in re.compile('(|/|\.\.)/([^/]+)').findall(xpath): 176 | index, attributes = None, [] 177 | if '[' in token: 178 | tag = token[:token.find('[')] 179 | for attribute in re.compile('\[(.*?)\]').findall(token): 180 | try: 181 | index = int(attribute) 182 | except ValueError: 183 | match = re.compile('@(.*?)=["\']?(.*?)["\']?$').search(attribute) 184 | if match: 185 | key, value = match.groups() 186 | attributes.append((key.lower(), value.lower())) 187 | else: 188 | match = re.compile('@(.*?)$').search(attribute) 189 | if match: 190 | attributes.append((match.groups()[0].lower(), None)) 191 | else: 192 | raise common.WebScrapingError('Unknown format: ' + attribute) 193 | else: 194 | tag = token 195 | tokens.append((counter, separator, tag, index, attributes)) 196 | counter += 1 197 | return tokens 198 | 199 | 200 | def _get_attributes(self, html): 201 | """Extract the attributes of the passed HTML tag 202 | 203 | >>> doc = Doc('') 204 | >>> doc._get_attributes('
content SPAN
') 205 | {'max-width': '20', 'class': 'abc', 'id': 'ID', 'name': 'MY NAME'} 206 | >>> doc._get_attributes('') 207 | {'width': '200', 'class': 'textelien', 'valign': 'top'} 208 | >>> doc._get_attributes('
abc
') 347 | '
abc
' 348 | >>> doc._jump_next_tag('
abc
') 349 | '
abc
' 350 | """ 351 | while 1: 352 | match = Doc._tag_regex.search(html) 353 | if match: 354 | return html[match.start():] 355 | else: 356 | return None 357 | 358 | 359 | def _get_tag(self, html): 360 | """Find tag type at this location 361 | 362 | >>> doc = Doc('') 363 | >>> doc._get_tag('
<div>abc</div>
') 364 | 'div' 365 | >>> doc._get_tag('
') 366 | >>> doc._get_tag('div') 367 | """ 368 | match = Doc._tag_regex.match(html) 369 | if match: 370 | return match.groups()[0] 371 | else: 372 | return None 373 | 374 | 375 | def _split_tag(self, html): 376 | """Extract starting tag and contents from HTML 377 | 378 | >>> doc = Doc('') 379 | >>> doc._split_tag('
abc
def
abc
ghi
jkl
') 380 | ('
abc
def
abc
', 'ghi
jkl
') 381 | >>> doc._split_tag('
abc
') 382 | ('
', '
abc
') 383 | >>> doc._split_tag('
abc
def
abc') 384 | ('
abc
def
abc
', '') 385 | >>> # test efficiency of splits 386 | >>> a = [doc._split_tag('
abc
def
abc') for i in range(10000)] 387 | """ 388 | i = None 389 | tag = self._get_tag(html) 390 | depth = 0 # how far nested 391 | for match in re.compile('' % tag, re.DOTALL | re.IGNORECASE).finditer(html): 392 | if html[match.start() + 1] == '/': 393 | depth -= 1 # found closing tag 394 | elif tag in common.EMPTY_TAGS: 395 | pass # this tag type does not close 396 | elif html[match.end() - 2] == '/': 397 | pass # tag starts and ends (eg
) 398 | else: 399 | depth += 1 # found opening tag 400 | if depth == 0: 401 | # found top level match 402 | i = match.end() 403 | break 404 | if i is None: 405 | # all html is within this tag 406 | return html + '</%s>' % tag, '' 407 | else: 408 | return html[:i], html[i:] 409 | 410 | 411 | def _parent_tag(self, html): 412 | """Find parent tag of this current tag 413 | 414 | >> doc = Doc('

<div>empty</div>

') 415 | >> doc._parent_tag('empty') 416 | '
<div>empty</div>
' 417 | >> doc = Doc('

<div>empty</div>
') 418 | >> doc._parent_tag('empty') 419 | '

<div>empty</div>
' 420 | """ 421 | raise Exception('Not implemented') 422 | #index = self.html.find(html) 423 | #while index >= 0: 424 | # index = self.html.rfind('<', start=0, end=index) 425 | 426 | 427 | try: 428 | import lxml.html 429 | except ImportError: 430 | class Tree: 431 | def __init__(*args, **kwargs): 432 | raise ImportError('lxml not installed') 433 | else: 434 | # if lxml is supported create wrapper 435 | class Tree: 436 | def __init__(self, html, **kwargs): 437 | if isinstance(html, lxml.html.HtmlElement): 438 | # input is already a passed lxml tree 439 | self.doc = html 440 | else: 441 | try: 442 | self.doc = lxml.html.fromstring(html) 443 | except lxml.etree.LxmlError: 444 | self.doc = None 445 | 446 | def __eq__(self, html): 447 | return self.orig_html is html 448 | 449 | 450 | def xpath(self, path): 451 | return [] if self.doc is None else self.doc.xpath(path) 452 | 453 | def get(self, path): 454 | es = self.xpath(path) 455 | if es: 456 | return self.tostring(es[0]) 457 | return '' 458 | 459 | def search(self, path): 460 | return [self.tostring(e) for e in self.xpath(path)] 461 | 462 | def tostring(self, node): 463 | try: 464 | parts = [node.text] + [unicode(c) if isinstance(c, basestring) else lxml.etree.tostring(c) for c in node] + [node.tail] 465 | return ''.join(filter(None, parts)) or str(node) 466 | except AttributeError: 467 | return unicode(node) 468 | 469 | 470 | def get(html, xpath, remove=None): 471 | """Return first element from XPath search of HTML 472 | """ 473 | return Doc(html, remove=remove).get(xpath) 474 | 475 | def search(html, xpath, remove=None): 476 | """Return all elements from XPath search of HTML 477 | """ 478 | return Doc(html, remove=remove).search(xpath) 479 | 480 | def find_children(html, tag, remove=None): 481 | """Find children with this tag type 482 | """ 483 | return Doc(html, remove=remove)._find_children(html, tag) 484 | 485 | 486 | 487 | class Form: 488 | """Helper class for filling and submitting forms 489 | """ 490 | def __init__(self, form): 491 | self.data = {} 492 | for input_name, input_value in zip(search(form, '//input/@name'), search(form, '//input/@value')): 493 | self.data[input_name] = input_value 494 | for text_name, text_value in zip(search(form, '//textarea/@name'), search(form, '//textarea')): 495 | self.data[text_name] = text_value 496 | for select_name, select_contents in zip(search(form, '//select/@name'), search(form, '//select')): 497 | self.data[select_name] = get(select_contents, '/option[@selected]/@value') 498 | if '' in self.data: 499 | del self.data[''] 500 | 501 | def __getitem__(self, key): 502 | return self.data[key] 503 | 504 | def __setitem__(self, key, value): 505 | self.data[key] = value 506 | 507 | def __str__(self): 508 | return urllib.urlencode(self.data) 509 | 510 | def submit(self, D, action, **argv): 511 | return D.get(url=action, data=self.data, **argv) 512 | 513 | 514 | 515 | js_re = re.compile('location.href ?= ?[\'"](.*?)[\'"]') 516 | def get_links(html, url=None, local=True, external=True): 517 | """Return all links from html and convert relative to absolute if source url is provided 518 | 519 | html: 520 | HTML to parse 521 | url: 522 | optional URL for determining path of relative links 523 | local: 524 | whether to include links from same domain 525 | external: 526 | whether to include linkes from other domains 527 | """ 528 | def normalize_link(link): 529 | if urlparse.urlsplit(link).scheme in ('http', 'https', ''): 530 | if '#' in link: 531 | link = link[:link.index('#')] 532 | if url: 533 | link = 
urlparse.urljoin(url, link) 534 | if not local and common.same_domain(url, link): 535 | # local links not included 536 | link = None 537 | if not external and not common.same_domain(url, link): 538 | # external links not included 539 | link = None 540 | else: 541 | link = None # ignore mailto, etc 542 | return link 543 | a_links = search(html, '//a/@href') 544 | i_links = search(html, '//iframe/@src') 545 | js_links = js_re.findall(html) 546 | links = [] 547 | for link in a_links + i_links + js_links: 548 | try: 549 | link = normalize_link(link) 550 | except UnicodeError: 551 | pass 552 | else: 553 | if link and link not in links: 554 | links.append(link) 555 | return links 556 | --------------------------------------------------------------------------------
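Usage note for the xpath module above: get returns the content of the first match, search returns all matches as a list, get_links resolves page links against a base URL, and Form collects input values ready for submission. The following is a minimal usage sketch, assuming Python 2 and that the package is importable as webscraping (a bare "import xpath" from a checkout works the same); the sample HTML, the example.com URL and the printed outputs are illustrative only, not taken from the library's own tests.

from webscraping import xpath

html = """<html><body>
<div id="content"><span class="text">hello</span> <span class="text">world</span></div>
<a href="/about">About us</a>
<form><input name="q" value="" /><input name="lang" value="en" /></form>
</body></html>"""

print xpath.get(html, '/html/body/div[@id="content"]/span[1]')     # first match only: hello
print xpath.search(html, '//span[@class="text"]')                  # every match: ['hello', 'world']
print xpath.search(html, '//a/@href')                               # attribute selection: ['/about']
print xpath.get_links(html, url='http://example.com/index.html')    # ['http://example.com/about']

form = xpath.Form(html)             # collects each <input> name/value found in the HTML
form['q'] = 'web scraping'          # override a field before submitting
print form                           # urlencoded data, e.g. q=web+scraping&lang=en (field order may vary)

Form.submit(D, action) simply calls D.get(url=action, data=form.data), so any client object exposing that interface can be passed in.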