')
606 | parser.add_option('-k', '--key', dest='key', help='The key to use')
607 | parser.add_option('-v', '--value', dest='value', help='The value to store')
608 | parser.add_option('-b', '--browser', action='store_true', dest='browser', default=False, help='View content of this key in a web browser')
609 | parser.add_option('-c', '--clear', action='store_true', dest='clear', default=False, help='Clear all data for this cache')
610 | parser.add_option('-s', '--size', action='store_true', dest='size', default=False, help='Display size of database')
611 | options, args = parser.parse_args()
612 | if not args:
613 | parser.error('Must specify the cache file')
614 | cache = PersistentDict(args[0])
615 |
616 | if options.value:
617 | # store this value
618 | if options.key:
619 | cache[options.key] = options.value
620 | else:
621 | parser.error('Must specify the key')
622 | elif options.browser:
623 | if options.key:
624 | value = cache[options.key]
625 | # use delete=False so the file still exists when the browser opens it
626 | fp = tempfile.NamedTemporaryFile(suffix='.html', delete=False)
627 | fp.write(str(value))
628 | fp.close()
629 | webbrowser.open(fp.name)
630 | else:
631 | parser.error('Must specify the key')
632 | elif options.key:
633 | print cache[options.key]
634 | elif options.clear:
635 | if raw_input('Really? Clear the cache? (y/n) ') == 'y':
636 | cache.clear()
637 | print 'cleared'
638 | elif options.size:
639 | print len(cache)
640 | else:
641 | parser.error('No options selected')
642 |
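643 | # Example invocations of this CLI, assuming the module is run directly
644 | # (the cache filename and key below are illustrative only):
645 | #   python pdict.py cache.db --key http://example.com --value "some content"
646 | #   python pdict.py cache.db --key http://example.com --browser
647 | #   python pdict.py cache.db --size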
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
1 | __doc__ = 'default application wide settings'
2 |
3 | import sys
4 | import os
5 | import logging
6 |
7 |
8 | # default location to store output state files
9 | dirname, filename = os.path.split(sys.argv[0])
10 | state_dir = os.path.join(dirname, '.' + filename.replace('.py', ''))
11 | if not os.path.exists(state_dir):
12 | try:
13 | os.mkdir(state_dir)
14 | except OSError as e:
15 | state_dir = ''
16 | #print 'Unable to create state directory:', e
17 | cache_file = os.path.relpath(os.path.join(state_dir, 'cache.db')) # file to use for pdict cache
18 | queue_file = os.path.relpath(os.path.join(state_dir, 'queue.db')) # file to use for pdict queue
19 | status_file = os.path.join(state_dir, 'status.js') # where to store state of crawl
20 | log_file = os.path.join(state_dir, 'webscraping.log') # default logging file
21 |
22 | log_level = logging.INFO # logging level
23 | default_encoding = 'utf-8'
24 | default_headers = {
25 | 'Referer': '',
26 | 'Accept-Language': 'en-us,en;q=0.5',
27 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
28 | }
29 |
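30 | # Sketch of how these defaults are typically consumed from another module
31 | # in this package (the basicConfig call is illustrative, not from this repo):
32 | #   from webscraping import settings
33 | #   logging.basicConfig(filename=settings.log_file, level=settings.log_level)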
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from distutils.core import setup
3 |
4 | def read(filename):
5 | return open(os.path.join(os.path.dirname(__file__), filename)).read()
6 |
7 | setup(
8 | name='webscraping',
9 | version='1.7.1',
10 | packages=['webscraping'],
11 | package_dir={'webscraping':'.'}, # look for package contents in current directory
12 | author='Richard Penman',
13 | author_email='richard@webscraping.com',
14 | description='Pure Python library aimed at making web scraping easier',
15 | long_description=read('README.rst'),
16 | url='https://github.com/richardpenman/webscraping',
17 | classifiers = [
18 | 'Environment :: Web Environment',
19 | 'Intended Audience :: Developers',
20 | 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)',
21 | 'Operating System :: OS Independent',
22 | 'Programming Language :: Python',
23 | 'Topic :: Internet :: WWW/HTTP'
24 | ],
25 | license='lgpl'
26 | )
27 |
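28 | # Standard distutils commands to build a source distribution and install:
29 | #   python setup.py sdist
30 | #   python setup.py install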
--------------------------------------------------------------------------------
/webkit.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | __doc__ = 'Interface to qt webkit for loading and interacting with JavaScript dependent webpages'
4 |
5 | import sys, os, re, urllib2, random, itertools, json
6 | from time import time, sleep
7 | from datetime import datetime
8 |
9 | # for using native Python strings
10 | import sip
11 | sip.setapi('QString', 2)
12 | from PyQt4.QtGui import QApplication, QDesktopServices, QImage, QPainter, QMouseEvent, QKeyEvent, QKeySequence
13 | from PyQt4.QtCore import Qt, QByteArray, QUrl, QTimer, QEventLoop, QIODevice, QObject, QPoint, QEvent
14 | from PyQt4.QtWebKit import QWebFrame, QWebView, QWebElement, QWebPage, QWebSettings, QWebInspector
15 | from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkProxy, QNetworkRequest, QNetworkReply, QNetworkDiskCache
16 |
17 | # maximum number of bytes to read from a POST request
18 | MAX_POST_SIZE = 2 ** 25  # 32MB
19 |
20 | import alg, common, pdict, settings
21 |
22 |
23 | class NetworkAccessManager(QNetworkAccessManager):
24 | def __init__(self, proxy, use_cache):
25 | """Subclass QNetworkAccessManager for finer control network operations
26 |
27 | proxy: the string of a proxy to download through
28 | use_cache: whether to cache replies so that subsequent requests for the same content load faster
29 | """
30 | super(NetworkAccessManager, self).__init__()
31 | self.setProxy(proxy)
32 | self.sslErrors.connect(self.sslErrorHandler)
33 | # the requests that are still active
34 | self.active_requests = []
35 | self.cache = pdict.PersistentDict(settings.cache_file) if use_cache else None
36 |
37 |
38 | def shutdown(self):
39 | """Network is shutting down event
40 | """
41 | # prevent new requests
42 | self.setNetworkAccessible(QNetworkAccessManager.NotAccessible)
43 | # abort existing requests
44 | for request in self.active_requests:
45 | request.abort()
46 | request.deleteLater()
47 |
48 |
49 | def setProxy(self, proxy):
50 | """Parse proxy components from proxy
51 | """
52 | if proxy:
53 | fragments = common.parse_proxy(proxy)
54 | if fragments['host']:
55 | QNetworkAccessManager.setProxy(self,
56 | QNetworkProxy(QNetworkProxy.HttpProxy,
57 | fragments['host'], int(fragments['port']),
58 | fragments['username'], fragments['password']
59 | )
60 | )
61 | else:
62 | common.logger.info('Invalid proxy: ' + str(proxy))
63 |
64 |
65 | def createRequest(self, operation, request, post):
66 | """Override creating a network request
67 | """
68 | url = request.url().toString()
69 | if str(request.url().path()).endswith('.ttf'):
70 | # block fonts, which can cause webkit to crash
71 | common.logger.debug(u'Blocking: {}'.format(url))
72 | request.setUrl(QUrl())
73 |
74 | data = post if post is None else post.peek(MAX_POST_SIZE)
75 | key = u'{} {}'.format(url, data)
76 | use_cache = not url.startswith('file')
77 | if self.cache is not None and use_cache and key in self.cache:
78 | common.logger.debug(u'Load from cache: {}'.format(key))
79 | content, headers, attributes = self.cache[key]
80 | reply = CachedNetworkReply(self, request.url(), content, headers, attributes)
81 | else:
82 | common.logger.debug(u'Request: {} {}'.format(url, post or ''))
83 | reply = QNetworkAccessManager.createRequest(self, operation, request, post)
84 | reply.error.connect(self.catch_error)
85 | self.active_requests.append(reply)
86 | reply.destroyed.connect(self.active_requests.remove)
87 | # save reference to original request
88 | reply.content = QByteArray()
89 | reply.readyRead.connect(self._save_content(reply))
90 | if self.cache is not None and use_cache:
91 | reply.finished.connect(self._cache_content(reply, key))
92 | reply.orig_request = request
93 | reply.data = self.parse_data(data)
94 | return reply
95 |
96 |
97 | def _save_content(self, r):
98 | """Save a copy of the reply content before it is lost
99 | """
100 | def save_content():
101 | r.content.append(r.peek(r.size()))  # peek() does not consume the buffer, so webkit can still read it
102 | return save_content
103 |
104 | def _cache_content(self, r, key):
105 | """Cache downloaded content
106 | """
107 | def cache_content():
108 | headers = [(header, r.rawHeader(header)) for header in r.rawHeaderList()]
109 | attributes = []
110 | attributes.append((QNetworkRequest.HttpStatusCodeAttribute, r.attribute(QNetworkRequest.HttpStatusCodeAttribute).toInt()))
111 | attributes.append((QNetworkRequest.HttpReasonPhraseAttribute, r.attribute(QNetworkRequest.HttpReasonPhraseAttribute).toByteArray()))
112 | #attributes.append((QNetworkRequest.RedirectionTargetAttribute, r.attribute(QNetworkRequest.RedirectionTargetAttribute).toUrl()))
113 | attributes.append((QNetworkRequest.ConnectionEncryptedAttribute, r.attribute(QNetworkRequest.ConnectionEncryptedAttribute).toBool()))
114 | #attributes.append((QNetworkRequest.CacheLoadControlAttribute, r.attribute(QNetworkRequest.CacheLoadControlAttribute).toInt()))
115 | #attributes.append((QNetworkRequest.CacheSaveControlAttribute, r.attribute(QNetworkRequest.CacheSaveControlAttribute).toBool()))
116 | #attributes.append((QNetworkRequest.SourceIsFromCacheAttribute, r.attribute(QNetworkRequest.SourceIsFromCacheAttribute).toBool()))
117 | #print 'save cache:', key, len(r.content), len(headers), attributes
118 | self.cache[key] = r.content, headers, attributes
119 | return cache_content
120 |
121 |
122 | def parse_data(self, data):
123 | """Parse this posted data into a list of key/value pairs
124 | """
125 | if data is None:
126 | result = []
127 | else:
128 | try:
129 | result = json.loads(unicode(data))
130 | if isinstance(result, dict):
131 | result = result.items()
132 | if not isinstance(result, list):
133 | common.logger.info(u'Unexpected data format: {}'.format(result))
134 | result = []
135 | except ValueError:
136 | url = QUrl('')
137 | url.setEncodedQuery(data)
138 | result = url.queryItems()
139 | return result
140 |
141 |
142 | def catch_error(self, eid):
143 | """Interpret the HTTP error ID received
144 | """
145 | if eid not in (5, 301):
146 | errors = {
147 | 0 : 'no error condition. Note: When the HTTP protocol returns a redirect no error will be reported. You can check if there is a redirect with the QNetworkRequest::RedirectionTargetAttribute attribute.',
148 | 1 : 'the remote server refused the connection (the server is not accepting requests)',
149 | 2 : 'the remote server closed the connection prematurely, before the entire reply was received and processed',
150 | 3 : 'the remote host name was not found (invalid hostname)',
151 | 4 : 'the connection to the remote server timed out',
152 | 5 : 'the operation was canceled via calls to abort() or close() before it was finished.',
153 | 6 : 'the SSL/TLS handshake failed and the encrypted channel could not be established. The sslErrors() signal should have been emitted.',
154 | 7 : 'the connection was broken due to disconnection from the network, however the system has initiated roaming to another access point. The request should be resubmitted and will be processed as soon as the connection is re-established.',
155 | 101 : 'the connection to the proxy server was refused (the proxy server is not accepting requests)',
156 | 102 : 'the proxy server closed the connection prematurely, before the entire reply was received and processed',
157 | 103 : 'the proxy host name was not found (invalid proxy hostname)',
158 | 104 : 'the connection to the proxy timed out or the proxy did not reply in time to the request sent',
159 | 105 : 'the proxy requires authentication in order to honour the request but did not accept any credentials offered (if any)',
160 | 201 : 'the access to the remote content was denied (similar to HTTP error 401)',
161 | 202 : 'the operation requested on the remote content is not permitted',
162 | 203 : 'the remote content was not found at the server (similar to HTTP error 404)',
163 | 204 : 'the remote server requires authentication to serve the content but the credentials provided were not accepted (if any)',
164 | 205 : 'the request needed to be sent again, but this failed for example because the upload data could not be read a second time.',
165 | 301 : 'the Network Access API cannot honor the request because the protocol is not known',
166 | 302 : 'the requested operation is invalid for this protocol',
167 | 99 : 'an unknown network-related error was detected',
168 | 199 : 'an unknown proxy-related error was detected',
169 | 299 : 'an unknown error related to the remote content was detected',
170 | 399 : 'a breakdown in protocol was detected (parsing error, invalid or unexpected responses, etc.)',
171 | }
172 | common.logger.debug('Error %d: %s (%s)' % (eid, errors.get(eid, 'unknown error'), self.sender().url().toString()))
173 |
174 |
175 | def sslErrorHandler(self, reply, errors):
176 | common.logger.info('SSL errors: {}'.format(errors))
177 | reply.ignoreSslErrors()
178 |
179 |
180 |
181 | class CachedNetworkReply(QNetworkReply):
182 | def __init__(self, parent, url, content, headers, attributes):
183 | super(CachedNetworkReply, self).__init__(parent)
184 | self.setUrl(url)
185 | self.content = content
186 | self.offset = 0
187 | for header, value in headers:
188 | self.setRawHeader(header, value)
189 | #self.setHeader(QNetworkRequest.ContentLengthHeader, len(content))
190 | for attribute, value in attributes:
191 | self.setAttribute(attribute, value)
192 | self.setOpenMode(QNetworkReply.ReadOnly | QNetworkReply.Unbuffered)
193 | # trigger signals that content is ready
194 | QTimer.singleShot(0, self.readyRead)
195 | QTimer.singleShot(0, self.finished)
196 |
197 | def bytesAvailable(self):
198 | return len(self.content) - self.offset
199 |
200 | def isSequential(self):
201 | return True
202 |
203 | def abort(self):
204 | pass # qt requires that this be defined
205 |
206 | def readData(self, size):
207 | """Return up to size bytes from buffer
208 | """
209 | if self.offset >= len(self.content):
210 | return ''
211 | number = min(size, len(self.content) - self.offset)
212 | data = self.content[self.offset : self.offset + number]
213 | self.offset += number
214 | return str(data)
215 |
216 |
217 |
218 | class WebPage(QWebPage):
219 | def __init__(self, user_agent, confirm=True):
220 | """Override QWebPage to set User-Agent and JavaScript messages
221 |
222 | user_agent: the User Agent to submit
223 | confirm: default response to confirm dialog boxes
224 | """
225 | super(WebPage, self).__init__()
226 | self.user_agent = user_agent
227 | self.confirm = confirm
228 | self.setForwardUnsupportedContent(True)
229 |
230 | def userAgentForUrl(self, url):
231 | """Use same user agent for all URL's
232 | """
233 | return self.user_agent
234 |
235 | def javaScriptAlert(self, frame, message):
236 | """Override default JavaScript alert popup and send to log
237 | """
238 | common.logger.debug('Alert: ' + message)
239 |
240 |
241 | def javaScriptConfirm(self, frame, message):
242 | """Override default JavaScript confirm popup and send to log
243 | """
244 | common.logger.debug('Confirm: ' + message)
245 | return self.confirm
246 |
247 |
248 | def javaScriptPrompt(self, frame, message, default):
249 | """Override default JavaScript prompt popup and send to log
250 | """
251 | common.logger.debug('Prompt: {} {}'.format(message, default))
252 |
253 |
254 | def javaScriptConsoleMessage(self, message, line_number, source_id):
255 | """Override default JavaScript console and send to log
256 | """
257 | common.logger.debug('Console: {} {} {}'.format(message, line_number, source_id))
258 |
259 |
260 | def shouldInterruptJavaScript(self):
261 | """Disable javascript interruption dialog box
262 | """
263 | return True
264 |
265 |
266 |
267 | class Browser(QWebView):
268 | def __init__(self, gui=False, user_agent=None, proxy=None, load_images=True, load_javascript=True, load_java=True, load_plugins=True, timeout=20, delay=5, app=None, use_cache=False):
269 | """Widget class that contains the address bar, webview for rendering webpages, and a table for displaying results
270 |
271 | user_agent: the user-agent when downloading content
272 | proxy: a proxy string to download through, parsed by NetworkAccessManager
273 | load_images: whether to download images
274 | load_javascript: whether to enable javascript
275 | load_java: whether to enable java
276 | load_plugins: whether to enable browser plugins
277 | timeout: the maximum amount of seconds to wait for a request
278 | delay: the minimum amount of seconds to wait between requests
279 | app: QApplication object, which allows instantiating multiple Browser objects
280 | use_cache: whether to cache all replies
281 | """
282 | # must instantiate the QApplication object before any other Qt objects
283 | self.app = app or QApplication(sys.argv)
284 | super(Browser, self).__init__()
285 |
286 | page = WebPage(user_agent or alg.rand_agent())
287 | manager = NetworkAccessManager(proxy, use_cache)
288 | page.setNetworkAccessManager(manager)
289 | self.setPage(page)
290 | page.networkAccessManager().finished.connect(self.finished)
291 | # set whether to enable plugins, images, and java
292 | self.settings().setAttribute(QWebSettings.AutoLoadImages, load_images)
293 | self.settings().setAttribute(QWebSettings.JavascriptEnabled, load_javascript)
294 | self.settings().setAttribute(QWebSettings.JavaEnabled, load_java)
295 | self.settings().setAttribute(QWebSettings.PluginsEnabled, load_plugins)
296 | self.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
297 | self.timeout = timeout
298 | self.delay = delay
299 | if gui:
300 | self.showNormal()
301 | self.raise_()
302 |
303 |
304 | def __del__(self):
305 | self.setPage(None)
306 |
307 |
308 | def home(self):
309 | """Go back to initial page in history
310 | """
311 | history = self.history()
312 | history.goToItem(history.itemAt(0))
313 |
314 |
315 | def save(self):
316 | """Save the current HTML state to disk
317 | """
318 | for i in itertools.count(1):
319 | filename = os.path.join(settings.state_dir, 'state{}.html'.format(i))
320 | if not os.path.exists(filename):
321 | html = self.current_html()
322 | open(filename, 'w').write(common.to_unicode(html).encode(settings.default_encoding))
323 | print 'save', filename
324 | break
325 |
326 |
327 | def set_proxy(self, proxy):
328 | """Shortcut to set the proxy
329 | """
330 | self.page().networkAccessManager().setProxy(proxy)
331 |
332 |
333 | def current_url(self):
334 | """Return current URL
335 | """
336 | return str(self.url().toString())
337 |
338 |
339 | def current_html(self):
340 | """Return current rendered HTML
341 | """
342 | return common.to_unicode(unicode(self.page().mainFrame().toHtml()))
343 |
344 |
345 | def current_text(self):
346 | """Return text from the current rendered HTML
347 | """
348 | return common.to_unicode(unicode(self.page().mainFrame().toPlainText()))
349 |
350 |
351 | def get(self, url, html=None, headers=None, data=None):
352 | """Load given url in webkit and return html when loaded
353 |
354 | url: the URL to load
355 | html: optional HTML to set instead of downloading
356 | headers: the headers to attach to the request
357 | data: the data to POST
358 | """
359 | if isinstance(url, basestring):
360 | # convert string to Qt's URL object
361 | url = QUrl(url)
362 | if html:
363 | # load pre downloaded HTML
364 | self.setContent(html, baseUrl=url)
365 | return html
366 |
367 | t1 = time()
368 | loop = QEventLoop()
369 | self.loadFinished.connect(loop.quit)
370 | # need to make network request
371 | request = QNetworkRequest(url)
372 | if headers:
373 | # add headers to request when defined
374 | for header, value in headers:
375 | request.setRawHeader(header, value)
376 | fn = super(Browser, self)
377 | if data:
378 | # POST request
379 | fn.load(request, QNetworkAccessManager.PostOperation, data)
380 | else:
381 | # GET request
382 | fn.load(request)
383 |
384 | # set a timeout on the download loop
385 | timer = QTimer()
386 | timer.setSingleShot(True)
387 | timer.timeout.connect(loop.quit)
388 | timer.start(self.timeout * 1000)
389 | loop.exec_() # delay here until download finished or timeout
390 |
391 | if timer.isActive():
392 | # downloaded successfully
393 | timer.stop()
394 | parsed_html = self.current_html()
395 | self.wait(self.delay - (time() - t1))
396 | else:
397 | # did not download in time
398 | common.logger.debug('Timed out: {}'.format(url.toString()))
399 | parsed_html = ''
400 | return parsed_html
401 |
402 |
403 | def wait(self, timeout=1):
404 | """Wait timeout seconds while keeping the Qt event loop responsive
405 | """
406 | deadline = time() + timeout
407 | while time() < deadline:
408 | sleep(0)  # yield this thread to avoid hogging the CPU
409 | self.app.processEvents()
410 |
411 |
412 | def wait_quiet(self, timeout=20):
413 | """Wait until all requests have completed up to a maximum timeout.
414 | Returns True if all requests complete before the timeout.
415 | """
416 | self.wait()
417 | deadline = time() + timeout
418 | manager = self.page().networkAccessManager()
419 | while time() < deadline and manager.active_requests:
420 | sleep(0)
421 | self.app.processEvents()
422 | self.app.processEvents()
423 | return manager.active_requests == []
424 |
425 |
426 | def wait_load(self, pattern, timeout=60):
427 | """Wait for this content to be loaded up to maximum timeout.
428 | Returns True if pattern was loaded before the timeout.
429 | """
430 | deadline = time() + timeout
431 | while time() < deadline:
432 | sleep(0)
433 | self.app.processEvents()
434 | if self.find(pattern):
435 | return True
436 | return False
437 |
438 |
439 | def wait_steady(self, timeout=60):
440 | """Wait for the DOM to be steady, defined as no changes over a 1 second period
441 | Returns True if DOM is steady before timeout, else False
442 | """
443 | deadline = time() + timeout
444 | while time() < deadline:
445 | orig_html = self.current_html()
446 | self.wait(1)
447 | cur_html = self.current_html()
448 | if orig_html == cur_html:
449 | return True # DOM is steady
450 | return False
451 |
452 |
453 | def js(self, script):
454 | """Shortcut to execute javascript on current document and return result
455 | """
456 | self.app.processEvents()
457 | return self.page().mainFrame().evaluateJavaScript(script).toString()
458 |
459 |
460 | def click(self, pattern='input', native=False):
461 | """Click all elements that match the pattern.
462 |
463 | Uses standard CSS pattern matching: http://www.w3.org/TR/CSS2/selector.html
464 | Returns the number of elements clicked
465 | """
466 | es = self.find(pattern)
467 | for e in es:
468 | if native:
469 | # get position of element
470 | e_pos = e.geometry().center()
471 | # scroll to element position
472 | self.page().mainFrame().setScrollPosition(e_pos)
473 | scr_pos = self.page().mainFrame().scrollPosition()
474 | point_to_click = e_pos - scr_pos
475 | # create click on absolute coordinates
476 | press = QMouseEvent(QMouseEvent.MouseButtonPress, point_to_click, Qt.LeftButton, Qt.LeftButton, Qt.NoModifier)
477 | release = QMouseEvent(QMouseEvent.MouseButtonRelease, point_to_click, Qt.LeftButton, Qt.LeftButton, Qt.NoModifier)
478 | QApplication.postEvent(self, press)
479 | QApplication.postEvent(self, release)
480 | else:
481 | self.click_by_user_event_simulation(e)
482 | return len(es)
483 |
484 |
485 | def keys(self, pattern, text, native=False, blur=False):
486 | """Simulate typing by focusing on elements that match the pattern and triggering key events.
487 | If native is True then will use GUI key event simulation, else JavaScript.
488 | If blur is True then will blur focus at the end of typing.
489 | Returns the number of elements matched.
490 | """
491 | es = self.find(pattern)
492 | for e in es:
493 | if native:
494 | key_map = {'\t': Qt.Key_Tab, '\n': Qt.Key_Enter, 'DOWN': Qt.Key_Down, 'UP': Qt.Key_Up}
495 | self.click(e, native=True)  # focus the element with a native click; find() accepts a QWebElement
496 | self.wait(0.1)
497 | for c in text:
498 | key = key_map.get(c, QKeySequence(c)[0])
499 | press = QKeyEvent(QEvent.KeyPress, key, Qt.NoModifier)
500 | release = QKeyEvent(QEvent.KeyRelease, key, Qt.NoModifier)
501 | QApplication.postEvent(self, press)
502 | QApplication.postEvent(self, release)
503 | else:
504 | #e.evaluateJavaScript("this.focus()")
505 | #self.click_by_user_event_simulation(e)
506 | self.fill(pattern, text, es=[e])
507 | for event_name in ('focus', 'keydown', 'change', 'keyup', 'keypress'):
508 | self.trigger_js_event(e, event_name)
509 | if blur:
510 | e.evaluateJavaScript("this.blur()")
511 | return len(es)
512 |
513 |
514 | def attr(self, pattern, name, value=None):
515 | """For the elements that match this pattern, set attribute if value is defined, else return the value.
516 | """
517 | if value is None:
518 | # want to get attribute
519 | return str(self.page().mainFrame().findFirstElement(pattern).attribute(name))
520 | else:
521 | es = self.find(pattern)
522 | for e in es:
523 | e.setAttribute(name, value)
524 | return len(es)
525 |
526 |
527 | def fill(self, pattern, value, es=None):
528 | """Set text of the matching form elements to value, and return the number of elements matched.
529 | """
530 | es = es or self.find(pattern)
531 | for e in es:
532 | tag = str(e.tagName()).lower()
533 | if tag in ('input', 'select'):
534 | e.evaluateJavaScript('this.value = "{}"'.format(value))
535 | e.setAttribute('value', value)
536 | else:
537 | e.setPlainText(value)
538 | return len(es)
539 |
540 |
541 | def find(self, pattern):
542 | """Returns the elements matching this CSS pattern.
543 | """
544 | if isinstance(pattern, basestring):
545 | matches = self.page().mainFrame().findAllElements(pattern).toList()
546 | elif isinstance(pattern, list):
547 | matches = pattern
548 | elif isinstance(pattern, QWebElement):
549 | matches = [pattern]
550 | else:
551 | common.logger.warning('Unknown pattern: ' + str(pattern))
552 | matches = []
553 | return matches
554 |
555 |
556 | def screenshot(self, output_file):
557 | """Take screenshot of current webpage and save results
558 | """
559 | frame = self.page().mainFrame()
560 | self.page().setViewportSize(frame.contentsSize())
561 | image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
562 | painter = QPainter(image)
563 | frame.render(painter)
564 | painter.end()
565 | common.logger.debug('saving: ' + output_file)
566 | image.save(output_file)
567 |
568 |
569 | def trigger_js_event(self, element, event_name):
570 | """Triggers a JavaScript level event on an element.
571 |
572 | Takes a QWebElement as input, and a string name of the event (e.g. "click").
573 |
574 | Implementation is taken from Artemis:
575 | https://github.com/cs-au-dk/Artemis/blob/720f051c4afb4cd69e560f8658ebe29465c59362/artemis-code/src/runtime/input/forms/formfieldinjector.cpp#L294
576 | """
577 | # TODO: Strictly we should create an appropriate event type as listed in:
578 | # https://developer.mozilla.org/en-US/docs/Web/Events
579 | # https://developer.mozilla.org/en-US/docs/Web/API/Document/createEvent#Notes
580 | # For now we use generic "Event".
581 | event_type = "Event";
582 | event_init_method = "initEvent";
583 | bubbles = "true";
584 | cancellable = "true";
585 | injection = "var event = document.createEvent('{}'); event.{}('{}', {}, {}); this.dispatchEvent(event);".format(event_type, event_init_method, event_name, bubbles, cancellable);
586 | element.evaluateJavaScript(injection);
587 |
588 |
589 | def click_by_user_event_simulation(self, element):
590 | """Uses JS-level events to simulate a full user click.
591 |
592 | Takes a QWebElement as input.
593 |
594 | Implementation is taken from Artemis:
595 | https://github.com/cs-au-dk/Artemis/blob/720f051c4afb4cd69e560f8658ebe29465c59362/artemis-code/src/runtime/input/clicksimulator.cpp#L42
596 | """
597 | self.trigger_js_event(element, "mouseover");
598 | self.trigger_js_event(element, "mousemove");
599 | self.trigger_js_event(element, "mousedown");
600 | self.trigger_js_event(element, "focus");
601 | self.trigger_js_event(element, "mouseup");
602 | self.trigger_js_event(element, "click");
603 | self.trigger_js_event(element, "mousemove");
604 | self.trigger_js_event(element, "mouseout");
605 | self.trigger_js_event(element, "blur");
606 |
607 |
608 | def finished(self, reply):
609 | """Override the reply finished signal to check the result of each request
610 | """
611 | pass
612 |
613 |
614 |
615 | if __name__ == '__main__':
616 | # initiate webkit and show gui
617 | # once script is working you can disable the gui
618 | w = Browser(gui=True)
619 | # load webpage
620 | w.get('http://duckduckgo.com')
621 | # fill search textbox
622 | w.fill('input[id=search_form_input_homepage]', 'web scraping')
623 | # take screenshot of webpage
624 | w.screenshot('duckduckgo.jpg')
625 | # click search button
626 | w.click('input[id=search_button_homepage]')
627 | # show webpage for 10 seconds
628 | w.wait(10)
629 |
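630 | # A possible continuation of the demo: wait for the results to render before
631 | # scraping them (the '#links' selector is a guess, not verified against the site):
632 | #if w.wait_load('#links'):
633 | #    print w.current_text()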
--------------------------------------------------------------------------------
/xpath.py:
--------------------------------------------------------------------------------
1 | __doc__ = """
2 | This module implements a subset of the XPath standard:
3 | - tags
4 | - indices
5 | - attributes
6 | - descendants
7 |
8 | This was created because I needed a pure Python XPath parser.
9 |
10 | Generally XPath solutions will normalize the HTML into XHTML before selecting nodes.
11 | However this module tries to navigate the HTML structure directly without normalizing by searching for the next closing tag.
12 | """
13 |
14 | #TODO:
15 | # - parent
16 | # - search by text: text() == '...'
17 | # - return xpath for most similar to text
18 | # - multiple filters for a tag
19 |
20 | import itertools, re, sys, urllib, urllib2, urlparse
21 | from optparse import OptionParser
22 | import adt, common, settings
23 |
24 |
25 | class Doc:
26 | """Wrapper around a parsed webpage
27 |
28 | html:
29 | The content of webpage to parse
30 | remove:
31 | A list of tags to remove
32 |
33 | >>> doc = Doc('<div><a class="link">LINK 1</a><div><a>LINK 2</a></div></div><div><a>LINK 3</a></div>ghi')
34 | >>> doc.search('/div/a')
35 | ['LINK 1', 'LINK 3']
36 | >>> doc.search('/div/a[@class="link"]')
37 | ['LINK 1']
38 | >>> doc.search('/div[1]//a')
39 | ['LINK 1', 'LINK 2']
40 | >>> doc.search('/div/a/@class')
41 | ['link', '']
42 | >>> doc.search('/div[-1]/a')
43 | ['LINK 3']
44 |
45 | >>> # test searching unicode
46 | >>> doc = Doc(u'<a class="flink">google</a>')
47 | >>> doc.get('//a[@class="flink"]')
48 | u'google'
49 |
50 | >>> # test finding just the first instance for a large amount of content
51 | >>> doc = Doc('<div><span>content</span></div>' * 10000)
52 | >>> doc.get('//span')
53 | 'content'
54 |
55 | >>> # test extracting attribute of self closing tag
56 | >>> Doc('<div><img src="img.png" /></div>').get('/div/img/@src')
57 | 'img.png'
58 |
59 | >>> # test extracting content after self closing tag
60 | >>> Doc('<div><br /><p>content</p></div>').get('/div/p')
61 | 'content'
62 | """
63 |
64 | # regex to find a tag
65 | _tag_regex = re.compile('<([\w\:]+)')
66 | # regex to find an attribute
67 | _attributes_regex = re.compile('([\w\:-]+)\s*=\s*(".*?"|\'.*?\'|\S+)', re.DOTALL)
68 | # regex to find content of a tag
69 | _content_regex = re.compile('<.*?>(.*)$', re.DOTALL)
70 |
71 |
72 | def __init__(self, html, remove=None):
73 | #self.html = self._clean(html, remove)
74 | self.html = html
75 | self.num_searches = 0
76 |
77 | def get(self, xpath):
78 | """Return the first result from this XPath selection
79 | """
80 | results = self._xpath(self.parse(xpath), self.html, limit=1)
81 | return common.first(results)
82 |
83 | def search(self, xpath):
84 | """Return all results from this XPath selection
85 | """
86 | return self._xpath(self.parse(xpath), self.html, limit=sys.maxint)
87 |
88 |
89 | def _xpath(self, path, html, limit):
90 | """Recursively search HTML for content at XPath
91 | """
92 | counter, separator, tag, index, attributes = path.pop(0)
93 | if counter == 0:
94 | self.num_searches += 1
95 |
96 | results = []
97 | if tag == '..':
98 | # parent
99 | raise common.WebScrapingError('.. not yet supported')
100 | #results.append(self.get_parent(html))  # unreachable until parent support is added
101 | elif tag == 'text()':
102 | # extract child text
103 | text = self._get_content(self._get_html(html))
104 | results.append(common.remove_tags(text, keep_children=False))
105 | # check if next tag is selecting attribute
106 | elif tag.startswith('@'):
107 | attr = tag[1:].lower()
108 | #parent = self.get_parent(context)
109 | value = self._get_attributes(html).get(attr, '')
110 | results.append(value)
111 | else:
112 | # have tag
113 | if counter > 0:
114 | # get child html when not at root
115 | html = self._get_content(html)
116 |
117 | # search direct children if / and all descendants if //
118 | search_fn = self._find_children if separator == '' else self._find_descendants
119 | matches = search_fn(html, tag)
120 |
121 | # support negative indices
122 | if index is not None and index < 0:
123 | matches = list(matches)
124 | index += len(matches) + 1
125 |
126 | for child_i, child in enumerate(matches):
127 | # check if matches index
128 | if index is None or index == child_i + 1:
129 | # check if matches attributes
130 | if not attributes or self._match_attributes(attributes, self._get_attributes(child)):
131 | if path:
132 | results.extend(self._xpath(path[:], child, limit))
133 | else:
134 | # final node
135 | results.append(self._get_content(child))
136 | if len(results) > limit:
137 | break
138 |
139 | #if not children:
140 | # attributes_s = attributes and ''.join('[@%s="%s"]' % a for a in attributes) or ''
141 | # common.logger.debug('No matches for <%s%s%s> (tag %d)' % (tag, index and '[%d]' % index or '', attributes_s, tag_i + 1))
142 | return results
143 |
144 |
145 |
146 | def _clean(self, html, remove):
147 | """Remove specified unhelpful tags and comments
148 | """
149 | self.remove = remove
150 | html = re.compile('<!--.*?-->', re.DOTALL).sub('', html) # remove comments
151 | if remove:
152 | # XXX combine tag list into single regex, if can match same at start and end
153 | for tag in remove:
154 | html = re.compile('<' + tag + '[^>]*?/>', re.DOTALL | re.IGNORECASE).sub('', html)
155 | html = re.compile('<' + tag + '[^>]*?>.*?</' + tag + '>', re.DOTALL | re.IGNORECASE).sub('', html)
156 | html = re.compile('<' + tag + '[^>]*?>', re.DOTALL | re.IGNORECASE).sub('', html)
157 | return html
158 |
159 |
160 | def parse(self, xpath):
161 | """Parse the xpath into: counter, separator, tag, index, and attributes
162 |
163 | >>> doc = Doc('')
164 | >>> doc.parse('/div[1]//span[@class="text"]')
165 | [(0, '', 'div', 1, []), (1, '/', 'span', None, [('class', 'text')])]
166 | >>> doc.parse('//li[-2]')
167 | [(0, '/', 'li', -2, [])]
168 | >>> doc.parse('//option[@selected]')
169 | [(0, '/', 'option', None, [('selected', None)])]
170 | >>> doc.parse('/div[@id="content"]//span[1][@class="text"][@title=""]/a')
171 | [(0, '', 'div', None, [('id', 'content')]), (1, '/', 'span', 1, [('class', 'text'), ('title', '')]), (2, '', 'a', None, [])]
172 | """
173 | tokens = []
174 | counter = 0
175 | for separator, token in re.compile('(|/|\.\.)/([^/]+)').findall(xpath):
176 | index, attributes = None, []
177 | if '[' in token:
178 | tag = token[:token.find('[')]
179 | for attribute in re.compile('\[(.*?)\]').findall(token):
180 | try:
181 | index = int(attribute)
182 | except ValueError:
183 | match = re.compile('@(.*?)=["\']?(.*?)["\']?$').search(attribute)
184 | if match:
185 | key, value = match.groups()
186 | attributes.append((key.lower(), value.lower()))
187 | else:
188 | match = re.compile('@(.*?)$').search(attribute)
189 | if match:
190 | attributes.append((match.groups()[0].lower(), None))
191 | else:
192 | raise common.WebScrapingError('Unknown format: ' + attribute)
193 | else:
194 | tag = token
195 | tokens.append((counter, separator, tag, index, attributes))
196 | counter += 1
197 | return tokens
198 |
199 |
200 | def _get_attributes(self, html):
201 | """Extract the attributes of the passed HTML tag
202 |
203 | >>> doc = Doc('')
204 | >>> doc._get_attributes('<span class="abc" id="ID" name="MY NAME" max-width="20">content <span>SPAN</span></span>')
205 | {'max-width': '20', 'class': 'abc', 'id': 'ID', 'name': 'MY NAME'}
206 | >>> doc._get_attributes('<td width="200" class="textelien" valign="top">')
207 | {'width': '200', 'class': 'textelien', 'valign': 'top'}
208 | >>> doc._get_attributes(' |