├── .gitignore
├── README.md
├── requirements.txt
├── scraper
│   ├── __init__.py
│   ├── __main__.py
│   ├── commands
│   │   ├── __init__.py
│   │   └── _base.py
│   ├── downloader.py
│   ├── scheduler.py
│   └── urls.py
└── tests
    ├── README.md
    ├── __init__.py
    ├── runtests.py
    ├── test_data
    │   └── test.html
    └── test_urls.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
.idea/
.env/

.DS_Store
*.pyc

facebook.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scraper

Scraper is Python code for crawling and scraping dynamic Web sites.

**Dynamic Web sites** are sites that:
- dynamically load content (e.g. fetch search results via an AJAX call)
- render their HTML using JavaScript (e.g. any React Web site)

Scraper handles dynamic Web sites by loading each page in a headless browser engine (Qt WebEngine via PyQt5), letting the JavaScript run, and only then parsing the HTML.

**Crawling** - every URL found on a rendered Web page is added to a queue to be rendered in turn.

# Usage

You can use this code in one of two ways:
1. Use scraper as a "program" and follow the command pattern provided.
2. Import "scraper" yourself and use it as a library.

___

### As a program
```shell
python3 ./scraper -h
usage: scraper [-h] {}

Crawl and scrape dynamic Web sites. Scrape Web sites that dynamically load
content or sites that render their HTML using JavaScript. Either use the
command pattern provided or import "scraper" to use as a library.

positional arguments:
  {}          The command to run.

optional arguments:
  -h, --help  show this help message and exit
```

#### Commands

- `TODO`

___

### As a library
example.py
```python
# Assumes the scraper/ directory is on your path; its modules use flat imports.
from scheduler import DownloadScheduler
from urls import Url


def callback(url, html):
    # url is the URL of the page that just finished rendering
    # html is the rendered HTML of the page (by this point the page's dynamic content has already been loaded into the HTML)
    ...  # do stuff with the rendered HTML


scraper = DownloadScheduler(
    callback,
    initial=[Url('https://www.google.com/search?q=shark+week')],
    processes=4
)
scraper.schedule()
```

The `DownloadScheduler` parameters:
- `callback` is called every time a page is rendered by the browser engine.
- `initial` is the list of `Url`s that crawling/scraping starts from.
- `processes` is the maximum number of parallel download/render processes.
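
`DownloadScheduler` also accepts an optional `url_filter` callable (see `scheduler.py`): it is applied to every `Url` discovered on a rendered page, and links it rejects are never queued. The sketch below shows one way to use it, assuming the `scraper/` directory is on your path; the callback, the filter, and the `example.com` URLs are purely illustrative.

```python
from scheduler import DownloadScheduler
from urls import Url, url_filter


def save_page(url, html):
    # Write each rendered page to disk (the filename scheme is illustrative only).
    filename = url.replace('://', '_').replace('/', '_') + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f'saved {url} -> {filename}')


def same_site_html_only(url):
    # Receives a `Url` instance; keep ordinary HTML pages on a single (hypothetical) site.
    return url_filter(url.url) and 'example.com' in url.url


scraper = DownloadScheduler(
    save_page,
    initial=[Url('https://www.example.com/')],
    processes=4,
    url_filter=same_site_html_only,
)
scraper.schedule()
```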

# Requirements

- Python 3.6+
- pip dependencies
  - PyQt5==5.8.2
  - pyobjc==3.2.1 (mac only)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
PyQt5==5.8.2
pyobjc==3.2.1

--------------------------------------------------------------------------------
/scraper/__init__.py:
--------------------------------------------------------------------------------
__version__ = '0.1.0'
__program_name__ = 'scraper'

--------------------------------------------------------------------------------
/scraper/__main__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
CLI handler.
"""

import argparse
import signal
import sys

from commands import available_commands, handle_command

PROGRAM_NAME = 'scraper'

description = f'''
Crawl and scrape dynamic Web sites.
Scrape Web sites that dynamically load content or
sites that render their HTML using JavaScript.
Either use the command pattern provided or
import "{PROGRAM_NAME}" to use as a library.
'''


def get_parser():
    parser = argparse.ArgumentParser(
        description=description,
        prog=PROGRAM_NAME,
    )
    parser.add_argument('command', type=str, help='The command to run.', choices=available_commands())
    return parser


def main():
    # parse args
    parser = get_parser()
    args = parser.parse_args(sys.argv[1:2])
    command = args.command

    if command:
        handle_command(command, parser)


if __name__ == '__main__':
    # Config stuff
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    # sys.excepthook = lambda et, e, tb: print(f'{et.__name__}: {e}')

    # Start the main program
    main()

--------------------------------------------------------------------------------
/scraper/commands/__init__.py:
--------------------------------------------------------------------------------
""" Command Pattern

Functions to fetch or execute the available commands.
"""
import argparse
import os
import sys
from importlib import import_module


def handle_command(command, parser):
    """ Executes a command.
    Args:
        command (str): Name of the command.
        parser (ArgumentParser): The scraper program's parser.
    """
    command_dict = get_commands()

    # get command options
    parser = argparse.ArgumentParser(parents=[parser], add_help=False)
    command_dict[command].add_arguments(parser)
    args = parser.parse_args(sys.argv[1:])

    # execute command
    command_dict[command].execute(args)


def available_commands():
    """ All available commands.
    Returns:
        ([str]) Names of the available commands.
    """
    commands_dir = os.path.dirname(os.path.abspath(__file__))

    if not os.path.isdir(commands_dir):
        raise Exception(f'Cannot find "commands" directory at {commands_dir}')

    return [
        name.split('.')[0]
        for name in os.listdir(commands_dir)
        if name.endswith('.py') and not name.startswith('_')
    ]


def get_commands():
    """ Command names mapped to an instance of that command.
    Returns:
        (dict) mapping {command name: Command instance}
    """
    commands = available_commands()
    return {
        name: import_module(
            f'commands.{name}'
        ).Command()
        for name in commands
    }

--------------------------------------------------------------------------------
/scraper/commands/_base.py:
--------------------------------------------------------------------------------
""" Interface for a Command. """

from abc import abstractmethod


class BaseCommand:
    """ Command interface. """

    help = ''

    def __init__(self):
        pass

    def add_arguments(self, parser):
        """ CLI args for this command.
        Args:
            parser (ArgumentParser): Parser the developer can configure for the command's needs.
        Note:
            parser args get passed to `execute()`.
        """
        pass

    @abstractmethod
    def execute(self, args):
        """ Command is triggered by calling this.
        Args:
            args (Namespace): namespace containing the arguments for this command AND the scraper program.
        """
        pass

--------------------------------------------------------------------------------
/scraper/downloader.py:
--------------------------------------------------------------------------------
""" Module to download the HTML of a dynamic or static Web site.

Uses the PyQt5 bindings to the Qt GUI framework and its Qt WebEngine
browser so that a Web page can be fully rendered (JavaScript included)
before its HTML is parsed.

# Example Usage
`see code at bottom`
"""
import argparse
from sys import platform

from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage


class WebkitRenderer(QWebEnginePage):
    """ Class to render a given URL """

    def __init__(self, rendered_callback):
        """
        Args:
            rendered_callback (func): called once a Web page is rendered.

        Callback Args:
            url (str): The URL of the Web page.
            html (str): HTML of the rendered Web page.
        """
        self.app = QApplication([])
        super(WebkitRenderer, self).__init__()
        self.loadFinished.connect(self._loadFinished)
        self.rendered_callback = rendered_callback

    def javaScriptConsoleMessage(self, msg_level, p_str, p_int, p_str_1):
        """ Ignore console messages """
        pass

    def render(self, url):
        """ Download and render the URL
        Args:
            url (str): The URL to load.
        """
        self.load(QUrl(url))
        self.app.exec()  # put app into infinite loop, listening to signals/events

    def _loadFinished(self, result):
        """ Event handler - A Web page finished loading
        Args:
            result (bool): success indicator
        """
        if result:
            self.toHtml(self.html_callback)  # async and takes a callback
        else:
            url = self.url().toString()
            self.rendered_callback(url, None)
            self.app.quit()

    def html_callback(self, data):
        """ Receives rendered Web Page's HTML """
        url = self.url().toString()
        self.rendered_callback(url, data)
        self.app.quit()  # break app out of infinite loop


if __name__ == '__main__':

    if platform == 'darwin':  # if mac: hide python launch icons
        import AppKit
        info = AppKit.NSBundle.mainBundle().infoDictionary()
        info["LSBackgroundOnly"] = "1"

    # downloader.py needs to be able to run as a
    # standalone script to achieve parallelization.
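    # Illustrative usage (not part of the original file): running this module
    # directly prints the fully rendered HTML to stdout, which is how
    # scheduler.py's download() consumes it, e.g.
    #
    #     python downloader.py "https://www.example.com/" > page.html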
    parser = argparse.ArgumentParser()
    parser.add_argument('url', type=str)
    args = parser.parse_args()

    def cb(url, html):
        print(html)

    wr = WebkitRenderer(cb)
    wr.render(args.url)

--------------------------------------------------------------------------------
/scraper/scheduler.py:
--------------------------------------------------------------------------------
"""
Schedules Web page downloads, "crawls" any new links it discovers,
and routes each downloaded Web page to the appropriate callback.

# Example Usage
```
def callback(url, html):
    print(html)

s = DownloadScheduler(callback, initial=[Url('https://www.google.com/search?q=shark+week')])
s.schedule()
```
"""
from collections import deque
from concurrent.futures import ProcessPoolExecutor, as_completed
import os
import subprocess

from urls import urls_from_html


class DownloadScheduler:

    def __init__(self, callback, initial=None, processes=5, url_filter=None):
        """ DownloadScheduler downloads Web pages at certain URLs
        Schedules newly discovered links, adding them to a queue, in a "crawling" fashion
        Args:
            callback (func): Callback function called whenever a Web page finishes downloading
            initial ([Url]): List of `Url`s to start the "crawling" from
            processes (int): The maximum number of download processes to parallelize
            url_filter (func): Optional predicate applied to each discovered `Url`; links it rejects are not crawled
        """
        self.callback = callback
        self.queue = deque(initial or [])
        self.visited = set()
        self.processes = processes
        self.url_filter = url_filter

    def download_complete(self, future, url):
        """ Callback when a download completes
        Args:
            future (Future): the (completed) future containing a Web site's HTML content.
            url (Url): the URL of the downloaded Web page.
        """
        try:
            html = future.result()
        except Exception as e:
            print(f'Exception {e}')
        else:
            urls = list(filter(self.url_filter, urls_from_html(html, url.url)))
            self.queue.extendleft(urls)
            self.callback(url.url, html)

    def schedule(self):
        """ Begins downloading the Web pages in the queue.
        Calls `download_complete()` when a download finishes.
        """
        with ProcessPoolExecutor(max_workers=self.processes) as executor:
            while self.queue:
                urls = pop_chunk(self.processes, self.queue.pop)
                self.visited |= set(urls)
                future_to_url = {executor.submit(download, url): url for url in urls}
                for f in as_completed(future_to_url, timeout=15):
                    self.download_complete(f, future_to_url[f])


def pop_chunk(n, fn):
    """ Calls fn() n-times, putting the return values in a list
    Args:
        n (int): maximum size of the chunk
        fn (func): function to call (probably some collection instance's pop() function)
    Returns:
        ([]) list of whatever items were popped (at most n of them)
    Example:
        >>> foo = [1, 2, 3, 4, 5]
        >>> pop_chunk(3, foo.pop)
        [5, 4, 3]
        >>> foo
        [1, 2]
    """
    return_values = []
    for _ in range(n):
        try:
            return_values.append(fn())
        except IndexError:
            break
    return return_values


def download(url):
    """ Uses 'downloader.py' to download a Web page's HTML content.
    Args:
        url (Url): The URL whose HTML we want to download/fetch
    Returns:
        (str) A string of the HTML content found at the given URL
    Note:
        This function is run in a separate process by `DownloadScheduler.schedule()`
    """
    abs_path = os.path.dirname(os.path.abspath(__file__))
    script_path = os.path.join(abs_path, 'downloader.py')
    args = ['python', script_path, url.url]
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    html = proc.stdout.read()
    return html.decode('utf-8')

--------------------------------------------------------------------------------
/scraper/urls.py:
--------------------------------------------------------------------------------
"""
Utilities for handling URLs.
"""

import re
from urllib.parse import urljoin, urlparse


class Url:
    """ Validates and normalizes a URL.

    Note on Url "uniqueness":
        >>> Url('http://google.com') == Url('https://google.com') == Url('https://google.com?q=shark+week')
        True
    """
    def __init__(self, url):
        self.url = url.lower()
        parsed = urlparse(self.url)
        if not parsed.netloc:
            raise ValueError(f'{self.url} is not a complete URL.')

    def normalized(self):
        # Ignore the scheme, query string and any trailing slash so that
        # trivially different URLs compare (and hash) as the same page.
        parsed = urlparse(self.url)
        return f'http://{parsed.netloc}{parsed.path}'.rstrip('/')

    def __hash__(self):
        return hash(self.normalized())

    def __eq__(self, other):
        return self.normalized() == other.normalized()

    def __str__(self):
        return self.normalized()

    def __repr__(self):
        return self.normalized()


def url_filter(url):
    """ Filter to remove non-HTML URLs """
    if url.endswith(('.json', '.css', '.png', '.jpg', '.svg', '.ico', '.js', '.gif', '.pdf', '.xml')):
        return False
    if url.startswith(('mailto',)):
        return False
    return True


def urls_from_html(html, html_url, Class_=Url):
    """ Parses HTML for URLs

    Args:
        html (str): HTML content
        html_url (str): URL of the HTML content. Required to create full URLs from relative paths.
        Class_ (class): The type of URL objects to return

    Returns:
        ([Class_]) list of Class_ instances.
    """

    urls = re.findall(r'href="(.*?)"', html)

    # build absolute URLs from relative paths
    for i, url in enumerate(urls):
        parsed = urlparse(url)
        if not parsed.netloc:
            urls[i] = urljoin(html_url, url)

    # Create `Class_` instances from URLs we found in the HTML
    unique = set()
    for u in urls:
        try:
            if url_filter(u):
                unique.add(Class_(u))
        except ValueError:
            pass

    return list(unique)

--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
# Tests

```
python runtests.py
```

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelwayman/web-scraper/ac3a4493e6329b5b028fcb7215aa54ce7d7a6ce9/tests/__init__.py

--------------------------------------------------------------------------------
/tests/runtests.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import os
import sys
import unittest


sys.path.insert(0, os.path.abspath('..'))


if __name__ == '__main__':
    testsuite = unittest.TestLoader().discover('.')
    unittest.TextTestRunner(verbosity=2).run(testsuite)

--------------------------------------------------------------------------------
/tests/test_data/test.html:
--------------------------------------------------------------------------------
Title

Hello testers


Today we are going to discuss local query testing.

--------------------------------------------------------------------------------
/tests/test_urls.py:
--------------------------------------------------------------------------------
import os
import unittest

from scraper.urls import urls_from_html, Url


class TestUrlsFromHtml(unittest.TestCase):
    """
    Tests `urls_from_html` function
    """

    def _html(self):
        base_path = os.path.abspath(os.path.dirname(__file__))
        test_file_path = os.path.join(base_path, 'test_data', 'test.html')
        with open(test_file_path, 'r') as f:
            return f.read()

    def setUp(self):
        self.test_html = self._html()

    def test_urls(self):
        urls = urls_from_html(self.test_html, 'http://foobar.com/')
        urls = {x.url for x in urls}
        self.assertSetEqual(
            urls,
            {'https://my.3rdpartyblog.com', 'http://foobar.com/index.html',
             'http://foobar.com/?q=local_query', 'http://foobar.com/about'}
        )


class TestUrl(unittest.TestCase):
    """
    Tests `Url` class
    """

    def test_normalized(self):
        url = Url('https://m.facebook.com/mike.test?q=hello&foo=bar')
        self.assertEqual(url.normalized(), 'http://m.facebook.com/mike.test')

        url = Url('https://www.facebook.com/mike.test/wow/')
        self.assertEqual(url.normalized(), 'http://www.facebook.com/mike.test/wow')

    def test_complete_url_assertion(self):
        with self.assertRaises(ValueError):
            Url('fb.com/mike.test')

    def test_hash(self):
        url1 = Url('https://m.facebook.com/mike.test')
        url2 = Url('https://m.facebook.com/mike.test?q=q1')
        url3 = Url('https://m.facebook.com/mike.test?q=q2')

        s = {url1, url2, url3}
        self.assertEqual(len(s), 1)

    def test___eq__(self):
        url1 = Url('https://m.facebook.com/mike.test')
        url2 = Url('https://m.facebook.com/mike.test?q=q1')
        url3 = Url('https://m.facebook.com/mike.test?q=q2')

        self.assertTrue(url1 == url2 == url3)

--------------------------------------------------------------------------------
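
Note on the command pattern: the `commands` package currently ships only the `BaseCommand` interface, and the README's command list is still `TODO`. Purely as an illustration of how the pieces fit together — the file below is hypothetical and not part of the repository — a minimal command module could look like this. `available_commands()` picks up any module in `scraper/commands/` whose filename does not start with an underscore, and `get_commands()` expects each such module to expose a `Command` class.

```python
""" Hypothetical scraper/commands/crawl.py - a sketch, not included in the repo. """

from commands._base import BaseCommand
from scheduler import DownloadScheduler
from urls import Url


class Command(BaseCommand):

    help = 'Crawl a site and print the URL of every page that gets rendered.'

    def add_arguments(self, parser):
        # Illustrative arguments; whatever is added here is parsed and handed to execute().
        parser.add_argument('start_url', type=str, help='URL to start crawling from.')
        parser.add_argument('--processes', type=int, default=4)

    def execute(self, args):
        def callback(url, html):
            print(url)

        DownloadScheduler(
            callback,
            initial=[Url(args.start_url)],
            processes=args.processes,
        ).schedule()
```

With such a file in place, `python3 ./scraper crawl https://www.example.com/ --processes 2` would start a crawl (the URL here is, again, only an example).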