├── .gitignore
├── README.md
├── requirements.txt
├── scraper
│   ├── __init__.py
│   ├── __main__.py
│   ├── commands
│   │   ├── __init__.py
│   │   └── _base.py
│   ├── downloader.py
│   ├── scheduler.py
│   └── urls.py
└── tests
    ├── README.md
    ├── __init__.py
    ├── runtests.py
    ├── test_data
    │   └── test.html
    └── test_urls.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
.idea/
.env/

.DS_Store
*.pyc

facebook.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scraper

Scraper is Python code for crawling and scraping dynamic Web sites.

**Dynamic Web sites** are sites that:
- dynamically load content (e.g. fetch search results via an AJAX call)
- render their HTML using JavaScript (e.g. any React Web site)

Scraper handles dynamic Web sites by loading each page in a headless browser engine (Qt WebEngine via PyQt5), letting the JavaScript run, and only then parsing the HTML.

**Crawling** - every URL found on a rendered Web page is added to a queue to be rendered in turn.

# Usage

You can use this code in one of two ways:
1. Use scraper as a "program" and follow the command pattern provided.
2. Import "scraper" yourself and use it as a library.

___

### As a program
```shell
python3 ./scraper -h
usage: scraper [-h] {}

Crawl and scrape dynamic Web sites. Scrape Web sites that dynamically load
content or sites that render their HTML using JavaScript. Either use the
command pattern provided or import "scraper" to use as a library.

positional arguments:
  {}          The command to run.

optional arguments:
  -h, --help  show this help message and exit
```

#### Commands

- `TODO`

___

### As a library
example.py
```python
# Assumes the scraper/ directory is on your path; its modules use flat imports.
from scheduler import DownloadScheduler
from urls import Url


def callback(url, html):
    # url is the URL of the page that just finished rendering
    # html is the rendered HTML of the page (by this point the page's dynamic content has already been loaded into the HTML)
    ...  # do stuff with the rendered HTML


scraper = DownloadScheduler(
    callback,
    initial=[Url('https://www.google.com/search?q=shark+week')],
    processes=4
)
scraper.schedule()
```

The `DownloadScheduler` parameters:
- `callback` is called every time a page is rendered by the browser engine.
- `initial` is the list of `Url`s that crawling/scraping starts from.
- `processes` is the maximum number of parallel download/render processes.
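
`DownloadScheduler` also accepts an optional `url_filter` callable (see `scheduler.py`): it is applied to every `Url` discovered on a rendered page, and links it rejects are never queued. The sketch below shows one way to use it, assuming the `scraper/` directory is on your path; the callback, the filter, and the `example.com` URLs are purely illustrative.

```python
from scheduler import DownloadScheduler
from urls import Url, url_filter


def save_page(url, html):
    # Write each rendered page to disk (the filename scheme is illustrative only).
    filename = url.replace('://', '_').replace('/', '_') + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f'saved {url} -> {filename}')


def same_site_html_only(url):
    # Receives a `Url` instance; keep ordinary HTML pages on a single (hypothetical) site.
    return url_filter(url.url) and 'example.com' in url.url


scraper = DownloadScheduler(
    save_page,
    initial=[Url('https://www.example.com/')],
    processes=4,
    url_filter=same_site_html_only,
)
scraper.schedule()
```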

# Requirements

- Python 3.6+
- pip dependencies
  - PyQt5==5.8.2
  - pyobjc==3.2.1 (mac only)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
PyQt5==5.8.2
pyobjc==3.2.1

--------------------------------------------------------------------------------
/scraper/__init__.py:
--------------------------------------------------------------------------------
__version__ = '0.1.0'
__program_name__ = 'scraper'

--------------------------------------------------------------------------------
/scraper/__main__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
CLI handler.
"""

import argparse
import signal
import sys

from commands import available_commands, handle_command

PROGRAM_NAME = 'scraper'

description = f'''
Crawl and scrape dynamic Web sites.
Scrape Web sites that dynamically load content or
sites that render their HTML using JavaScript.
Either use the command pattern provided or
import "{PROGRAM_NAME}" to use as a library.
'''


def get_parser():
    parser = argparse.ArgumentParser(
        description=description,
        prog=PROGRAM_NAME,
    )
    parser.add_argument('command', type=str, help='The command to run.', choices=available_commands())
    return parser


def main():
    # parse args
    parser = get_parser()
    args = parser.parse_args(sys.argv[1:2])
    command = args.command

    if command:
        handle_command(command, parser)


if __name__ == '__main__':
    # Config stuff
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    # sys.excepthook = lambda et, e, tb: print(f'{et.__name__}: {e}')

    # Start the main program
    main()

--------------------------------------------------------------------------------
/scraper/commands/__init__.py:
--------------------------------------------------------------------------------
""" Command Pattern

Functions to fetch or execute the available commands.
"""
import argparse
import os
import sys
from importlib import import_module


def handle_command(command, parser):
    """ Executes a command.
    Args:
        command (str): Name of the command.
        parser (ArgumentParser): The scraper program's parser.
    """
    command_dict = get_commands()

    # get command options
    parser = argparse.ArgumentParser(parents=[parser], add_help=False)
    command_dict[command].add_arguments(parser)
    args = parser.parse_args(sys.argv[1:])

    # execute command
    command_dict[command].execute(args)


def available_commands():
    """ All available commands.
    Returns:
        ([str]) Names of the available commands.
    """
    commands_dir = os.path.dirname(os.path.abspath(__file__))

    if not os.path.isdir(commands_dir):
        raise Exception(f'Cannot find "commands" directory at {commands_dir}')

    return [
        name.split('.')[0]
        for name in os.listdir(commands_dir)
        if name.endswith('.py') and not name.startswith('_')
    ]


def get_commands():
    """ Command names mapped to an instance of that command.
    Returns:
        (dict) mapping {command name: Command instance}
    """
    commands = available_commands()
    return {
        name: import_module(
            f'commands.{name}'
        ).Command()
        for name in commands
    }

--------------------------------------------------------------------------------
/scraper/commands/_base.py:
--------------------------------------------------------------------------------
""" Interface for a Command. """

from abc import abstractmethod


class BaseCommand:
    """ Command interface. """

    help = ''

    def __init__(self):
        pass

    def add_arguments(self, parser):
        """ CLI args for this command.
        Args:
            parser (ArgumentParser): Parser the developer can configure for the command's needs.
        Note:
            parser args get passed to `execute()`.
        """
        pass

    @abstractmethod
    def execute(self, args):
        """ Command is triggered by calling this.
        Args:
            args (Namespace): namespace containing the arguments for this command AND the scraper program.
        """
        pass

--------------------------------------------------------------------------------
/scraper/downloader.py:
--------------------------------------------------------------------------------
""" Module to download the HTML of a dynamic or static Web site.

Uses the PyQt5 bindings to the Qt GUI framework and its Qt WebEngine
browser so that a Web page can be fully rendered (JavaScript included)
before its HTML is parsed.

# Example Usage
`see code at bottom`
"""
import argparse
from sys import platform

from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage


class WebkitRenderer(QWebEnginePage):
    """ Class to render a given URL """

    def __init__(self, rendered_callback):
        """
        Args:
            rendered_callback (func): called once a Web page is rendered.

        Callback Args:
            url (str): The URL of the Web page.
            html (str): HTML of the rendered Web page.
        """
        self.app = QApplication([])
        super(WebkitRenderer, self).__init__()
        self.loadFinished.connect(self._loadFinished)
        self.rendered_callback = rendered_callback

    def javaScriptConsoleMessage(self, msg_level, p_str, p_int, p_str_1):
        """ Ignore console messages """
        pass

    def render(self, url):
        """ Download and render the URL
        Args:
            url (str): The URL to load.
        """
        self.load(QUrl(url))
        self.app.exec()  # put app into infinite loop, listening to signals/events

    def _loadFinished(self, result):
        """ Event handler - A Web page finished loading
        Args:
            result (bool): success indicator
        """
        if result:
            self.toHtml(self.html_callback)  # async and takes a callback
        else:
            url = self.url().toString()
            self.rendered_callback(url, None)
            self.app.quit()

    def html_callback(self, data):
        """ Receives rendered Web Page's HTML """
        url = self.url().toString()
        self.rendered_callback(url, data)
        self.app.quit()  # break app out of infinite loop


if __name__ == '__main__':

    if platform == 'darwin':  # if mac: hide python launch icons
        import AppKit
        info = AppKit.NSBundle.mainBundle().infoDictionary()
        info["LSBackgroundOnly"] = "1"

    # downloader.py needs to be able to run as a
    # standalone script to achieve parallelization.
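    # Illustrative usage (not part of the original file): running this module
    # directly prints the fully rendered HTML to stdout, which is how
    # scheduler.py's download() consumes it, e.g.
    #
    #     python downloader.py "https://www.example.com/" > page.html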
    parser = argparse.ArgumentParser()
    parser.add_argument('url', type=str)
    args = parser.parse_args()

    def cb(url, html):
        print(html)

    wr = WebkitRenderer(cb)
    wr.render(args.url)

--------------------------------------------------------------------------------
/scraper/scheduler.py:
--------------------------------------------------------------------------------
"""
Schedules Web page downloads, "crawls" any new links it discovers,
and routes each downloaded Web page to the appropriate callback.

# Example Usage
```
def callback(url, html):
    print(html)

s = DownloadScheduler(callback, initial=[Url('https://www.google.com/search?q=shark+week')])
s.schedule()
```
"""
from collections import deque
from concurrent.futures import ProcessPoolExecutor, as_completed
import os
import subprocess

from urls import urls_from_html


class DownloadScheduler:

    def __init__(self, callback, initial=None, processes=5, url_filter=None):
        """ DownloadScheduler downloads Web pages at certain URLs
        Schedules newly discovered links, adding them to a queue, in a "crawling" fashion
        Args:
            callback (func): Callback function called whenever a Web page finishes downloading
            initial ([Url]): List of `Url`s to start the "crawling" from
            processes (int): The maximum number of download processes to parallelize
            url_filter (func): Optional predicate applied to each discovered `Url`; links it rejects are not crawled
        """
        self.callback = callback
        self.queue = deque(initial or [])
        self.visited = set()
        self.processes = processes
        self.url_filter = url_filter

    def download_complete(self, future, url):
        """ Callback when a download completes
        Args:
            future (Future): the (completed) future containing a Web site's HTML content.
            url (Url): the URL of the downloaded Web page.
        """
        try:
            html = future.result()
        except Exception as e:
            print(f'Exception {e}')
        else:
            urls = list(filter(self.url_filter, urls_from_html(html, url.url)))
            self.queue.extendleft(urls)
            self.callback(url.url, html)

    def schedule(self):
        """ Begins downloading the Web pages in the queue.
        Calls `download_complete()` when a download finishes.
        """
        with ProcessPoolExecutor(max_workers=self.processes) as executor:
            while self.queue:
                urls = pop_chunk(self.processes, self.queue.pop)
                self.visited |= set(urls)
                future_to_url = {executor.submit(download, url): url for url in urls}
                for f in as_completed(future_to_url, timeout=15):
                    self.download_complete(f, future_to_url[f])


def pop_chunk(n, fn):
    """ Calls fn() n-times, putting the return values in a list
    Args:
        n (int): maximum size of the chunk
        fn (func): function to call (probably some collection instance's pop() function)
    Returns:
        ([]) list of whatever items were popped (at most n of them)
    Example:
        >>> foo = [1, 2, 3, 4, 5]
        >>> pop_chunk(3, foo.pop)
        [5, 4, 3]
        >>> foo
        [1, 2]
    """
    return_values = []
    for _ in range(n):
        try:
            return_values.append(fn())
        except IndexError:
            break
    return return_values


def download(url):
    """ Uses 'downloader.py' to download a Web page's HTML content.
    Args:
        url (Url): The URL whose HTML we want to download/fetch
    Returns:
        (str) A string of the HTML content found at the given URL
    Note:
        This function is run in a separate process by `DownloadScheduler.schedule()`
    """
    abs_path = os.path.dirname(os.path.abspath(__file__))
    script_path = os.path.join(abs_path, 'downloader.py')
    args = ['python', script_path, url.url]
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    html = proc.stdout.read()
    return html.decode('utf-8')

--------------------------------------------------------------------------------
/scraper/urls.py:
--------------------------------------------------------------------------------
"""
Utilities for handling URLs.
"""

import re
from urllib.parse import urljoin, urlparse


class Url:
    """ Validates and normalizes a URL.

    Note on Url "uniqueness":
        >>> Url('http://google.com') == Url('https://google.com') == Url('https://google.com?q=shark+week')
        True
    """
    def __init__(self, url):
        self.url = url.lower()
        parsed = urlparse(self.url)
        if not parsed.netloc:
            raise ValueError(f'{self.url} is not a complete URL.')

    def normalized(self):
        # Ignore the scheme, query string and any trailing slash so that
        # trivially different URLs compare (and hash) as the same page.
        parsed = urlparse(self.url)
        return f'http://{parsed.netloc}{parsed.path}'.rstrip('/')

    def __hash__(self):
        return hash(self.normalized())

    def __eq__(self, other):
        return self.normalized() == other.normalized()

    def __str__(self):
        return self.normalized()

    def __repr__(self):
        return self.normalized()


def url_filter(url):
    """ Filter to remove non-HTML URLs """
    if url.endswith(('.json', '.css', '.png', '.jpg', '.svg', '.ico', '.js', '.gif', '.pdf', '.xml')):
        return False
    if url.startswith(('mailto',)):
        return False
    return True


def urls_from_html(html, html_url, Class_=Url):
    """ Parses HTML for URLs

    Args:
        html (str): HTML content
        html_url (str): URL of the HTML content. Required to create full URLs from relative paths.
        Class_ (class): The type of URL objects to return

    Returns:
        ([Class_]) list of Class_ instances.
    """

    urls = re.findall(r'href="(.*?)"', html)

    # build absolute URLs from relative paths
    for i, url in enumerate(urls):
        parsed = urlparse(url)
        if not parsed.netloc:
            urls[i] = urljoin(html_url, url)

    # Create `Class_` instances from URLs we found in the HTML
    unique = set()
    for u in urls:
        try:
            if url_filter(u):
                unique.add(Class_(u))
        except ValueError:
            pass

    return list(unique)

--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
# Tests

```
python runtests.py
```

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelwayman/web-scraper/ac3a4493e6329b5b028fcb7215aa54ce7d7a6ce9/tests/__init__.py

--------------------------------------------------------------------------------
/tests/runtests.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import os
import sys
import unittest


sys.path.insert(0, os.path.abspath('..'))


if __name__ == '__main__':
    testsuite = unittest.TestLoader().discover('.')
    unittest.TextTestRunner(verbosity=2).run(testsuite)

--------------------------------------------------------------------------------
/tests/test_data/test.html:
--------------------------------------------------------------------------------
Title

Hello testers


Today we are going to discuss local query testing.

--------------------------------------------------------------------------------
/tests/test_urls.py:
--------------------------------------------------------------------------------
import os
import unittest

from scraper.urls import urls_from_html, Url


class TestUrlsFromHtml(unittest.TestCase):
    """
    Tests `urls_from_html` function
    """

    def _html(self):
        base_path = os.path.abspath(os.path.dirname(__file__))
        test_file_path = os.path.join(base_path, 'test_data', 'test.html')
        with open(test_file_path, 'r') as f:
            return f.read()

    def setUp(self):
        self.test_html = self._html()

    def test_urls(self):
        urls = urls_from_html(self.test_html, 'http://foobar.com/')
        urls = {x.url for x in urls}
        self.assertSetEqual(
            urls,
            {'https://my.3rdpartyblog.com', 'http://foobar.com/index.html',
             'http://foobar.com/?q=local_query', 'http://foobar.com/about'}
        )


class TestUrl(unittest.TestCase):
    """
    Tests `Url` class
    """

    def test_normalized(self):
        url = Url('https://m.facebook.com/mike.test?q=hello&foo=bar')
        self.assertEqual(url.normalized(), 'http://m.facebook.com/mike.test')

        url = Url('https://www.facebook.com/mike.test/wow/')
        self.assertEqual(url.normalized(), 'http://www.facebook.com/mike.test/wow')

    def test_complete_url_assertion(self):
        with self.assertRaises(ValueError):
            Url('fb.com/mike.test')

    def test_hash(self):
        url1 = Url('https://m.facebook.com/mike.test')
        url2 = Url('https://m.facebook.com/mike.test?q=q1')
        url3 = Url('https://m.facebook.com/mike.test?q=q2')

        s = {url1, url2, url3}
        self.assertEqual(len(s), 1)

    def test___eq__(self):
        url1 = Url('https://m.facebook.com/mike.test')
        url2 = Url('https://m.facebook.com/mike.test?q=q1')
        url3 = Url('https://m.facebook.com/mike.test?q=q2')

        self.assertTrue(url1 == url2 == url3)

--------------------------------------------------------------------------------
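
Note on the command pattern: the `commands` package currently ships only the `BaseCommand` interface, and the README's command list is still `TODO`. Purely as an illustration of how the pieces fit together — the file below is hypothetical and not part of the repository — a minimal command module could look like this. `available_commands()` picks up any module in `scraper/commands/` whose filename does not start with an underscore, and `get_commands()` expects each such module to expose a `Command` class.

```python
""" Hypothetical scraper/commands/crawl.py - a sketch, not included in the repo. """

from commands._base import BaseCommand
from scheduler import DownloadScheduler
from urls import Url


class Command(BaseCommand):

    help = 'Crawl a site and print the URL of every page that gets rendered.'

    def add_arguments(self, parser):
        # Illustrative arguments; whatever is added here is parsed and handed to execute().
        parser.add_argument('start_url', type=str, help='URL to start crawling from.')
        parser.add_argument('--processes', type=int, default=4)

    def execute(self, args):
        def callback(url, html):
            print(url)

        DownloadScheduler(
            callback,
            initial=[Url(args.start_url)],
            processes=args.processes,
        ).schedule()
```

With such a file in place, `python3 ./scraper crawl https://www.example.com/ --processes 2` would start a crawl (the URL here is, again, only an example).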