├── .coveragerc
├── .gitignore
├── .travis.yml
├── README.rst
├── examples
│   └── books.py
├── requirements.txt
├── scrapy_pyppeteer
│   ├── __init__.py
│   ├── browser_request.py
│   ├── browser_response.py
│   └── middleware.py
├── setup.py
├── tests
│   ├── __init__.py
│   ├── mockserver.py
│   └── test_crawl.py
└── tox.ini

--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
[run]
branch = True

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
*.egg-info

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
sudo: true
dist: xenial

branches:
  only:
    - master
    - /^\d\.\d+$/

matrix:
  include:
    - python: 3.6
      env: TOXENV=py36
    - python: 3.7
      env: TOXENV=py37

install:
  - pip install -U tox codecov

script: tox

after_success:
  - codecov

cache:
  directories:
    - $HOME/.cache/pip

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
scrapy-pyppeteer: use pyppeteer from a Scrapy spider
=====================================================

.. image:: https://img.shields.io/travis/lopuhin/scrapy-pyppeteer/master.svg
   :target: http://travis-ci.org/lopuhin/scrapy-pyppeteer
   :alt: Build Status

.. image:: https://codecov.io/github/lopuhin/scrapy-pyppeteer/coverage.svg?branch=master
   :target: https://codecov.io/github/lopuhin/scrapy-pyppeteer?branch=master
   :alt: Code Coverage

The goal is to allow using `pyppeteer <https://github.com/miyakogi/pyppeteer>`_
(a Python port of puppeteer) from a `Scrapy <https://scrapy.org>`_ spider.
This makes it possible to scrape sites that require JS to work properly,
and to make the scraper behave more like a human.

The current status is experimental, and the library will most likely remain
experimental, with a proper solution (which will be very different) included
in Scrapy later. The documentation assumes Scrapy knowledge.

Installation
------------

Python 3.6+ is required for
`PEP 525 <https://www.python.org/dev/peps/pep-0525/>`_ (asynchronous generators).

The library requires ``async def parse`` support in Scrapy; until this is
merged, please install Scrapy from a branch::

    pip install git+https://github.com/lopuhin/scrapy.git@async-def-parse

It also requires a pyppeteer fork with some fixes that are not released yet::

    pip install git+https://github.com/lopuhin/pyppeteer.git

Finally, install scrapy-pyppeteer itself::

    pip install git+https://github.com/lopuhin/scrapy-pyppeteer.git

Usage
-----

At the moment, browser management is implemented as a downloader middleware,
which you need to activate (update ``DOWNLOADER_MIDDLEWARES`` in settings)::

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_pyppeteer.ScrapyPyppeteerDownloaderMiddleware': 1000,
    }

After that you can use ``scrapy_pyppeteer.BrowserRequest``, and you'll get a
``scrapy_pyppeteer.BrowserResponse`` in your ``parse`` method.
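
For example, a minimal sketch of a spider issuing such a request (the spider
name and start URL are placeholders, and the middleware is assumed to be
activated as above)::

    import scrapy
    from scrapy_pyppeteer import BrowserRequest

    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def start_requests(self):
            # Handled by the browser middleware instead of the regular downloader
            yield BrowserRequest('http://books.toscrape.com')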

``BrowserResponse`` has an empty body and an extra attribute ``browser_tab``,
which is a ``pyppeteer.page.Page`` instance (a browser tab).
If you used ``BrowserRequest.blank()``, you'll get an empty tab,
and if you specified a URL, you'll get a tab where ``page.goto(url)``
has already been awaited.

To do anything with ``response.browser_tab``, you need to define your
parse callback as ``async def`` and use ``await`` syntax.
All actions performed via ``await`` are executed directly, without going
through the scheduler (although in the same global event loop). You can also
``yield`` items and new requests, which will work normally.

A short example of the parse method
(see more self-contained examples in the "examples" folder of this repo)::

    async def parse(self, response: BrowserResponse):
        page = response.browser_tab
        yield {'url': response.url}
        for link in await page.querySelectorAll('a'):
            url = await page.evaluate('link => link.href', link)
            yield BrowserRequest(url)
        await page.close()

Settings
--------

- ``PYPPETEER_LAUNCH``: a dict with pyppeteer launch options, see the
  ``pyppeteer.launch`` docstring. An example is sketched below.
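
For instance, a minimal sketch of such an entry in ``settings.py``
(``headless`` and ``args`` are standard pyppeteer launch options; the values
here are only illustrative)::

    PYPPETEER_LAUNCH = {
        'headless': False,          # show the browser window
        'args': ['--no-sandbox'],   # extra Chromium command-line switches
    }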

Notes on memory usage
---------------------

- You need to explicitly close the browser tab once you don't need it
  (e.g. at the end of the parse method); one way to do this is sketched below.
- Items yielded from a single parse method are kept in memory
  while the parse method is running, as are all of its local variables
  (the former is less obvious). Yielding a large number of big items from one
  parse method can increase the memory usage of your spider.
  Consider splitting the work into several parse methods.
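
As a sketch (based on the parse example above, and not the only possible
pattern), one way to make sure the tab is closed even if extraction fails is
to wrap the work in ``try``/``finally``::

    async def parse(self, response: BrowserResponse):
        page = response.browser_tab
        try:
            yield {'url': response.url}
            for link in await page.querySelectorAll('a'):
                url = await page.evaluate('link => link.href', link)
                yield BrowserRequest(url)
        finally:
            # Always release the browser tab, even if something above raised
            await page.close()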

Debugging
---------

If you want to put a ``pdb.set_trace()`` into the spider's parse method
and check the results of page manipulations that need to be awaited,
this won't work, because ``pdb`` blocks the event loop. One way that does
work is shown below
(although it does not use the spider or this library at all)::

    import asyncio
    import pyppeteer
    loop = asyncio.new_event_loop()
    run = loop.run_until_complete  # use "run(x)" instead of "await x" here
    browser = run(pyppeteer.launch(headless=False))  # headless=False so you can see what is executed
    page = run(browser.newPage())
    run(page.goto('http://books.toscrape.com'))
    print(len(run(page.xpath('//a[@href]'))))  # print the number of links

This allows you to interact with a page from a REPL and observe the effects
in the browser window.

TODO
----

- Set response status and headers
- A more ergonomic way to close the tab by default?
- More tests
- A way to schedule interactions reusing the same tab (to keep working in the
  same tab while still going through the scheduler), making sure one tab is
  used by only one parse method at a time.
- Nice extraction API (like parsel)
- A way to limit the maximum number of open tabs (a bit tricky)

--------------------------------------------------------------------------------
/examples/books.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy_pyppeteer import BrowserRequest, BrowserResponse


class BooksSpider(scrapy.Spider):
    """ Example spider for books.toscrape.com, using the headless browser
    for all scraping, based on https://github.com/scrapy/booksbot/.
    Run with::

        scrapy runspider examples/books.py

    """
    name = 'books'
    start_url = 'http://books.toscrape.com'
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_pyppeteer.ScrapyPyppeteerDownloaderMiddleware': 1000,
        },
        'CONCURRENT_REQUESTS': 8,
    }

    def start_requests(self):
        yield BrowserRequest(self.start_url)

    async def parse(self, response: BrowserResponse):
        page = response.browser_tab
        yield {'url': response.url}
        for link in await page.querySelectorAll('a'):
            url = await page.evaluate('link => link.href', link)
            yield BrowserRequest(url)
        await page.close()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
git+https://github.com/lopuhin/scrapy.git@async-def-parse
git+https://github.com/lopuhin/pyppeteer.git

--------------------------------------------------------------------------------
/scrapy_pyppeteer/__init__.py:
--------------------------------------------------------------------------------
# Public API
from .browser_request import BrowserRequest
from .browser_response import BrowserResponse
from .middleware import ScrapyPyppeteerDownloaderMiddleware


# Make pyppeteer and websockets less noisy with scrapy debug logging level
import logging
logging.getLogger('pyppeteer').setLevel(logging.INFO)
logging.getLogger('websockets.protocol').setLevel(logging.INFO)
del logging

--------------------------------------------------------------------------------
/scrapy_pyppeteer/browser_request.py:
--------------------------------------------------------------------------------
import scrapy


class BrowserRequest(scrapy.Request):
    _BLANK_URL = 'about:blank'

    @classmethod
    def blank(cls):
        return BrowserRequest(cls._BLANK_URL, dont_filter=True)

    @property
    def is_blank(self):
        return self.url == self._BLANK_URL

--------------------------------------------------------------------------------
/scrapy_pyppeteer/browser_response.py:
--------------------------------------------------------------------------------
from scrapy.http.response import Response
from pyppeteer.page import Page


class BrowserResponse(Response):
    def __init__(self, *args, **kwargs):
        self.browser_tab: Page = kwargs.pop('browser_tab')
        super(BrowserResponse, self).__init__(*args, **kwargs)

--------------------------------------------------------------------------------
/scrapy_pyppeteer/middleware.py:
--------------------------------------------------------------------------------
import asyncio
import logging
from typing import Optional

import pyppeteer
from pyppeteer.browser import Browser
from scrapy.settings import Settings
from twisted.internet.defer import Deferred

from .browser_request import BrowserRequest
from .browser_response import BrowserResponse


logger = logging.getLogger(__name__)


class ScrapyPyppeteerDownloaderMiddleware:
    """ Handles launching browser tabs, acts as a downloader.
    Probably eventually this should be moved to scrapy core as a downloader.
    """
    def __init__(self, settings: Settings):
        self._browser: Optional[Browser] = None
        self._launch_options = settings.getdict('PYPPETEER_LAUNCH') or {}

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_request(self, request, spider):
        if isinstance(request, BrowserRequest):
            return _aio_as_deferred(self.process_browser_request(request))
        else:
            # Not a browser request: return None so that the regular
            # downloader handles it.
            return None

    async def process_browser_request(self, request: BrowserRequest):
        if self._browser is None:
            self._browser = await pyppeteer.launch(**self._launch_options)
        page = await self._browser.newPage()
        n_tabs = _n_browser_tabs(self._browser)
        logger.debug(f'{n_tabs} tabs open')
        if request.is_blank:
            url = request.url
        else:
            await page.goto(request.url)
            url = page.url
        # TODO set status and headers
        return BrowserResponse(url=url, browser_tab=page)


def _n_browser_tabs(browser: Browser) -> int:
    """ A quick way to get the number of open browser tabs.
    """
    n_tabs = 0
    for context in browser.browserContexts:
        for target in context.targets():
            if target.type == 'page':
                n_tabs += 1
    return n_tabs


def _aio_as_deferred(f):
    return Deferred.fromFuture(asyncio.ensure_future(f))

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup


setup(
    name='scrapy-pyppeteer',
    packages=['scrapy_pyppeteer'],
    install_requires=[
        'scrapy',
        'pyppeteer',
    ],
)

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lopuhin/scrapy-pyppeteer/3839e36521a6757bcfd9afc78e47a1ae3705e931/tests/__init__.py

--------------------------------------------------------------------------------
/tests/mockserver.py:
--------------------------------------------------------------------------------
import sys, time
from subprocess import Popen, PIPE

from scrapy.utils.log import configure_logging
from twisted.internet import reactor
from twisted.web.resource import Resource
from twisted.web.server import Site


configure_logging()


class MockServer:
    def __init__(self, module='tests.mockserver'):
        self.proc = None
        self.module = module

    def __enter__(self):
        self.proc = Popen(
            [sys.executable, '-u', '-m', self.module], stdout=PIPE)
        self.proc.stdout.readline()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.proc.kill()
        self.proc.wait()
        time.sleep(0.2)


class Leaf(Resource):
    isLeaf = True

    def __init__(self, data: str):
        super().__init__()
        self._data = data

    def render(self, request):
        return f'<body>{self._data}</body>'.encode('utf8')


class Root(Resource):
    def __init__(self):
        super().__init__()
        self._resources = [
            ('leaf-1', Leaf('data-1')),
            ('leaf-2', Leaf('data-2')),
        ]
        for url, r in self._resources:
            self.putChild(url.encode('utf8'), r)

    def getChild(self, name, request):
        return self

    def render(self, request):
        return (
            '<body><p>Hi</p>' +
            '<br>'.join(
                f'<a href="{url}">{url}</a>' for url, _ in self._resources)
        ).encode('utf8')


PORT = 8781
ROOT_URL = f'http://127.0.0.1:{PORT}'


def main():
    http_port = reactor.listenTCP(PORT, Site(Root()))

    def print_listening():
        host = http_port.getHost()
        print('Mock server running at http://{}:{}'.format(
            host.host, host.port))

    reactor.callWhenRunning(print_listening)
    reactor.run()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/tests/test_crawl.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy.crawler import CrawlerRunner
from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy_pyppeteer import BrowserRequest, BrowserResponse
from .mockserver import MockServer, ROOT_URL


class BaseSpider(scrapy.Spider):
    name = 'spider'
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_pyppeteer.ScrapyPyppeteerDownloaderMiddleware': 1000,
        }
    }


class FollowAllSpider(BaseSpider):
    def start_requests(self):
        yield BrowserRequest(ROOT_URL)

    async def parse(self, response: BrowserResponse):
        page = response.browser_tab
        yield {'url': response.url}
        for link in await page.querySelectorAll('a'):
            url = await page.evaluate('link => link.href', link)
            yield BrowserRequest(url)
        await page.close()


class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl()
        assert crawler.stats.get_value('item_scraped_count') == 3

--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py36,py37

[testenv]
deps =
    pytest
    pytest-cov

commands =
    pip install -r requirements.txt
    pyppeteer-install
    pip install -e .
    py.test --doctest-modules --cov=scrapy_pyppeteer {posargs: scrapy_pyppeteer tests}
--------------------------------------------------------------------------------
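
For reference, a typical way to run the test suite above locally is through
tox (assuming tox is installed; the ``pyppeteer-install`` step in the commands
downloads a Chromium build on first run)::

    pip install tox
    tox -e py37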