├── .coveragerc
├── .gitignore
├── .travis.yml
├── README.rst
├── examples
│   └── books.py
├── requirements.txt
├── scrapy_pyppeteer
│   ├── __init__.py
│   ├── browser_request.py
│   ├── browser_response.py
│   └── middleware.py
├── setup.py
├── tests
│   ├── __init__.py
│   ├── mockserver.py
│   └── test_crawl.py
└── tox.ini

--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
[run]
branch = True

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
*.egg-info

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
sudo: true
dist: xenial

branches:
  only:
    - master
    - /^\d\.\d+$/

matrix:
  include:
    - python: 3.6
      env: TOXENV=py36
    - python: 3.7
      env: TOXENV=py37

install:
  - pip install -U tox codecov

script: tox

after_success:
  - codecov

cache:
  directories:
    - $HOME/.cache/pip

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
scrapy-pyppeteer: use pyppeteer from a Scrapy spider
=====================================================

.. image:: https://img.shields.io/travis/lopuhin/scrapy-pyppeteer/master.svg
   :target: http://travis-ci.org/lopuhin/scrapy-pyppeteer
   :alt: Build Status

.. image:: https://codecov.io/github/lopuhin/scrapy-pyppeteer/coverage.svg?branch=master
   :target: https://codecov.io/github/lopuhin/scrapy-pyppeteer?branch=master
   :alt: Code Coverage

The goal is to allow using `pyppeteer <https://github.com/miyakogi/pyppeteer>`_
(a Python port of puppeteer) from a `Scrapy <https://scrapy.org>`_ spider.
This makes it possible to scrape sites that require JS to work properly,
and to make the scraper behave more like a human.

The current status is experimental, and the library will most likely remain
experimental, with a proper solution (which will be very different) included
in Scrapy later. The documentation assumes Scrapy knowledge.

Installation
------------

Python 3.6+ is required for
`PEP 525 <https://www.python.org/dev/peps/pep-0525/>`_ (asynchronous generators).

The library requires ``async def parse`` support in Scrapy; until this is
merged, please install Scrapy from a branch::

    pip install git+https://github.com/lopuhin/scrapy.git@async-def-parse

It also requires a pyppeteer fork with some fixes that are not released yet::

    pip install git+https://github.com/lopuhin/pyppeteer.git

Finally, install scrapy-pyppeteer itself::

    pip install git+https://github.com/lopuhin/scrapy-pyppeteer.git

Usage
-----

At the moment, browser management is implemented as a downloader middleware,
which you need to activate (update ``DOWNLOADER_MIDDLEWARES`` in settings)::

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_pyppeteer.ScrapyPyppeteerDownloaderMiddleware': 1000,
    }

After that you can use ``scrapy_pyppeteer.BrowserRequest``, and you'll get a
``scrapy_pyppeteer.BrowserResponse`` in your ``parse`` method.
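
For example, a minimal sketch of a spider issuing such a request (the spider
name and start URL are placeholders, and the middleware is assumed to be
activated as above)::

    import scrapy
    from scrapy_pyppeteer import BrowserRequest

    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def start_requests(self):
            # Handled by the browser middleware instead of the regular downloader
            yield BrowserRequest('http://books.toscrape.com')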

``BrowserResponse`` has an empty body and an extra attribute ``browser_tab``,
which is a ``pyppeteer.page.Page`` instance (a browser tab).
If you used ``BrowserRequest.blank()``, you'll get an empty tab,
and if you specified a URL, you'll get a tab where ``page.goto(url)``
has already been awaited.

To do anything with ``response.browser_tab``, you need to define your
parse callback as ``async def`` and use ``await`` syntax.
All actions performed via ``await`` are executed directly, without going
through the scheduler (although in the same global event loop). You can also
``yield`` items and new requests, which will work normally.

A short example of the parse method
(see more self-contained examples in the "examples" folder of this repo)::

    async def parse(self, response: BrowserResponse):
        page = response.browser_tab
        yield {'url': response.url}
        for link in await page.querySelectorAll('a'):
            url = await page.evaluate('link => link.href', link)
            yield BrowserRequest(url)
        await page.close()

Settings
--------

- ``PYPPETEER_LAUNCH``: a dict with pyppeteer launch options, see the
  ``pyppeteer.launch`` docstring. An example is sketched below.
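
For instance, a minimal sketch of such an entry in ``settings.py``
(``headless`` and ``args`` are standard pyppeteer launch options; the values
here are only illustrative)::

    PYPPETEER_LAUNCH = {
        'headless': False,          # show the browser window
        'args': ['--no-sandbox'],   # extra Chromium command-line switches
    }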

Notes on memory usage
---------------------

- You need to explicitly close the browser tab once you don't need it
  (e.g. at the end of the parse method); one way to do this is sketched below.
- Items yielded from a single parse method are kept in memory
  while the parse method is running, as are all of its local variables
  (the former is less obvious). Yielding a large number of big items from one
  parse method can increase the memory usage of your spider.
  Consider splitting the work into several parse methods.
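
As a sketch (based on the parse example above, and not the only possible
pattern), one way to make sure the tab is closed even if extraction fails is
to wrap the work in ``try``/``finally``::

    async def parse(self, response: BrowserResponse):
        page = response.browser_tab
        try:
            yield {'url': response.url}
            for link in await page.querySelectorAll('a'):
                url = await page.evaluate('link => link.href', link)
                yield BrowserRequest(url)
        finally:
            # Always release the browser tab, even if something above raised
            await page.close()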

Debugging
---------

If you want to put a ``pdb.set_trace()`` into the spider's parse method
and check the results of page manipulations that need to be awaited,
this won't work, because ``pdb`` blocks the event loop. One way that does
work is shown below
(although it does not use the spider or this library at all)::

    import asyncio
    import pyppeteer
    loop = asyncio.new_event_loop()
    run = loop.run_until_complete  # use "run(x)" instead of "await x" here
    browser = run(pyppeteer.launch(headless=False))  # headless=False so you can see what is executed
    page = run(browser.newPage())
    run(page.goto('http://books.toscrape.com'))
    print(len(run(page.xpath('//a[@href]'))))  # print the number of links

This allows you to interact with a page from a REPL and observe the effects
in the browser window.

TODO
----

- Set response status and headers
- A more ergonomic way to close the tab by default?
- More tests
- A way to schedule interactions reusing the same tab (to keep working in the
  same tab while still going through the scheduler), making sure one tab is
  used by only one parse method at a time.
- Nice extraction API (like parsel)
- A way to limit the maximum number of open tabs (a bit tricky)

--------------------------------------------------------------------------------
/examples/books.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy_pyppeteer import BrowserRequest, BrowserResponse


class BooksSpider(scrapy.Spider):
    """ Example spider for books.toscrape.com, using the headless browser
    for all scraping, based on https://github.com/scrapy/booksbot/.
    Run with::

        scrapy runspider examples/books.py

    """
    name = 'books'
    start_url = 'http://books.toscrape.com'
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_pyppeteer.ScrapyPyppeteerDownloaderMiddleware': 1000,
        },
        'CONCURRENT_REQUESTS': 8,
    }

    def start_requests(self):
        yield BrowserRequest(self.start_url)

    async def parse(self, response: BrowserResponse):
        page = response.browser_tab
        yield {'url': response.url}
        for link in await page.querySelectorAll('a'):
            url = await page.evaluate('link => link.href', link)
            yield BrowserRequest(url)
        await page.close()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
git+https://github.com/lopuhin/scrapy.git@async-def-parse
git+https://github.com/lopuhin/pyppeteer.git

--------------------------------------------------------------------------------
/scrapy_pyppeteer/__init__.py:
--------------------------------------------------------------------------------
# Public API
from .browser_request import BrowserRequest
from .browser_response import BrowserResponse
from .middleware import ScrapyPyppeteerDownloaderMiddleware


# Make pyppeteer and websockets less noisy with scrapy debug logging level
import logging
logging.getLogger('pyppeteer').setLevel(logging.INFO)
logging.getLogger('websockets.protocol').setLevel(logging.INFO)
del logging

--------------------------------------------------------------------------------
/scrapy_pyppeteer/browser_request.py:
--------------------------------------------------------------------------------
import scrapy


class BrowserRequest(scrapy.Request):
    _BLANK_URL = 'about:blank'

    @classmethod
    def blank(cls):
        return BrowserRequest(cls._BLANK_URL, dont_filter=True)

    @property
    def is_blank(self):
        return self.url == self._BLANK_URL

--------------------------------------------------------------------------------
/scrapy_pyppeteer/browser_response.py:
--------------------------------------------------------------------------------
from scrapy.http.response import Response
from pyppeteer.page import Page


class BrowserResponse(Response):
    def __init__(self, *args, **kwargs):
        self.browser_tab: Page = kwargs.pop('browser_tab')
        super(BrowserResponse, self).__init__(*args, **kwargs)

--------------------------------------------------------------------------------
/scrapy_pyppeteer/middleware.py:
--------------------------------------------------------------------------------
import asyncio
import logging
from typing import Optional

import pyppeteer
from pyppeteer.browser import Browser
from scrapy.settings import Settings
from twisted.internet.defer import Deferred

from .browser_request import BrowserRequest
from .browser_response import BrowserResponse


logger = logging.getLogger(__name__)


class ScrapyPyppeteerDownloaderMiddleware:
    """ Handles launching browser tabs, acts as a downloader.
    Probably eventually this should be moved to scrapy core as a downloader.
    """
    def __init__(self, settings: Settings):
        self._browser: Optional[Browser] = None
        self._launch_options = settings.getdict('PYPPETEER_LAUNCH') or {}

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_request(self, request, spider):
        if isinstance(request, BrowserRequest):
            return _aio_as_deferred(self.process_browser_request(request))
        else:
            # Not a browser request: return None so that the regular
            # downloader handles it.
            return None

    async def process_browser_request(self, request: BrowserRequest):
        if self._browser is None:
            self._browser = await pyppeteer.launch(**self._launch_options)
        page = await self._browser.newPage()
        n_tabs = _n_browser_tabs(self._browser)
        logger.debug(f'{n_tabs} tabs open')
        if request.is_blank:
            url = request.url
        else:
            await page.goto(request.url)
            url = page.url
        # TODO set status and headers
        return BrowserResponse(url=url, browser_tab=page)


def _n_browser_tabs(browser: Browser) -> int:
    """ A quick way to get the number of open browser tabs.
    """
    n_tabs = 0
    for context in browser.browserContexts:
        for target in context.targets():
            if target.type == 'page':
                n_tabs += 1
    return n_tabs


def _aio_as_deferred(f):
    return Deferred.fromFuture(asyncio.ensure_future(f))

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup


setup(
    name='scrapy-pyppeteer',
    packages=['scrapy_pyppeteer'],
    install_requires=[
        'scrapy',
        'pyppeteer',
    ],
)

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lopuhin/scrapy-pyppeteer/3839e36521a6757bcfd9afc78e47a1ae3705e931/tests/__init__.py

--------------------------------------------------------------------------------
/tests/mockserver.py:
--------------------------------------------------------------------------------
import sys, time
from subprocess import Popen, PIPE

from scrapy.utils.log import configure_logging
from twisted.internet import reactor
from twisted.web.resource import Resource
from twisted.web.server import Site


configure_logging()


class MockServer:
    def __init__(self, module='tests.mockserver'):
        self.proc = None
        self.module = module

    def __enter__(self):
        self.proc = Popen(
            [sys.executable, '-u', '-m', self.module], stdout=PIPE)
        self.proc.stdout.readline()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.proc.kill()
        self.proc.wait()
        time.sleep(0.2)


class Leaf(Resource):
    isLeaf = True

    def __init__(self, data: str):
        super().__init__()
        self._data = data

    def render(self, request):
        return f'<body>{self._data}</body>'.encode('utf8')


class Root(Resource):
    def __init__(self):
        super().__init__()
        self._resources = [
            ('leaf-1', Leaf('data-1')),
            ('leaf-2', Leaf('data-2')),
        ]
        for url, r in self._resources:
            self.putChild(url.encode('utf8'), r)

    def getChild(self, name, request):
        return self

    def render(self, request):
        return (
            '<body><p>Hi</p>' +
            '<br>'.join(
                f'<a href="{url}">{url}</a>' for url, _ in self._resources)
        ).encode('utf8')


PORT = 8781
ROOT_URL = f'http://127.0.0.1:{PORT}'


def main():
    http_port = reactor.listenTCP(PORT, Site(Root()))

    def print_listening():
        host = http_port.getHost()
        print('Mock server running at http://{}:{}'.format(
            host.host, host.port))

    reactor.callWhenRunning(print_listening)
    reactor.run()


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/tests/test_crawl.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy.crawler import CrawlerRunner
from twisted.internet import defer
from twisted.trial.unittest import TestCase

from scrapy_pyppeteer import BrowserRequest, BrowserResponse
from .mockserver import MockServer, ROOT_URL


class BaseSpider(scrapy.Spider):
    name = 'spider'
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_pyppeteer.ScrapyPyppeteerDownloaderMiddleware': 1000,
        }
    }


class FollowAllSpider(BaseSpider):
    def start_requests(self):
        yield BrowserRequest(ROOT_URL)

    async def parse(self, response: BrowserResponse):
        page = response.browser_tab
        yield {'url': response.url}
        for link in await page.querySelectorAll('a'):
            url = await page.evaluate('link => link.href', link)
            yield BrowserRequest(url)
        await page.close()


class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl()
        assert crawler.stats.get_value('item_scraped_count') == 3

--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py36,py37

[testenv]
deps =
    pytest
    pytest-cov

commands =
    pip install -r requirements.txt
    pyppeteer-install
    pip install -e .
    py.test --doctest-modules --cov=scrapy_pyppeteer {posargs: scrapy_pyppeteer tests}
--------------------------------------------------------------------------------
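
For reference, a typical way to run the test suite above locally is through
tox (assuming tox is installed; the ``pyppeteer-install`` step in the commands
downloads a Chromium build on first run)::

    pip install tox
    tox -e py37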