├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── pull_request_template.md
│   └── workflows
│       ├── python-publish.yml
│       ├── python-test.yml
│       └── ruff.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── SECURITY.md
├── examples
│   ├── __init__.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── auto_recaptcha.py
│       ├── compose.py
│       ├── fill_form.py
│       ├── follow.py
│       ├── har.py
│       ├── manual_recaptcha.py
│       ├── meduza.py
│       └── webscraperio.py
├── pyproject.toml
├── pytest.ini.example
├── requirements.txt
├── scrapy.cfg
├── scrapypuppeteer
│   ├── __init__.py
│   ├── actions.py
│   ├── browser_managers
│   │   ├── __init__.py
│   │   ├── playwright_browser_manager.py
│   │   ├── pyppeteer_browser_manager.py
│   │   └── service_browser_manager.py
│   ├── middleware.py
│   ├── request.py
│   └── response.py
├── setup.py
└── tests
    ├── actions
    │   ├── constants.py
    │   └── test_actions.py
    ├── middleware
    │   ├── test_middleware.py
    │   └── view.py
    ├── mockserver.py
    ├── scrapy_logo.png
    └── spiders.py

/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Set the essential settings in settings.py
16 | 2. Provide the sequence of actions performed in your spider
17 | 3. Attach a log file with the error
18 | 4. See the error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Python version: [e.g. 3.11.4]
29 | - Scrapy version: [e.g. 2.10]
30 | - Scrapy-puppeteer version: [e.g. 0.1.0]
31 | 
32 | **Additional context**
33 | Add any other context about the problem here.
34 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for scrapy-puppeteer
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # Title
2 | 
3 | Please provide a meaningful title.
4 | 
5 | # Description
6 | 
7 | Please include a summary of the changes and the related issue. Please also include relevant motivation and context.
8 | List any dependencies that are required for this change.
9 | 
10 | ## Type of change
11 | 
12 | Please delete options that are not relevant.
13 | 
14 | - [ ] Bug fix (non-breaking change which fixes an issue)
15 | - [ ] New feature (non-breaking change which adds functionality)
16 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
17 | - [ ] This change requires a documentation update
18 | 
19 | # How Has This Been Tested?
20 | 
21 | Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce.
22 | Please also list any relevant details for your test configuration.
23 | 
24 | - [ ] Test A
25 | - [ ] Test B
26 | 
27 | **Test Configuration**:
28 | * Software version:
29 | * Hardware:
30 | 
31 | # Checklist:
32 | 
33 | - [ ] My code follows the style guidelines of this project
34 | - [ ] I have performed a self-review of my code
35 | - [ ] I have commented my code, particularly in hard-to-understand areas
36 | - [ ] I have made corresponding changes to the documentation
37 | - [ ] My changes generate no new warnings
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | 
4 | name: Upload Python Package
5 | 
6 | on:
7 |   release:
8 |     types: [created]
9 | 
10 | jobs:
11 |   deploy:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |     - uses: actions/checkout@v2
15 |     - name: Set up Python
16 |       uses: actions/setup-python@v2
17 |       with:
18 |         python-version: '3.x'
19 |     - name: Install dependencies
20 |       run: |
21 |         python -m pip install --upgrade pip
22 |         pip install setuptools wheel twine
23 |     - name: Build and publish
24 |       env:
25 |         TWINE_USERNAME: __token__
26 |         TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
27 |       run: |
28 |         python setup.py sdist bdist_wheel
29 |         twine upload dist/*
30 | 
--------------------------------------------------------------------------------
/.github/workflows/python-test.yml:
--------------------------------------------------------------------------------
1 | name: Test Scrapy-Puppeteer Library
2 | 
3 | on: [push, pull_request]
4 | 
5 | jobs:
6 |   tests:
7 |     runs-on: ubuntu-latest
8 |     strategy:
9 |       matrix:
10 |         include:
11 |           - python-version: "3.8.x"  # Min Python version (no 3.7 version in GitHub repository)
12 |           - python-version: "3.9.x"
13 |           - python-version: "3.10.x"
14 |           - python-version: "3.11.x"
15 |           - python-version: "3.12.x"
16 |           - python-version: "3.13.x"
17 |           - python-version: "3.x"  # Latest Python version
18 |     steps:
19 |       - uses: actions/checkout@v3
20 | 
21 |       - name: Set up Python ${{ matrix.python-version }}
22 |         uses: actions/setup-python@v2
23 |         with:
24 |           python-version: ${{ matrix.python-version }}
25 | 
26 |       - name: Install Dependencies
27 |         run: |
28 |           python -m pip install --upgrade pip
29 |           pip install pytest
30 |           pip install -r requirements.txt
31 | 
32 |       - name: Run Tests
33 |         run: |
34 |           python -m pytest
35 | 
--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
1 | name: Ruff Code Check
2 | 
3 | on: [push, pull_request]
4 | 
5 | jobs:
6 |   ruff:
7 |     runs-on: ubuntu-latest
8 | 
9 |     steps:
10 |       - uses: actions/checkout@v3
11 | 
12 |       - name: Set up Python
13 |         uses: actions/setup-python@v2
14 |         with:
15 |           python-version: "3.x"
16 | 
17 |       - name: Install Ruff
18 |         run: |
19 |           pip install ruff
20 | 
21 |       - name: Run Ruff Format
22 |         run: |
23 |           ruff format --check
24 | 
25 |       - name: Run Ruff Check
26 |         run: |
27 |           ruff check .
28 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # *Code Of Conduct*
2 | 
3 | ---
4 | Just like any other technical community, the Scrapy-puppeteer team and community are made up
5 | of many people from all over the world. We want to bring them together into one huge *Scrapy-puppeteer* community.
6 | 
7 | As there are many diverse people with different points of view, we decided to write rules for contributing,
8 | filing issues, and discussion. We hope everybody will follow them and enjoy spending time within the community!
9 | 
10 | This code of conduct applies to all spaces managed by the Scrapy-puppeteer project and to all members
11 | of its community.
12 | 
13 | If you believe someone is violating the code of conduct, we ask that you report it by
14 | emailing us.
15 | 
16 | * Be friendly and patient.
17 | * Be welcoming. We strive to be a community that welcomes and supports people of all backgrounds and identities.
18 | This includes, but is not limited to members of any race, ethnicity, culture, national origin, colour, immigration
19 | status, social and economic class, educational level, sex, age, size, family status, political belief, religion,
20 | and mental and physical ability.
21 | * Be considerate. Your work will be used by other people, and you in turn will depend on the work of others.
22 | Any decision you take will affect users and colleagues, and you should take those consequences into account
23 | when making decisions. Remember that we're a world-wide community, so you might not be communicating in
24 | someone else's primary language.
25 | * Be respectful. Not all of us will agree all the time, but disagreement is no excuse for poor behavior or
26 | bad manners. We might all experience some frustration now and then, but we cannot allow that frustration to
27 | turn into a personal attack. It’s important to remember that a community where people feel uncomfortable or
28 | threatened is not a productive one. Members of the Scrapy-puppeteer community should be respectful when dealing
29 | with other members as well as with people outside the community.
30 | * Be careful in the words that you choose. We are a community of professionals, and we conduct ourselves
31 | professionally. Be kind to others. Do not insult or put down other participants. Harassment and other exclusionary
32 | behavior aren't acceptable. This includes, but is not limited to:
33 |   1. Violent threats or language directed against another person.
34 |   2. Discriminatory jokes and language.
35 |   3. Posting (or threatening to post) other people's personally identifying information ("doxing").
36 |   4. Personal insults.
37 |   5. Advocating for, or encouraging, any of the above behavior.
38 |   6. Repeated harassment of others. In general, if someone asks you to stop, then stop.
39 | * When we disagree, try to understand why. Disagreements, both social and technical, happen all the time and
40 | Scrapy-puppeteer is no exception. It is important that we resolve disagreements and differing views constructively.
41 | Remember that we’re different. The strength of Scrapy-puppeteer comes from its varied community, people from a
42 | wide range of backgrounds.
Different people have different perspectives on issues. Being unable to understand why
43 | someone holds a viewpoint doesn’t mean that they’re wrong. Don’t forget that it is human to err and blaming
44 | each other doesn’t get us anywhere. Instead, focus on helping to resolve issues and learning from mistakes.
45 | 
46 | 
47 | Original text from [Django](https://www.djangoproject.com/conduct/).
48 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Scrapy-puppeteer
2 | 
3 | ---
4 | As an open source project, Scrapy-puppeteer welcomes contributions of many forms.
5 | 
6 | Examples of contributions include:
7 | 
8 | * Code patches
9 | * Documentation improvements
10 | * Bug reports and patch reviews
11 | 
12 | **Warning: any pull request without an informative title and a meaningful description will be closed!**
13 | 
14 | Please formalize any pull request (PR) you submit.
15 | 
16 | **Before each push or PR, run in the root directory of the project:**
17 | 
18 | ```bash
19 | ruff check
20 | 
21 | ruff format
22 | ```
23 | 
24 | ---
25 | # Code of Conduct
26 | 
27 | As a contributor, you can help us keep the Scrapy-puppeteer community open and inclusive.
28 | Please read and follow our [Code Of Conduct](https://github.com/ispras/scrapy-puppeteer/blob/master/CODE_OF_CONDUCT.md).
29 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2019, Institute for System Programming, Russian Academy of Sciences
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy-puppeteer-client
2 | This package aims to manage a Chrome browser with [Puppeteer](https://github.com/GoogleChrome/puppeteer) from [Scrapy](https://github.com/scrapy/scrapy/) spiders.
3 | This makes it possible to scrape sites that require JS to function properly and to make the scraper behave more like a human.
4 | It is a client library for [scrapy-puppeteer-service](https://github.com/ispras/scrapy-puppeteer-service).
5 | 
6 | ## ⚠️ This repository is under development.
7 | 
8 | This project is under development. Use it at your own risk.
9 | 
10 | ## Installation
11 | 
12 | Using pip (master branch):
13 | ```shell script
14 | $ pip install scrapy-puppeteer-client
15 | ```
16 | 
17 | ## Configuration
18 | 
19 | You should have [scrapy-puppeteer-service](https://github.com/ispras/scrapy-puppeteer-service) started.
20 | Then add its URL to `settings.py` and enable the puppeteer downloader middleware:
21 | ```python
22 | DOWNLOADER_MIDDLEWARES = {
23 |     'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
24 | }
25 | 
26 | PUPPETEER_SERVICE_URL = "http://localhost:3000"  # Not necessary for other execution methods
27 | 
28 | # To change the execution method, you must add the corresponding setting:
29 | EXECUTION_METHOD = "Puppeteer"
30 | ```
31 | Available methods: `Puppeteer`, `Pyppeteer`, `Playwright`
32 | 
33 | The `Pyppeteer` and `Playwright` methods do not require a running service.
34 | They use the pyppeteer and playwright libraries for Python to interact with the browser.
35 | Actions such as `CustomJsAction`, `RecaptchaSolver`, and `Har` are not available when using these methods.
36 | 
37 | To use the `Pyppeteer` or `Playwright` methods, you need to install Chromium.
38 | 
39 | ## Basic usage
40 | 
41 | Use `scrapypuppeteer.PuppeteerRequest` instead of `scrapy.Request` to render URLs with Puppeteer:
42 | ```python
43 | import scrapy
44 | from scrapypuppeteer import PuppeteerRequest
45 | 
46 | class MySpider(scrapy.Spider):
47 |     ...
48 |     def start_requests(self):
49 |         yield PuppeteerRequest('https://example.com', callback=self.parse)
50 | 
51 |     def parse(self, response):
52 |         links = response.css(...)
53 |         ...
54 | ```
55 | 
56 | ## Puppeteer responses
57 | 
58 | There is a parent `PuppeteerResponse` class from which the other response classes are inherited.
59 | 
60 | Here is a list of them all:
61 | - `PuppeteerHtmlResponse` - has `html` and `cookies` properties
62 | - `PuppeteerScreenshotResponse` - has a `screenshot` property
63 | - `PuppeteerHarResponse` - has a `har` property
64 | - `PuppeteerJsonResponse` - has a `data` property and a `to_html()` method which tries to transform it into a `PuppeteerHtmlResponse`
65 | - `PuppeteerRecaptchaSolverResponse(PuppeteerJsonResponse, PuppeteerHtmlResponse)` - has a `recaptcha_data` property
66 | 
67 | ## Advanced usage
68 | 
69 | `PuppeteerRequest`'s first argument is a browser action.
70 | Available actions are defined in the `scrapypuppeteer.actions` module as subclasses of `PuppeteerServiceAction`.
71 | Passing a URL into the request is a shortcut for the `GoTo(url)` action.
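For instance, the shortcut form and the explicit action form below are equivalent; the explicit form additionally lets you pass navigation and wait options (the option values here are illustrative, not defaults):

```python
import scrapy
from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo

class MySpider(scrapy.Spider):
    ...
    def start_requests(self):
        # Shortcut form: PuppeteerRequest('https://example.com', callback=self.parse)
        yield PuppeteerRequest(
            GoTo(
                'https://example.com',
                navigation_options={'waitUntil': 'networkidle2'},
                wait_options={'selectorOrTimeout': 3000},  # wait 3 seconds
            ),
            callback=self.parse,
        )
```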
72 | 
73 | Here is the list of available actions:
74 | - `GoTo(url, options)` - navigate to a URL
75 | - `GoForward(options)` - navigate forward in history
76 | - `GoBack(options)` - navigate back in history
77 | - `Click(selector, click_options, wait_options)` - click on an element on the page
78 | - `Compose(*actions)` - composition of several puppeteer actions
79 | - `Scroll(selector, wait_options)` - scroll the page
80 | - `Screenshot(options)` - take a screenshot
81 | - `Har()` - to get the HAR file, pass the `har_recording=True` argument to `PuppeteerRequest` at the start of execution.
82 | - `FillForm(input_mapping, submit_button)` - fill out and submit forms on a page.
83 | - `RecaptchaSolver(solve_recaptcha, close_on_empty, options)` - find or solve recaptcha on a page
84 | - `CustomJsAction(js_function)` - evaluate a JS function on the page
85 | 
86 | Available options essentially mirror [service](https://github.com/ispras/scrapy-puppeteer-service) method parameters, which in turn mirror puppeteer API functions to some extent.
87 | See the `scrapypuppeteer.actions` module for details.
88 | 
89 | You may pass the `close_page=False` option to a request to retain the browser tab and its state after the request's completion.
90 | Then use `response.follow` to continue interacting with the same tab:
91 | 
92 | ```python
93 | import scrapy
94 | from scrapypuppeteer import PuppeteerRequest, PuppeteerHtmlResponse
95 | from scrapypuppeteer.actions import Click
96 | 
97 | class MySpider(scrapy.Spider):
98 |     ...
99 |     def start_requests(self):
100 |         yield PuppeteerRequest(
101 |             'https://example.com',  # will be transformed into a GoTo action
102 |             close_page=False,
103 |             callback=self.parse,
104 |         )
105 | 
106 |     def parse(self, response: PuppeteerHtmlResponse):
107 |         ...
108 |         # parse and yield some items
109 |         ...
110 |         next_page_selector = 'button.next-page-or-smth'
111 |         if response.css(next_page_selector):
112 |             yield response.follow(
113 |                 Click(
114 |                     next_page_selector,
115 |                     wait_options={'selectorOrTimeout': 3000},  # wait 3 seconds
116 |                 ),
117 |                 close_page=False,
118 |                 callback=self.parse,
119 |             )
120 | ```
121 | 
122 | You may also use the `follow_all` method to continue interacting.
123 | 
124 | On your first request, the service will create a new incognito browser context and a new page in it.
125 | Their ids will be returned in the response object as `context_id` and `page_id` attributes.
126 | Following such a response means passing the context and page ids to the next request.
127 | You may also specify a request's context and page ids directly.
128 | 
129 | Right before your spider finishes crawling, the service middleware will take care
130 | of closing all used browser contexts with `scrapypuppeteer.CloseContextRequest`.
131 | It accepts a list of all browser contexts to be closed.
132 | 
133 | One may customize which of a `PuppeteerRequest`'s headers will be sent to the remote website by the service
134 | via the `include_headers` attribute in the request or globally with the `PUPPETEER_INCLUDE_HEADERS` setting.
135 | Available values are True (all headers), False (no headers) or a list of header names.
136 | By default, only cookies are sent.
137 | 
138 | You may also want to send meta with your request. By default, you are not allowed to do this,
139 | in order to sustain backward compatibility. You can change this behaviour by setting `PUPPETEER_INCLUDE_META` to True.
140 | 
141 | ## Automatic recaptcha solving
142 | 
143 | Enable PuppeteerRecaptchaDownloaderMiddleware to automatically solve recaptcha during scraping.
We do not recommend
144 | using the RecaptchaSolver action while the middleware is active.
145 | 
146 | ```Python
147 | DOWNLOADER_MIDDLEWARES = {
148 |     'scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware': 1041,
149 |     'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
150 | }
151 | ```
152 | Note that RecaptchaMiddleware's priority number has to be lower than ServiceMiddleware's.
153 | You must provide some settings to use the middleware:
154 | ```Python
155 | PUPPETEER_INCLUDE_META = True  # Essential to send meta
156 | 
157 | RECAPTCHA_ACTIVATION = True  # Enables the middleware
158 | RECAPTCHA_SOLVING = False  # Whether to solve recaptcha automatically
159 | RECAPTCHA_SUBMIT_SELECTORS = {  # Selectors for "submit recaptcha" button
160 |     'www.google.com/recaptcha/api2/demo': '',  # No selectors needed
161 | }
162 | ```
163 | If you set RECAPTCHA_SOLVING to False, the middleware will still try to find captchas
164 | and will notify you about the number of captchas found on the page.
165 | 
166 | If you don't want the middleware to work on a specific request, you may provide a special meta key: `'dont_recaptcha': True`.
167 | In this case RecaptchaMiddleware will just skip the request.
168 | 
169 | ## TODO
170 | 
171 | - [x] skeleton that could handle goto, click, scroll, and actions
172 | - [ ] headers and cookies management
173 | - [ ] proxy support for puppeteer
174 | - [x] error handling for requests
175 | - [x] har support
176 | 
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | 
3 | ## Supported Versions
4 | 
5 | Currently, we are able to support only the latest version of Scrapy-puppeteer. We are actively maintaining it
6 | and trying to improve its stability, performance and user experience.
7 | 
8 | If you are able to maintain old versions of Scrapy-puppeteer, contact us!
9 | 
10 | | Version | Supported |
11 | |-----------|-----------|
12 | | latest | YES |
13 | | < latest | NO |
14 | 
15 | 
16 | ## Reporting a Vulnerability
17 | 
18 | It is possible that you find a bug, vulnerability, or other issue.
19 | Please make it known to us via the issue tab on GitHub or by contacting us.
20 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ispras/scrapy-puppeteer/f666232c98a34bbfcaf21aabff51cab54627e62e/examples/__init__.py -------------------------------------------------------------------------------- /examples/settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = "scrapypuppeteer" 2 | 3 | SPIDER_MODULES = ["examples.spiders"] 4 | NEWSPIDER_MODULE = "examples.spiders" 5 | 6 | CONCURRENT_REQUESTS = 1 7 | 8 | DOWNLOADER_MIDDLEWARES = { 9 | "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042 10 | } 11 | 12 | PUPPETEER_SERVICE_URL = "http://localhost:3000" 13 | 14 | PUPPETEER_LOCAL = False 15 | -------------------------------------------------------------------------------- /examples/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ispras/scrapy-puppeteer/f666232c98a34bbfcaf21aabff51cab54627e62e/examples/spiders/__init__.py -------------------------------------------------------------------------------- /examples/spiders/auto_recaptcha.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | 4 | import scrapy 5 | from twisted.python.failure import Failure 6 | 7 | from scrapypuppeteer import PuppeteerRequest 8 | from scrapypuppeteer.actions import GoTo, Screenshot 9 | from scrapypuppeteer.response import PuppeteerResponse, PuppeteerScreenshotResponse 10 | 11 | 12 | class AutoRecaptchaSpider(scrapy.Spider): 13 | name = "auto_recaptcha" 14 | 15 | start_urls = ["https://www.google.com/recaptcha/api2/demo"] 16 | 17 | custom_settings = { 18 | "DOWNLOADER_MIDDLEWARES": { 19 | "scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware": 1041, 20 | "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042, 21 | }, 22 | "PUPPETEER_INCLUDE_META": True, 23 | "RECAPTCHA_ACTIVATION": True, 24 | "RECAPTCHA_SOLVING": True, 25 | "RECAPTCHA_SUBMIT_SELECTORS": { 26 | "www.google.com/recaptcha/api2/demo": "#recaptcha-demo-submit", 27 | }, 28 | } 29 | 30 | def start_requests(self): 31 | for url in self.start_urls: 32 | action = GoTo(url=url) 33 | yield PuppeteerRequest( 34 | action=action, 35 | callback=self.parse_html, 36 | errback=self.error, 37 | close_page=False, 38 | ) 39 | 40 | def parse_html(self, response: PuppeteerResponse, **kwargs): 41 | with open("recaptcha_page.html", "wb") as f: 42 | f.write(response.body) 43 | action = Screenshot( 44 | options={ 45 | "full_page": True, 46 | } 47 | ) 48 | yield response.follow( 49 | action, callback=self.make_screenshot, errback=self.error, close_page=True 50 | ) 51 | 52 | @staticmethod 53 | def make_screenshot(response: PuppeteerScreenshotResponse, **kwargs): 54 | data = ( 55 | response.screenshot 56 | ) # Note that data is string containing bytes, don't forget to decode them! 
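        # The service returns the screenshot as a base64-encoded string,
        # which is why base64.b64decode() is applied below before writing the file.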
57 | with open("imageToSave.png", "wb") as fh: 58 | fh.write(base64.b64decode(data)) 59 | 60 | def error(self, failure: Failure): 61 | self.log("We are in error function!", level=logging.WARNING) 62 | -------------------------------------------------------------------------------- /examples/spiders/compose.py: -------------------------------------------------------------------------------- 1 | from logging import ERROR 2 | 3 | import scrapy 4 | from scrapy.utils.log import failure_to_exc_info 5 | from twisted.python.failure import Failure 6 | 7 | from scrapypuppeteer import ( 8 | PuppeteerRequest, 9 | PuppeteerResponse, 10 | PuppeteerScreenshotResponse, 11 | ) 12 | from scrapypuppeteer.actions import Click, Compose, GoTo, Screenshot, Scroll 13 | 14 | 15 | class ComposeSpider(scrapy.Spider): 16 | name = "compose" 17 | 18 | custom_settings = { 19 | "DOWNLOADER_MIDDLEWARES": { 20 | "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042, 21 | }, 22 | } 23 | 24 | def start_requests(self): 25 | goto = GoTo("https://pptr.dev") 26 | click_1 = Click( 27 | "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)" 28 | ) 29 | click_2 = Click( 30 | "#__docusaurus_skipToContent_fallback > div > div > aside > div > " 31 | "div > nav > ul > li:nth-child(1) > ul > li:nth-child(3) > a" 32 | ) 33 | click = Compose(click_1, click_2) 34 | scroll = Scroll() 35 | screenshot = Screenshot(options={"full_page": True, "type": "jpeg"}) 36 | 37 | compose_action = Compose( 38 | goto, 39 | click, 40 | scroll, 41 | screenshot, 42 | ) 43 | 44 | yield PuppeteerRequest( 45 | compose_action, 46 | callback=self.parse, 47 | errback=self.errback, 48 | close_page=True, 49 | ) 50 | 51 | def parse(self, response: PuppeteerResponse): 52 | assert isinstance(response, PuppeteerScreenshotResponse) 53 | self.log("Spider worked fine!") 54 | 55 | def errback(self, failure: Failure): 56 | print(failure) 57 | self.log(failure_to_exc_info(failure), level=ERROR) 58 | -------------------------------------------------------------------------------- /examples/spiders/fill_form.py: -------------------------------------------------------------------------------- 1 | import base64 2 | 3 | import scrapy 4 | 5 | from scrapypuppeteer import PuppeteerRequest, PuppeteerScreenshotResponse 6 | from scrapypuppeteer.actions import FillForm, Screenshot 7 | 8 | 9 | class FormActionSpider(scrapy.Spider): 10 | name = "fill_form" 11 | start_urls = ["https://www.roboform.com/filling-test-all-fields"] 12 | 13 | def start_requests(self): 14 | for url in self.start_urls: 15 | yield PuppeteerRequest(url, callback=self.form_action, close_page=False) 16 | 17 | def form_action(self, response): 18 | input_mapping = { 19 | 'input[name="02frstname"]': {"value": "SomeName", "delay": 50}, 20 | 'input[name="05_company"]': {"value": "SomeCompany", "delay": 100}, 21 | 'input[name="06position"]': {"value": "SomePosition", "delay": 100}, 22 | } 23 | 24 | yield response.follow( 25 | FillForm(input_mapping), close_page=False, callback=self.screenshot 26 | ) 27 | 28 | def screenshot(self, response): 29 | action = Screenshot( 30 | options={ 31 | "fullPage": True, 32 | } 33 | ) 34 | yield response.follow(action, callback=self.make_screenshot, close_page=False) 35 | 36 | @staticmethod 37 | def make_screenshot(response: PuppeteerScreenshotResponse, **kwargs): 38 | data = response.screenshot 39 | with open("screenshot.png", "wb") as fh: 40 | fh.write(base64.b64decode(data)) 41 | 
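# Usage sketch (assumes the example settings in this repository and a
# scrapy-puppeteer-service instance running at PUPPETEER_SERVICE_URL):
#   scrapy crawl fill_form
# The spider types into the test form's fields, then stores a full-page
# screenshot in screenshot.png, decoded from the base64 string in the response.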
-------------------------------------------------------------------------------- /examples/spiders/follow.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider 2 | from scrapy.http import Response 3 | 4 | from scrapypuppeteer import GoTo, PuppeteerRequest, PuppeteerResponse 5 | 6 | 7 | class FollowSpider(Spider): 8 | name = "follow" 9 | 10 | start_urls = ["http://quotes.toscrape.com/page/1/"] 11 | 12 | def start_requests(self): 13 | for url in self.start_urls: 14 | yield PuppeteerRequest( 15 | GoTo(url), 16 | close_page=False, 17 | callback=self.goto_about, 18 | errback=self.errback, 19 | ) 20 | 21 | def goto_about(self, response: PuppeteerResponse): 22 | # yield response.follow( 23 | # response.css("div.quote span a")[0], 24 | # callback=self.parse, 25 | # errback=self.errback, 26 | # close_page=False, 27 | # ) 28 | 29 | # Or: 30 | yield from response.follow_all( 31 | response.css("div.quote span a"), 32 | callback=self.parse, 33 | errback=self.errback, 34 | close_page=True, 35 | ) 36 | 37 | # Or: 38 | # yield from response.follow_all( 39 | # css="div.quote span a", 40 | # callback=self.parse, 41 | # errback=self.errback, 42 | # close_page=False, 43 | # ) 44 | 45 | def parse(self, response: Response, **kwargs): 46 | self.log(response.url.split("/")[-1]) 47 | 48 | def errback(self, failure): 49 | self.log(failure) 50 | -------------------------------------------------------------------------------- /examples/spiders/har.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | from scrapypuppeteer import PuppeteerRequest 4 | from scrapypuppeteer.actions import Har 5 | 6 | 7 | def write_to_file(file_path, content): 8 | with open(file_path, "a", encoding="utf-8") as file: 9 | file.write(content) 10 | 11 | 12 | class HarSpider(scrapy.Spider): 13 | name = "har" 14 | start_urls = ["https://github.com/pyppeteer/pyppeteer"] 15 | 16 | def start_requests(self): 17 | for url in self.start_urls: 18 | yield PuppeteerRequest( 19 | url, callback=self.har, close_page=False, har_recording=True 20 | ) 21 | 22 | def har(self, response): 23 | yield response.follow( 24 | Har(), 25 | close_page=False, 26 | callback=self.save_har, 27 | ) 28 | 29 | def save_har(self, response): 30 | write_to_file("result.har", response.har) 31 | -------------------------------------------------------------------------------- /examples/spiders/manual_recaptcha.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | 4 | import scrapy 5 | from twisted.python.failure import Failure 6 | 7 | from scrapypuppeteer import PuppeteerRequest 8 | from scrapypuppeteer.actions import Click, GoTo, RecaptchaSolver, Screenshot 9 | from scrapypuppeteer.response import PuppeteerResponse, PuppeteerScreenshotResponse 10 | 11 | 12 | class ManualRecaptchaSpider(scrapy.Spider): 13 | name = "manual_recaptcha" 14 | 15 | start_urls = ["https://www.google.com/recaptcha/api2/demo"] 16 | 17 | def start_requests(self): 18 | for url in self.start_urls: 19 | action = GoTo(url=url) 20 | yield PuppeteerRequest( 21 | action=action, 22 | callback=self.solve_recaptcha, 23 | errback=self.error, 24 | close_page=False, 25 | ) 26 | 27 | def solve_recaptcha(self, response: PuppeteerResponse, **kwargs): 28 | action = RecaptchaSolver() 29 | yield response.follow( 30 | action=action, 31 | callback=self.submit_recaptcha, 32 | errback=self.error, 33 | close_page=False, 34 | ) 35 | 36 | def 
submit_recaptcha(self, response, **kwargs): 37 | action = Click("#recaptcha-demo-submit") 38 | yield response.follow( 39 | action=action, 40 | callback=self.parse_html, 41 | errback=self.error, 42 | close_page=False, 43 | ) 44 | 45 | def parse_html(self, response: PuppeteerResponse, **kwargs): 46 | with open("recaptcha_page.html", "wb") as f: 47 | f.write(response.body) 48 | action = Screenshot( 49 | options={ 50 | "full_page": True, 51 | } 52 | ) 53 | yield response.follow( 54 | action, callback=self.make_screenshot, errback=self.error, close_page=True 55 | ) 56 | 57 | @staticmethod 58 | def make_screenshot(response: PuppeteerScreenshotResponse, **kwargs): 59 | data = ( 60 | response.screenshot 61 | ) # Note that data is string containing bytes, don't forget to decode them! 62 | with open("imageToSave.png", "wb") as fh: 63 | fh.write(base64.b64decode(data)) 64 | 65 | def error(self, failure: Failure): 66 | self.log("We are in error function!", level=logging.WARNING) 67 | -------------------------------------------------------------------------------- /examples/spiders/meduza.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | from scrapypuppeteer import PuppeteerHtmlResponse, PuppeteerRequest 4 | 5 | 6 | class MeduzaSpider(scrapy.Spider): 7 | name = "meduza" 8 | 9 | def start_requests(self): 10 | yield PuppeteerRequest("https://meduza.io", callback=self.parse_main_page) 11 | 12 | def parse_main_page(self, response: PuppeteerHtmlResponse): 13 | for article_url in response.css("a.Link-isInBlockTitle::attr(href)").getall(): 14 | yield response.follow(article_url, callback=self.parse_article) 15 | 16 | def parse_article(self, response: PuppeteerHtmlResponse): 17 | yield { 18 | "url": response.url, 19 | "title": response.css("h1::text").get(), 20 | "text": "\n".join(response.css("p.SimpleBlock-p::text").getall()), 21 | } 22 | -------------------------------------------------------------------------------- /examples/spiders/webscraperio.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | from scrapypuppeteer import PuppeteerRequest 4 | from scrapypuppeteer.actions import Click, GoTo, Scroll 5 | 6 | 7 | class EcommerceSiteSpider(scrapy.Spider): 8 | @staticmethod 9 | def extract_items(list_page_response): 10 | for item_selector in list_page_response.css("div.row div.thumbnail"): 11 | yield { 12 | "link": item_selector.css("a.title::attr(href)").get(), 13 | "title": item_selector.css("a.title::attr(title)").get(), 14 | "price": item_selector.css("h4.price::text").get(), 15 | "description": item_selector.css("p.description::text").get(), 16 | "rating": len(item_selector.css("span.glyphicon-star")), 17 | "reviews_count": int( 18 | item_selector.css(".ratings p.pull-right::text").re_first(r"\d+") 19 | ), 20 | } 21 | 22 | @staticmethod 23 | def extract_item(detail_page_response): 24 | yield { 25 | "link": detail_page_response.url, 26 | "title": detail_page_response.css("h4.price + h4::text").get(), 27 | "price": detail_page_response.css("h4.price::text").get(), 28 | "description": detail_page_response.css("p.description::text").get(), 29 | "rating": len(detail_page_response.css("span.glyphicon-star")), 30 | "reviews_count": int( 31 | detail_page_response.css(".ratings::text").re_first(r"\d+") 32 | ), 33 | } 34 | 35 | 36 | class AjaxPaginationSpider(EcommerceSiteSpider): 37 | name = "e-commerce-ajax" 38 | 39 | def __init__(self, **kwargs): 40 | super().__init__(**kwargs) 41 | 
self.start_url = ( 42 | "https://webscraper.io/test-sites/e-commerce/ajax/computers/laptops" 43 | ) 44 | self.next_page_ix = 1 45 | 46 | def start_requests(self): 47 | yield PuppeteerRequest( 48 | GoTo(self.start_url), close_page=False, callback=self.process_list_page 49 | ) 50 | 51 | def process_list_page(self, response): 52 | yield from self.extract_items(response) 53 | self.next_page_ix += 1 54 | next_page_selector = f'button[data-id="{self.next_page_ix}"]' 55 | if response.css(next_page_selector): 56 | yield response.follow( 57 | Click(next_page_selector, wait_options={"selectorOrTimeout": 3000}), 58 | close_page=False, 59 | callback=self.process_list_page, 60 | ) 61 | 62 | 63 | class MoreSpider(EcommerceSiteSpider): 64 | name = "e-commerce-more" 65 | 66 | def __init__(self, **kwargs): 67 | super().__init__(**kwargs) 68 | self.start_url = ( 69 | "https://webscraper.io/test-sites/e-commerce/more/computers/laptops" 70 | ) 71 | self.seen_item_links = set() 72 | 73 | def start_requests(self): 74 | yield PuppeteerRequest( 75 | GoTo(self.start_url, wait_options={"selectorOrTimeout": 10000}), 76 | close_page=False, 77 | callback=self.process_list_page, 78 | ) 79 | 80 | def process_list_page(self, response): 81 | for item in self.extract_items(response): 82 | if item["link"] not in self.seen_item_links: 83 | self.seen_item_links.add(item["link"]) 84 | yield item 85 | more_selector = ".ecomerce-items-scroll-more" 86 | more_button = response.css(more_selector) 87 | if "style" not in more_button.attrib: 88 | yield response.follow( 89 | Click(more_selector, wait_options={"selectorOrTimeout": 1000}), 90 | close_page=False, 91 | callback=self.process_list_page, 92 | ) 93 | 94 | 95 | class ScrollSpider(EcommerceSiteSpider): 96 | name = "e-commerce-scroll" 97 | 98 | def __init__(self, **kwargs): 99 | super().__init__(**kwargs) 100 | self.start_url = ( 101 | "https://webscraper.io/test-sites/e-commerce/scroll/computers/laptops" 102 | ) 103 | self.seen_item_links = set() 104 | 105 | def start_requests(self): 106 | yield PuppeteerRequest( 107 | GoTo(self.start_url), close_page=False, callback=self.process_list_page 108 | ) 109 | 110 | def process_list_page(self, response): 111 | items = self.extract_items(response) 112 | new_items = [i for i in items if i["link"] not in self.seen_item_links] 113 | if new_items: 114 | for item in new_items: 115 | self.seen_item_links.add(item["link"]) 116 | yield item 117 | yield response.follow( 118 | Scroll(wait_options={"selectorOrTimeout": 1000}), 119 | close_page=False, 120 | callback=self.process_list_page, 121 | ) 122 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 88 3 | fix = false 4 | indent-width = 4 5 | 6 | [tool.ruff.lint] 7 | select = ["F", "C", "W", "I"] 8 | ignore = ["E203", "E501", "F401", "C408", "F811", "N807"] 9 | 10 | [tool.ruff.format] 11 | indent-style = "space" 12 | line-ending = "auto" 13 | quote-style = "double" 14 | skip-magic-trailing-comma = false 15 | docstring-code-line-length = 88 16 | docstring-code-format = true -------------------------------------------------------------------------------- /pytest.ini.example: -------------------------------------------------------------------------------- 1 | Maybe you want to test new features or minor changes on your local PC. 
2 | In this case you need to provide `pytest.ini` file in the root of the project: 3 | 4 | [pytest] 5 | 6 | pythonpath = 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy>=2.6 2 | greenlet>=3.1 # For installing with python3.13 (see https://github.com/python-greenlet/greenlet/issues/406) 3 | pyppeteer 4 | syncer 5 | bs4 6 | playwright -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | default = examples.settings 3 | 4 | [deploy] 5 | #url = http://localhost:6800/ 6 | project = scrapypuppeteer 7 | -------------------------------------------------------------------------------- /scrapypuppeteer/__init__.py: -------------------------------------------------------------------------------- 1 | from .actions import ( 2 | Click, 3 | CustomJsAction, 4 | FillForm, 5 | GoBack, 6 | GoForward, 7 | GoTo, 8 | Har, 9 | PuppeteerServiceAction, 10 | RecaptchaSolver, 11 | Screenshot, 12 | Scroll, 13 | ) 14 | from .request import CloseContextRequest, PuppeteerRequest 15 | from .response import ( 16 | PuppeteerHtmlResponse, 17 | PuppeteerJsonResponse, 18 | PuppeteerRecaptchaSolverResponse, 19 | PuppeteerResponse, 20 | PuppeteerScreenshotResponse, 21 | ) 22 | -------------------------------------------------------------------------------- /scrapypuppeteer/actions.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Tuple 3 | 4 | 5 | class PuppeteerServiceAction(ABC): 6 | content_type = "application/json" 7 | 8 | @property 9 | @abstractmethod 10 | def endpoint(self): ... 11 | 12 | @abstractmethod 13 | def payload(self): ... 14 | 15 | 16 | class GoTo(PuppeteerServiceAction): 17 | """ 18 | Navigate page to given URL. 19 | 20 | :param str url: URL to navigate to. May be relative for following requests. 21 | :param dict navigation_options: Navigation options. 22 | :param dict wait_options: Options specifying wait after navigation. 23 | 24 | Available navigation options (see puppeteer `page.goto 25 | `_): 26 | 27 | * ``timeout`` (int): Maximum navigation time in milliseconds, defaults 28 | to 30 seconds, pass ``0`` to disable timeout. The default value can 29 | be changed by using the :meth:`setDefaultNavigationTimeout` method. 30 | * ``waitUntil`` (str|List[str]): When to consider navigation succeeded, 31 | defaults to ``load``. Given a list of event strings, navigation is 32 | considered to be successful after all events have been fired. Events 33 | can be either: 34 | 35 | * ``load``: when ``load`` event is fired. 36 | * ``domcontentloaded``: when the ``DOMContentLoaded`` event is fired. 37 | * ``networkidle0``: when there are no more than 0 network connections 38 | for at least 500 ms. 39 | * ``networkidle2``: when there are no more than 2 network connections 40 | for at least 500 ms. 41 | 42 | Available wait options (see puppeteer `page.waitFor 43 | `_); 44 | 45 | * ``selectorOrTimeout`` (int|float|str): If it is a selector string or xpath string, wait until 46 | element which matches that selector appears on page. 
If it is a number, then it 47 | is treated as a timeout in milliseconds.`` 48 | * ``options`` (dict): optional parameters to wait on selector 49 | * ``visible`` (bool): wait for element to be present in DOM and to be visible. 50 | Defaults to false. 51 | * ``timeout`` (int|float): maximum time to wait for in milliseconds. 52 | Defaults to 30000 (30 seconds). Pass 0 to disable timeout. 53 | * ``hidden`` (bool): wait for element to not be found in the DOM or to be hidden. 54 | Defaults to false. 55 | 56 | """ 57 | 58 | endpoint = "goto" 59 | 60 | def __init__( 61 | self, 62 | url: str, 63 | navigation_options: dict = None, 64 | wait_options: dict = None, 65 | har_recording: bool = False, 66 | ): 67 | self.url = url 68 | self.navigation_options = navigation_options 69 | self.wait_options = wait_options 70 | self.har_recording = har_recording 71 | 72 | def payload(self): 73 | return { 74 | "url": self.url, 75 | "navigationOptions": self.navigation_options, 76 | "waitOptions": self.wait_options, 77 | "harRecording": self.har_recording, 78 | } 79 | 80 | 81 | class GoForward(PuppeteerServiceAction): 82 | """ 83 | Navigate to the next page in history. 84 | 85 | :param dict navigation_options: Navigation options, same as GoTo action. 86 | :param dict wait_options: Options specifying wait after navigation, same as GoTo action. 87 | 88 | """ 89 | 90 | endpoint = "forward" 91 | 92 | def __init__(self, navigation_options: dict = None, wait_options: dict = None): 93 | self.navigation_options = navigation_options 94 | self.wait_options = wait_options 95 | 96 | def payload(self): 97 | return { 98 | "navigationOptions": self.navigation_options, 99 | "waitOptions": self.wait_options, 100 | } 101 | 102 | 103 | class GoBack(PuppeteerServiceAction): 104 | """ 105 | Navigate to the previous page in history. 106 | 107 | :param dict navigation_options: Navigation options, same as GoTo action. 108 | :param dict wait_options: Options specifying wait after navigation, same as GoTo action. 109 | 110 | """ 111 | 112 | endpoint = "back" 113 | 114 | def __init__(self, navigation_options: dict = None, wait_options: dict = None): 115 | self.navigation_options = navigation_options 116 | self.wait_options = wait_options 117 | 118 | def payload(self): 119 | return { 120 | "navigationOptions": self.navigation_options, 121 | "waitOptions": self.wait_options, 122 | } 123 | 124 | 125 | class Click(PuppeteerServiceAction): 126 | """ 127 | Click element which matches ``selector``. 128 | 129 | :param str selector: Specifies element to click. 130 | :param dict click_options: Optional parameters for click. 131 | :param dict wait_options: Options specifying wait after click, same as GoTo action. 132 | :param dict navigation_options: Navigation options to be used if click results in navigation to 133 | other page, same as GoTo action. 134 | 135 | 136 | Available click options (see puppeteer `page.click 137 | `_): 138 | 139 | * ``button`` (str): ``left``, ``right``, or ``middle``, defaults to 140 | ``left``. 141 | * ``clickCount`` (int): defaults to 1. 142 | * ``delay`` (int|float): Time to wait between ``mousedown`` and 143 | ``mouseup`` in milliseconds. defaults to 0. 144 | 145 | Response for this action contains page state after click and wait. 
146 | 147 | """ 148 | 149 | endpoint = "click" 150 | 151 | def __init__( 152 | self, 153 | selector: str, 154 | click_options: dict = None, 155 | wait_options: dict = None, 156 | navigation_options: dict = None, 157 | ): 158 | self.selector = selector 159 | self.click_options = click_options 160 | self.wait_options = wait_options 161 | self.navigation_options = navigation_options 162 | 163 | def payload(self): 164 | return { 165 | "selector": self.selector, 166 | "clickOptions": self.click_options, 167 | "waitOptions": self.wait_options, 168 | "navigationOptions": self.navigation_options, 169 | } 170 | 171 | 172 | class Scroll(PuppeteerServiceAction): 173 | """ 174 | Scroll page down or for specific element. 175 | 176 | :param str selector: If provided, scroll this element into view, otherwise scroll down by window 177 | height. 178 | :param dict wait_options: Same as in GoTo and Click actions. 179 | 180 | Response for this action contains page state after scroll and wait. 181 | 182 | """ 183 | 184 | endpoint = "scroll" 185 | 186 | def __init__(self, selector: str = None, wait_options: dict = None): 187 | self.selector = selector 188 | self.wait_options = wait_options 189 | 190 | def payload(self): 191 | return {"selector": self.selector, "waitOptions": self.wait_options} 192 | 193 | 194 | class Screenshot(PuppeteerServiceAction): 195 | """ 196 | Take a screenshot. 197 | 198 | Available options (see puppeteer `page.screenshot 199 | `_) 200 | 201 | * ``type`` (str): Specify screenshot type, can be either ``jpeg`` or 202 | ``png``. Defaults to ``png``. 203 | * ``quality`` (int): The quality of the image, between 0-100. Not 204 | applicable to ``png`` image. 205 | * ``fullPage`` (bool): When true, take a screenshot of the full 206 | scrollable page. Defaults to ``False``. 207 | * ``clip`` (dict): An object which specifies clipping region of the 208 | page. This option should have the following fields: 209 | 210 | * ``x`` (int): x-coordinate of top-left corner of clip area. 211 | * ``y`` (int): y-coordinate of top-left corner of clip area. 212 | * ``width`` (int): width of clipping area. 213 | * ``height`` (int): height of clipping area. 214 | 215 | * ``omitBackground`` (bool): Hide default white background and allow 216 | capturing screenshot with transparency. 217 | 218 | Response for this action contains screenshot image in base64 encoding. 219 | 220 | """ 221 | 222 | endpoint = "screenshot" 223 | 224 | def __init__(self, options: dict = None, **kwargs): 225 | self.options = options or {} 226 | self.options.update(kwargs) 227 | 228 | def payload(self): 229 | return {"options": self.options} 230 | 231 | 232 | class Har(PuppeteerServiceAction): 233 | """ 234 | The `Har` action is used to capture and retrieve the HTTP Archive (HAR) file, 235 | which contains detailed information about network requests and responses 236 | made during the session. 237 | 238 | This action is called without any arguments. To generate the HAR file, 239 | you must pass the `har_recording=True` argument to `PuppeteerRequest` 240 | when initiating the request. 241 | """ 242 | 243 | endpoint = "har" 244 | 245 | def payload(self): 246 | return {} 247 | 248 | 249 | class FillForm(PuppeteerServiceAction): 250 | """ 251 | Fill out and submit forms on a webpage. 252 | 253 | Available options: 254 | 255 | * ``input_mapping`` (dict): A dictionary where each key is a CSS selector, and 256 | each value is another dictionary containing details about the input for that element. 
257 |     Each entry in the dictionary should follow this structure:
258 | 
259 |         * ``selector`` (str): The CSS selector for the input element (used as the key).
260 |         * ``value`` (str): The text to be inputted into the element.
261 |         * ``delay`` (int, optional): A delay (in milliseconds) between each keystroke
262 |           when inputting the text. Defaults to 0 if not provided.
263 | 
264 |     * ``submit_button`` (str, optional): The CSS selector for the form's submit button.
265 |       If provided, the button will be clicked after filling in the form.
266 |     """
267 | 
268 |     endpoint = "fill_form"
269 | 
270 |     def __init__(self, input_mapping: dict, submit_button: str = None):
271 |         self.input_mapping = input_mapping
272 |         self.submit_button = submit_button
273 | 
274 |     def payload(self):
275 |         return {"inputMapping": self.input_mapping, "submitButton": self.submit_button}
276 | 
277 | 
278 | class RecaptchaSolver(PuppeteerServiceAction):
279 |     """
280 |     Tries to solve recaptcha on the page.
281 |     First, it tries to find a recaptcha. If it cannot find one, nothing
282 |     will happen to your 2captcha balance.
283 |     Then it solves the recaptcha with the 2captcha service and inserts the special code
284 |     into the page automatically.
285 |     Note that it does not click buttons such as "submit" buttons.
286 | 
287 |     :param bool solve_recaptcha: (default = True) enables automatic solving of recaptcha on the page if found.
288 |         If False is provided, recaptcha will still be detected on the page but not solved.
289 |         You can get info about found recaptchas via the return value.
290 |     :param bool close_on_empty: (default = False) whether to close the page if no captcha was found on it.
291 | 
292 |     :param dict navigation_options: Navigation options, same as GoTo action.
293 |     :param dict wait_options: Options specifying wait after navigation, same as GoTo action.
294 | 
295 |     Response for this action is PuppeteerRecaptchaSolverResponse.
296 |     """
297 | 
298 |     endpoint = "recaptcha_solver"
299 | 
300 |     def __init__(
301 |         self,
302 |         solve_recaptcha: bool = True,
303 |         close_on_empty: bool = False,
304 |         navigation_options: dict = None,
305 |         wait_options: dict = None,
306 |         **kwargs,
307 |     ):
308 |         self.solve_recaptcha = solve_recaptcha
309 |         self.close_on_empty = close_on_empty
310 |         self.navigation_options = navigation_options
311 |         self.wait_options = wait_options
312 | 
313 |     def payload(self):
314 |         return {
315 |             "solve_recaptcha": self.solve_recaptcha,
316 |             "close_on_empty": self.close_on_empty,
317 |             "navigationOptions": self.navigation_options,
318 |             "waitOptions": self.wait_options,
319 |         }
320 | 
321 | 
322 | class CustomJsAction(PuppeteerServiceAction):
323 |     """
324 |     Evaluate a custom JavaScript function on the page.
325 | 
326 |     :param str js_action: JavaScript function.
327 | 
328 |     Expected signature: ``async function action(page, request)``.
329 | 
330 |     The JavaScript function should not return an object with attributes
331 |     of ``scrapypuppeteer.PuppeteerJsonResponse``.
332 |     Otherwise, undefined behaviour is possible.
333 | 
334 |     Response for this action contains the result of the function.
335 | 
336 |     """
337 | 
338 |     endpoint = "action"
339 |     content_type = "application/javascript"
340 | 
341 |     def __init__(self, js_action: str):
342 |         self.js_action = js_action
343 | 
344 |     def payload(self):
345 |         return self.js_action
346 | 
347 | 
348 | class Compose(PuppeteerServiceAction):
349 |     """
350 |     Compose several scrapy-puppeteer actions into one action and send it to the service.
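    A minimal usage sketch (the particular actions are illustrative; see
    examples/spiders/compose.py for a fuller example):

    .. code-block:: python

        Compose(
            GoTo("https://example.com"),
            Scroll(),
            Screenshot(options={"fullPage": True}),
        )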
351 | 352 | Response for this action is PuppeteerResponse to last action in a sequence. 353 | 354 | """ 355 | 356 | endpoint = "compose" 357 | 358 | def __init__(self, *actions: PuppeteerServiceAction): 359 | self.actions = self.__flatten(actions) 360 | 361 | @staticmethod 362 | def __flatten( 363 | actions: Tuple[PuppeteerServiceAction, ...], 364 | ) -> List[PuppeteerServiceAction]: 365 | flatten_actions = [] 366 | for action in actions: 367 | if isinstance(action, Compose): 368 | flatten_actions.extend(action.actions) 369 | else: 370 | flatten_actions.append(action) 371 | if not flatten_actions: 372 | raise ValueError("No actions provided in `Compose`.") 373 | return flatten_actions 374 | 375 | def payload(self): 376 | return { 377 | "actions": [ 378 | {"endpoint": action.endpoint, "body": action.payload()} 379 | for action in self.actions 380 | ] 381 | } 382 | -------------------------------------------------------------------------------- /scrapypuppeteer/browser_managers/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["BrowserManager"] 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | 6 | class BrowserManager(ABC): 7 | @abstractmethod 8 | def process_request(self, request, spider): 9 | pass 10 | 11 | @abstractmethod 12 | def close_used_contexts(self): 13 | pass 14 | 15 | @abstractmethod 16 | def process_response(self, middleware, request, response, spider): 17 | pass 18 | -------------------------------------------------------------------------------- /scrapypuppeteer/browser_managers/playwright_browser_manager.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import uuid 4 | 5 | import syncer 6 | from playwright.async_api import async_playwright 7 | 8 | from scrapypuppeteer.browser_managers import BrowserManager 9 | from scrapypuppeteer.request import CloseContextRequest, PuppeteerRequest 10 | from scrapypuppeteer.response import ( 11 | PuppeteerHtmlResponse, 12 | PuppeteerScreenshotResponse, 13 | ) 14 | 15 | 16 | class ContextManager: 17 | def __init__(self): 18 | self.browser = syncer.sync(self.launch_browser()) 19 | self.contexts = {} 20 | self.pages = {} 21 | self.context_page_map = {} 22 | 23 | async def launch_browser(self): 24 | playwright = await async_playwright().start() 25 | return await playwright.chromium.launch(headless=False) 26 | 27 | async def check_context_and_page(self, context_id, page_id): 28 | if not context_id or not page_id: 29 | context_id, page_id = await self.open_new_page() 30 | return context_id, page_id 31 | 32 | async def open_new_page(self): 33 | context_id = uuid.uuid4().hex.upper() 34 | page_id = uuid.uuid4().hex.upper() 35 | 36 | self.contexts[context_id] = await self.browser.new_context() 37 | self.pages[page_id] = await self.contexts[context_id].new_page() 38 | self.context_page_map[context_id] = page_id 39 | 40 | return context_id, page_id 41 | 42 | def get_page_by_id(self, context_id, page_id): 43 | return self.pages[page_id] 44 | 45 | def close_browser(self): 46 | if self.browser: 47 | syncer.sync(self.browser.close()) 48 | 49 | def close_contexts(self, request: CloseContextRequest): 50 | for context_id in request.contexts: 51 | if context_id in self.contexts: 52 | syncer.sync(self.contexts[context_id].close()) 53 | page_id = self.context_page_map.get(context_id) 54 | self.pages.pop(page_id, None) 55 | 56 | del self.contexts[context_id] 57 | del self.context_page_map[context_id] 58 | 59 | 60 | class 
PlaywrightBrowserManager(BrowserManager): 61 | def __init__(self): 62 | self.context_manager = ContextManager() 63 | self.action_map = { 64 | "goto": self.goto, 65 | "click": self.click, 66 | "compose": self.compose, 67 | "back": self.go_back, 68 | "forward": self.go_forward, 69 | "scroll": self.scroll, 70 | "screenshot": self.screenshot, 71 | "action": self.action, 72 | "recaptcha_solver": self.recaptcha_solver, 73 | "har": self.har, 74 | "fill_form": self.fill_form, 75 | } 76 | 77 | def process_request(self, request): 78 | if isinstance(request, PuppeteerRequest): 79 | endpoint = request.action.endpoint 80 | action_function = self.action_map.get(endpoint) 81 | if action_function: 82 | return action_function(request) 83 | 84 | if isinstance(request, CloseContextRequest): 85 | return self.close_contexts(request) 86 | 87 | def close_contexts(self, request: CloseContextRequest): 88 | self.context_manager.close_contexts(request) 89 | 90 | def close_used_contexts(self): 91 | self.context_manager.close_browser() 92 | 93 | def process_response(self, middleware, request, response, spider): 94 | return response 95 | 96 | def map_navigation_options(self, navigation_options): 97 | if not navigation_options: 98 | return {} 99 | event_map = { 100 | "load": "load", 101 | "domcontentloaded": "domcontentloaded", 102 | "networkidle0": "networkidle", 103 | "networkidle2": "networkidle", 104 | } 105 | mapped_navigation_options = {} 106 | if "timeout" in navigation_options: 107 | mapped_navigation_options["timeout"] = navigation_options["timeout"] 108 | 109 | waitUntil = navigation_options.get("waitUntil") 110 | 111 | if waitUntil: 112 | if isinstance(waitUntil, list): 113 | event_hierarchy = [ 114 | "load", 115 | "domcontentloaded", 116 | "networkidle2", 117 | "networkidle0", 118 | ] 119 | strictest_event = max( 120 | waitUntil, key=lambda event: event_hierarchy.index(event) 121 | ) 122 | elif isinstance(waitUntil, str): 123 | strictest_event = waitUntil 124 | 125 | if strictest_event in event_map: 126 | mapped_navigation_options["wait_until"] = event_map[strictest_event] 127 | 128 | return mapped_navigation_options 129 | 130 | def map_click_options(self, click_options): 131 | if not click_options: 132 | return {} 133 | mapped_click_options = { 134 | "delay": click_options.get("delay", 0.0), 135 | "button": click_options.get("button", "left"), 136 | "click_count": click_options.get("clickCount", 1), 137 | } 138 | return mapped_click_options 139 | 140 | def map_screenshot_options(self, screenshot_options): 141 | if not screenshot_options: 142 | return {} 143 | mapped_screenshot_options = { 144 | "type": screenshot_options.get("type", "png"), 145 | "quality": screenshot_options.get("quality", 100), 146 | "full_page": screenshot_options.get("fullPage", False), 147 | "clip": screenshot_options.get("clip"), 148 | "omit_background": screenshot_options.get("omitBackground"), 149 | } 150 | return mapped_screenshot_options 151 | 152 | async def wait_with_options(self, page, wait_options): 153 | selector = wait_options.get("selector") 154 | xpath = wait_options.get("xpath") 155 | timeout = wait_options.get("timeout", None) 156 | options = wait_options.get("options", {}) 157 | 158 | selector_or_timeout = wait_options.get("selectorOrTimeout") 159 | if selector_or_timeout: 160 | if isinstance(selector_or_timeout, (int, float)): 161 | timeout = selector_or_timeout 162 | elif isinstance(selector_or_timeout, str): 163 | if selector_or_timeout.startswith("//"): 164 | xpath = selector_or_timeout 165 | else: 166 | selector = 
selector_or_timeout 167 | 168 | if len([item for item in [selector, xpath, timeout] if item]) > 1: 169 | raise ValueError( 170 | "Wait options must contain either a selector, an xpath, or a timeout" 171 | ) 172 | 173 | if selector: 174 | await page.wait_for_selector(selector, **options) 175 | elif xpath: 176 | await page.wait_for_selector(f"xpath={xpath}", **options) 177 | elif timeout: 178 | await asyncio.sleep(timeout / 1000) 179 | 180 | def get_page_from_request(self, request): 181 | context_id, page_id = syncer.sync( 182 | self.context_manager.check_context_and_page( 183 | request.context_id, request.page_id 184 | ) 185 | ) 186 | return ( 187 | self.context_manager.get_page_by_id(context_id, page_id), 188 | context_id, 189 | page_id, 190 | ) 191 | 192 | def goto(self, request: PuppeteerRequest): 193 | page, context_id, page_id = self.get_page_from_request(request) 194 | 195 | async def async_goto(): 196 | url = request.action.payload()["url"] 197 | cookies = request.cookies 198 | navigation_options = self.map_navigation_options( 199 | request.action.navigation_options 200 | ) 201 | await page.goto(url, **navigation_options) 202 | wait_options = request.action.payload().get("waitOptions", {}) or {} 203 | await self.wait_with_options(page, wait_options) 204 | response_html = await page.content() 205 | return PuppeteerHtmlResponse( 206 | url, 207 | request, 208 | context_id=context_id, 209 | page_id=page_id, 210 | html=response_html, 211 | cookies=cookies, 212 | ) 213 | 214 | return syncer.sync(async_goto()) 215 | 216 | def click(self, request: PuppeteerRequest): 217 | page, context_id, page_id = self.get_page_from_request(request) 218 | 219 | async def async_click(): 220 | selector = request.action.payload().get("selector") 221 | cookies = request.cookies 222 | click_options = self.map_click_options(request.action.click_options) 223 | await page.click(selector, **click_options) 224 | wait_options = request.action.payload().get("waitOptions", {}) or {} 225 | await self.wait_with_options(page, wait_options) 226 | response_html = await page.content() 227 | return PuppeteerHtmlResponse( 228 | request.url, 229 | request, 230 | context_id=context_id, 231 | page_id=page_id, 232 | html=response_html, 233 | cookies=cookies, 234 | ) 235 | 236 | return syncer.sync(async_click()) 237 | 238 | def go_back(self, request: PuppeteerRequest): 239 | page, context_id, page_id = self.get_page_from_request(request) 240 | 241 | async def async_go_back(): 242 | cookies = request.cookies 243 | navigation_options = self.map_navigation_options( 244 | request.action.navigation_options 245 | ) 246 | await page.go_back(**navigation_options) 247 | wait_options = request.action.payload().get("waitOptions", {}) or {} 248 | await self.wait_with_options(page, wait_options) 249 | response_html = await page.content() 250 | return PuppeteerHtmlResponse( 251 | request.url, 252 | request, 253 | context_id=context_id, 254 | page_id=page_id, 255 | html=response_html, 256 | cookies=cookies, 257 | ) 258 | 259 | return syncer.sync(async_go_back()) 260 | 261 | def go_forward(self, request: PuppeteerRequest): 262 | page, context_id, page_id = self.get_page_from_request(request) 263 | 264 | async def async_go_forward(): 265 | cookies = request.cookies 266 | navigation_options = self.map_navigation_options( 267 | request.action.navigation_options 268 | ) 269 | await page.go_forward(**navigation_options) 270 | wait_options = request.action.payload().get("waitOptions", {}) or {} 271 | await self.wait_with_options(page, wait_options) 272 
| response_html = await page.content() 273 | return PuppeteerHtmlResponse( 274 | request.url, 275 | request, 276 | context_id=context_id, 277 | page_id=page_id, 278 | html=response_html, 279 | cookies=cookies, 280 | ) 281 | 282 | return syncer.sync(async_go_forward()) 283 | 284 | def screenshot(self, request: PuppeteerRequest): 285 | page, context_id, page_id = self.get_page_from_request(request) 286 | 287 | async def async_screenshot(): 288 | screenshot_options = request.action.options or {} 289 | screenshot_bytes = await page.screenshot( 290 | **self.map_screenshot_options(screenshot_options) 291 | ) 292 | screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8") 293 | return PuppeteerScreenshotResponse( 294 | request.url, 295 | request, 296 | context_id=context_id, 297 | page_id=page_id, 298 | screenshot=screenshot_base64, 299 | ) 300 | 301 | return syncer.sync(async_screenshot()) 302 | 303 | def scroll(self, request: PuppeteerRequest): 304 | page, context_id, page_id = self.get_page_from_request(request) 305 | 306 | async def async_scroll(): 307 | cookies = request.cookies 308 | selector = request.action.payload().get("selector", None) 309 | 310 | if selector: 311 | script = f""" 312 | document.querySelector('{selector}').scrollIntoView(); 313 | """ 314 | else: 315 | script = """ 316 | window.scrollBy(0, document.body.scrollHeight); 317 | """ 318 | await page.evaluate(script) 319 | wait_options = request.action.payload().get("waitOptions", {}) or {} 320 | await self.wait_with_options(page, wait_options) 321 | response_html = await page.content() 322 | return PuppeteerHtmlResponse( 323 | request.url, 324 | request, 325 | context_id=context_id, 326 | page_id=page_id, 327 | html=response_html, 328 | cookies=cookies, 329 | ) 330 | 331 | return syncer.sync(async_scroll()) 332 | 333 | def fill_form(self, request: PuppeteerRequest): 334 | page, context_id, page_id = self.get_page_from_request(request) 335 | 336 | async def async_fill_form(): 337 | input_mapping = request.action.payload().get("inputMapping") 338 | submit_button = request.action.payload().get("submitButton", None) 339 | cookies = request.cookies 340 | 341 | for selector, params in input_mapping.items(): 342 | text = params.get("value", None) 343 | delay = params.get("delay", 0) 344 | await page.type(selector, text=text, delay=delay) 345 | 346 | if submit_button: 347 | await page.click(submit_button) 348 | 349 | response_html = await page.content() 350 | return PuppeteerHtmlResponse( 351 | request.url, 352 | request, 353 | context_id=context_id, 354 | page_id=page_id, 355 | html=response_html, 356 | cookies=cookies, 357 | ) 358 | 359 | return syncer.sync(async_fill_form()) 360 | 361 | def compose(self, request: PuppeteerRequest): 362 | _, context_id, page_id = self.get_page_from_request(request) 363 | request.page_id = page_id 364 | request.context_id = context_id 365 | 366 | for action in request.action.actions: 367 | response = self.action_map[action.endpoint](request.replace(action=action)) 368 | return response.replace(puppeteer_request=request) 369 | 370 | def action(self, request: PuppeteerRequest): 371 | raise ValueError("CustomJsAction is not available in local mode") 372 | 373 | def recaptcha_solver(self, request: PuppeteerRequest): 374 | raise ValueError("RecaptchaSolver is not available in local mode") 375 | 376 | def har(self, request: PuppeteerRequest): 377 | raise ValueError("Har is not available in local mode") 378 | -------------------------------------------------------------------------------- 
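Note: the local browser managers in this package are selected with the ``EXECUTION_METHOD`` Scrapy setting, handled in ``middleware.py`` below. A minimal settings sketch that runs the Playwright-backed manager above (the middleware order value mirrors the one used in this repository's tests):

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
    }
    EXECUTION_METHOD = "playwright"  # or "pyppeteer" for the manager below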
/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import uuid 4 | 5 | import syncer 6 | from pyppeteer import launch 7 | 8 | from scrapypuppeteer.browser_managers import BrowserManager 9 | from scrapypuppeteer.request import CloseContextRequest, PuppeteerRequest 10 | from scrapypuppeteer.response import ( 11 | PuppeteerHtmlResponse, 12 | PuppeteerScreenshotResponse, 13 | ) 14 | 15 | 16 | class ContextManager: 17 | def __init__(self): 18 | self.browser = syncer.sync(launch()) 19 | self.contexts = {} 20 | self.pages = {} 21 | self.context_page_map = {} 22 | 23 | async def check_context_and_page(self, context_id, page_id): 24 | if not context_id or not page_id: 25 | context_id, page_id = await self.open_new_page() 26 | return context_id, page_id 27 | 28 | async def open_new_page(self): 29 | context_id = uuid.uuid4().hex.upper() 30 | page_id = uuid.uuid4().hex.upper() 31 | 32 | self.contexts[context_id] = await self.browser.createIncognitoBrowserContext() 33 | self.pages[page_id] = await self.contexts[context_id].newPage() 34 | self.context_page_map[context_id] = page_id 35 | 36 | return context_id, page_id 37 | 38 | def get_page_by_id(self, context_id, page_id): 39 | return self.pages[page_id] 40 | 41 | def close_browser(self): 42 | if self.browser: 43 | syncer.sync(self.browser.close()) 44 | 45 | def close_contexts(self, request: CloseContextRequest): 46 | for context_id in request.contexts: 47 | if context_id in self.contexts: 48 | syncer.sync(self.contexts[context_id].close()) 49 | page_id = self.context_page_map.get(context_id) 50 | self.pages.pop(page_id, None) 51 | 52 | del self.contexts[context_id] 53 | del self.context_page_map[context_id] 54 | 55 | 56 | class PyppeteerBrowserManager(BrowserManager): 57 | def __init__(self): 58 | self.context_manager = ContextManager() 59 | self.action_map = { 60 | "goto": self.goto, 61 | "click": self.click, 62 | "compose": self.compose, 63 | "back": self.go_back, 64 | "forward": self.go_forward, 65 | "scroll": self.scroll, 66 | "screenshot": self.screenshot, 67 | "action": self.action, 68 | "recaptcha_solver": self.recaptcha_solver, 69 | "har": self.har, 70 | "fill_form": self.fill_form, 71 | } 72 | 73 | def process_request(self, request): 74 | if isinstance(request, PuppeteerRequest): 75 | endpoint = request.action.endpoint 76 | action_function = self.action_map.get(endpoint) 77 | if action_function: 78 | return action_function(request) 79 | 80 | if isinstance(request, CloseContextRequest): 81 | return self.close_contexts(request) 82 | 83 | def close_contexts(self, request: CloseContextRequest): 84 | self.context_manager.close_contexts(request) 85 | 86 | def close_used_contexts(self): 87 | self.context_manager.close_browser() 88 | 89 | def process_response(self, middleware, request, response, spider): 90 | return response 91 | 92 | async def wait_with_options(self, page, wait_options: dict): 93 | selector = wait_options.get("selector") 94 | xpath = wait_options.get("xpath") 95 | timeout = wait_options.get("timeout", None) 96 | options = wait_options.get("options", {}) 97 | 98 | selector_or_timeout = wait_options.get("selectorOrTimeout") 99 | if selector_or_timeout: 100 | if isinstance(selector_or_timeout, (int, float)): 101 | timeout = selector_or_timeout 102 | elif isinstance(selector_or_timeout, str): 103 | if selector_or_timeout.startswith("//"): 104 | xpath = selector_or_timeout 105 | else: 106 | selector = 
selector_or_timeout 107 | 108 | if len([item for item in [selector, xpath, timeout] if item]) > 1: 109 | raise ValueError( 110 | "Wait options must contain either a selector, an xpath, or a timeout" 111 | ) 112 | 113 | if selector: 114 | await page.waitForSelector(selector, options) 115 | elif xpath: 116 | await page.waitForXPath(xpath, options) 117 | elif timeout: 118 | await asyncio.sleep(timeout / 1000) 119 | 120 | def goto(self, request: PuppeteerRequest): 121 | context_id, page_id = syncer.sync( 122 | self.context_manager.check_context_and_page( 123 | request.context_id, request.page_id 124 | ) 125 | ) 126 | page = self.context_manager.get_page_by_id(context_id, page_id) 127 | 128 | async def async_goto(): 129 | url = request.action.payload()["url"] 130 | cookies = request.cookies 131 | navigation_options = request.action.navigation_options 132 | await page.goto(url, navigation_options) 133 | wait_options = request.action.payload().get("waitOptions", {}) or {} 134 | await self.wait_with_options(page, wait_options) 135 | response_html = await page.content() 136 | return PuppeteerHtmlResponse( 137 | url, 138 | request, 139 | context_id=context_id, 140 | page_id=page_id, 141 | html=response_html, 142 | cookies=cookies, 143 | ) 144 | 145 | return syncer.sync(async_goto()) 146 | 147 | def click(self, request: PuppeteerRequest): 148 | context_id, page_id = syncer.sync( 149 | self.context_manager.check_context_and_page( 150 | request.context_id, request.page_id 151 | ) 152 | ) 153 | page = self.context_manager.get_page_by_id(context_id, page_id) 154 | 155 | async def async_click(): 156 | selector = request.action.payload().get("selector") 157 | cookies = request.cookies 158 | click_options = request.action.click_options or {} 159 | navigation_options = request.action.navigation_options or {} 160 | options = {**click_options, **navigation_options} 161 | await page.click(selector, options) 162 | wait_options = request.action.payload().get("waitOptions", {}) or {} 163 | await self.wait_with_options(page, wait_options) 164 | response_html = await page.content() 165 | return PuppeteerHtmlResponse( 166 | request.url, 167 | request, 168 | context_id=context_id, 169 | page_id=page_id, 170 | html=response_html, 171 | cookies=cookies, 172 | ) 173 | 174 | return syncer.sync(async_click()) 175 | 176 | def go_back(self, request: PuppeteerRequest): 177 | context_id, page_id = syncer.sync( 178 | self.context_manager.check_context_and_page( 179 | request.context_id, request.page_id 180 | ) 181 | ) 182 | page = self.context_manager.get_page_by_id(context_id, page_id) 183 | 184 | async def async_go_back(): 185 | cookies = request.cookies 186 | navigation_options = request.action.navigation_options 187 | await page.goBack(navigation_options) 188 | wait_options = request.action.payload().get("waitOptions", {}) or {} 189 | await self.wait_with_options(page, wait_options) 190 | response_html = await page.content() 191 | return PuppeteerHtmlResponse( 192 | request.url, 193 | request, 194 | context_id=context_id, 195 | page_id=page_id, 196 | html=response_html, 197 | cookies=cookies, 198 | ) 199 | 200 | return syncer.sync(async_go_back()) 201 | 202 | def go_forward(self, request: PuppeteerRequest): 203 | context_id, page_id = syncer.sync( 204 | self.context_manager.check_context_and_page( 205 | request.context_id, request.page_id 206 | ) 207 | ) 208 | page = self.context_manager.get_page_by_id(context_id, page_id) 209 | 210 | async def async_go_forward(): 211 | cookies = request.cookies 212 | navigation_options = 
request.action.navigation_options 213 | await page.goForward(navigation_options) 214 | wait_options = request.action.payload().get("waitOptions", {}) or {} 215 | await self.wait_with_options(page, wait_options) 216 | response_html = await page.content() 217 | return PuppeteerHtmlResponse( 218 | request.url, 219 | request, 220 | context_id=context_id, 221 | page_id=page_id, 222 | html=response_html, 223 | cookies=cookies, 224 | ) 225 | 226 | return syncer.sync(async_go_forward()) 227 | 228 | def screenshot(self, request: PuppeteerRequest): 229 | context_id, page_id = syncer.sync( 230 | self.context_manager.check_context_and_page( 231 | request.context_id, request.page_id 232 | ) 233 | ) 234 | page = self.context_manager.get_page_by_id(context_id, page_id) 235 | 236 | async def async_screenshot(): 237 | request_options = request.action.options or {} 238 | screenshot_options = {"encoding": "binary"} 239 | screenshot_options.update(request_options) 240 | screenshot_bytes = await page.screenshot(screenshot_options) 241 | screenshot_base64 = base64.b64encode(screenshot_bytes).decode("utf-8") 242 | return PuppeteerScreenshotResponse( 243 | request.url, 244 | request, 245 | context_id=context_id, 246 | page_id=page_id, 247 | screenshot=screenshot_base64, 248 | ) 249 | 250 | return syncer.sync(async_screenshot()) 251 | 252 | def scroll(self, request: PuppeteerRequest): 253 | context_id, page_id = syncer.sync( 254 | self.context_manager.check_context_and_page( 255 | request.context_id, request.page_id 256 | ) 257 | ) 258 | page = self.context_manager.get_page_by_id(context_id, page_id) 259 | 260 | async def async_scroll(): 261 | cookies = request.cookies 262 | selector = request.action.payload().get("selector", None) 263 | 264 | if selector: 265 | script = f""" 266 | document.querySelector('{selector}').scrollIntoView(); 267 | """ 268 | else: 269 | script = """ 270 | window.scrollBy(0, document.body.scrollHeight); 271 | """ 272 | await page.evaluate(script) 273 | wait_options = request.action.payload().get("waitOptions", {}) or {} 274 | await self.wait_with_options(page, wait_options) 275 | response_html = await page.content() 276 | return PuppeteerHtmlResponse( 277 | request.url, 278 | request, 279 | context_id=context_id, 280 | page_id=page_id, 281 | html=response_html, 282 | cookies=cookies, 283 | ) 284 | 285 | return syncer.sync(async_scroll()) 286 | 287 | def fill_form(self, request: PuppeteerRequest): 288 | context_id, page_id = syncer.sync( 289 | self.context_manager.check_context_and_page( 290 | request.context_id, request.page_id 291 | ) 292 | ) 293 | page = self.context_manager.get_page_by_id(context_id, page_id) 294 | 295 | async def async_fill_form(): 296 | input_mapping = request.action.payload().get("inputMapping") 297 | submit_button = request.action.payload().get("submitButton", None) 298 | cookies = request.cookies 299 | 300 | for selector, params in input_mapping.items(): 301 | value = params.get("value", None) 302 | delay = params.get("delay", 0) 303 | await page.type(selector, value, {"delay": delay}) 304 | 305 | if submit_button: 306 | await page.click(submit_button) 307 | 308 | response_html = await page.content() 309 | return PuppeteerHtmlResponse( 310 | request.url, 311 | request, 312 | context_id=context_id, 313 | page_id=page_id, 314 | html=response_html, 315 | cookies=cookies, 316 | ) 317 | 318 | return syncer.sync(async_fill_form()) 319 | 320 | def compose(self, request: PuppeteerRequest): 321 | context_id, page_id = syncer.sync( 322 | 
self.context_manager.check_context_and_page( 323 | request.context_id, request.page_id 324 | ) 325 | ) 326 | request.page_id = page_id 327 | request.context_id = context_id 328 | 329 | for action in request.action.actions: 330 | response = self.action_map[action.endpoint](request.replace(action=action)) 331 | return response.replace(puppeteer_request=request) 332 | 333 | def action(self, request: PuppeteerRequest): 334 | raise ValueError("CustomJsAction is not available in local mode") 335 | 336 | def recaptcha_solver(self, request: PuppeteerRequest): 337 | raise ValueError("RecaptchaSolver is not available in local mode") 338 | 339 | def har(self, request: PuppeteerRequest): 340 | raise ValueError("Har is not available in local mode") 341 | -------------------------------------------------------------------------------- /scrapypuppeteer/browser_managers/service_browser_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from collections import defaultdict 4 | from urllib.parse import urlencode, urljoin 5 | 6 | from scrapy.exceptions import DontCloseSpider 7 | from scrapy.http import Headers, Response, TextResponse 8 | from scrapy.utils.log import failure_to_exc_info 9 | from twisted.python.failure import Failure 10 | 11 | from scrapypuppeteer.actions import ( 12 | Click, 13 | Compose, 14 | FillForm, 15 | GoBack, 16 | GoForward, 17 | GoTo, 18 | Har, 19 | RecaptchaSolver, 20 | Screenshot, 21 | Scroll, 22 | ) 23 | from scrapypuppeteer.browser_managers import BrowserManager 24 | from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest 25 | from scrapypuppeteer.response import ( 26 | PuppeteerHarResponse, 27 | PuppeteerHtmlResponse, 28 | PuppeteerJsonResponse, 29 | PuppeteerRecaptchaSolverResponse, 30 | PuppeteerScreenshotResponse, 31 | ) 32 | 33 | 34 | class ServiceBrowserManager(BrowserManager): 35 | def __init__(self, service_base_url, include_meta, include_headers, crawler): 36 | self.service_base_url = service_base_url 37 | self.include_meta = include_meta 38 | self.include_headers = include_headers 39 | self.used_contexts = defaultdict(set) 40 | self.service_logger = logging.getLogger(__name__) 41 | self.crawler = crawler 42 | 43 | if self.service_base_url is None: 44 | raise ValueError("Puppeteer service URL must be provided") 45 | 46 | def process_request(self, request): 47 | if isinstance(request, CloseContextRequest): 48 | return self.process_close_context_request(request) 49 | 50 | if isinstance(request, PuppeteerRequest): 51 | return self.process_puppeteer_request(request) 52 | 53 | def process_close_context_request(self, request: CloseContextRequest): 54 | if not request.is_valid_url: 55 | return request.replace( 56 | url=urljoin(self.service_base_url, "/close_context"), 57 | ) 58 | 59 | def process_puppeteer_request(self, request: PuppeteerRequest): 60 | action = request.action 61 | service_url = urljoin(self.service_base_url, action.endpoint) 62 | service_params = self._encode_service_params(request) 63 | if service_params: 64 | service_url += "?" 
+ service_params 65 | meta = { 66 | "puppeteer_request": request, 67 | "dont_obey_robotstxt": True, 68 | "proxy": None, 69 | } 70 | if self.include_meta: 71 | meta = {**request.meta, **meta} 72 | action_request = ActionRequest( 73 | url=service_url, 74 | action=action, 75 | method="POST", 76 | headers=Headers({"Content-Type": action.content_type}), 77 | body=self._serialize_body(action, request), 78 | dont_filter=True, 79 | cookies=request.cookies, 80 | priority=request.priority, 81 | callback=request.callback, 82 | cb_kwargs=request.cb_kwargs, 83 | errback=request.errback, 84 | meta=meta, 85 | ) 86 | return action_request 87 | 88 | @staticmethod 89 | def _encode_service_params(request): 90 | service_params = {} 91 | if request.context_id is not None: 92 | service_params["contextId"] = request.context_id 93 | if request.page_id is not None: 94 | service_params["pageId"] = request.page_id 95 | if request.close_page: 96 | service_params["closePage"] = 1 97 | return urlencode(service_params) 98 | 99 | def _serialize_body(self, action, request): 100 | payload = action.payload() 101 | if action.content_type == "application/json": 102 | payload = self.__clean_payload(payload) 103 | proxy = request.meta.get("proxy") 104 | if proxy: 105 | payload["proxy"] = proxy 106 | include_headers = ( 107 | self.include_headers 108 | if request.include_headers is None 109 | else request.include_headers 110 | ) 111 | if include_headers: 112 | headers = request.headers.to_unicode_dict() 113 | if isinstance(include_headers, list): 114 | headers = { 115 | h.lower(): headers[h] for h in include_headers if h in headers 116 | } 117 | payload["headers"] = headers 118 | return json.dumps(payload) 119 | return str(payload) 120 | 121 | def __clean_payload(self, payload): 122 | """ 123 | disallow null values in request parameters 124 | """ 125 | if isinstance(payload, dict): 126 | payload = { 127 | k: self.__clean_payload(v) for k, v in payload.items() if v is not None 128 | } 129 | elif isinstance(payload, list): 130 | payload = [self.__clean_payload(v) for v in payload if v is not None] 131 | return payload 132 | 133 | def close_used_contexts(self, spider): 134 | contexts = list(self.used_contexts.pop(id(spider), set())) 135 | if contexts: 136 | request = CloseContextRequest( 137 | contexts, 138 | meta={"proxy": None}, 139 | ) 140 | 141 | def handle_close_contexts_result(result): 142 | if isinstance(result, Response): 143 | if result.status == 200: 144 | self.service_logger.debug( 145 | f"Successfully closed {len(request.contexts)} " 146 | f"contexts with request {result.request}" 147 | ) 148 | else: 149 | self.service_logger.warning( 150 | f"Could not close contexts: {result.text}" 151 | ) 152 | elif isinstance(result, Failure): 153 | self.service_logger.warning( 154 | f"Could not close contexts: {result.value}", 155 | exc_info=failure_to_exc_info(result), 156 | ) 157 | 158 | dfd = self.crawler.engine.download(request) 159 | dfd.addBoth(handle_close_contexts_result) 160 | 161 | raise DontCloseSpider() 162 | 163 | def process_response(self, middleware, request, response, spider): 164 | if not isinstance(response, TextResponse): 165 | return response 166 | 167 | puppeteer_request = request.meta.get("puppeteer_request") 168 | if puppeteer_request is None: 169 | return response 170 | 171 | if b"application/json" not in response.headers.get(b"Content-Type", b""): 172 | return response.replace(request=request) 173 | 174 | response_data = json.loads(response.text) 175 | if response.status != 200: 176 | reason = 
response_data.pop("error", f"undefined, status {response.status}") 177 | middleware.service_logger.warning( 178 | f"Request {request} is not succeeded. Reason: {reason}" 179 | ) 180 | context_id = response_data.get("contextId") 181 | if context_id: 182 | self.used_contexts[id(spider)].add(context_id) 183 | return response 184 | 185 | response_cls = self._get_response_class(puppeteer_request.action) 186 | 187 | return self._form_response( 188 | response_cls, 189 | response_data, 190 | puppeteer_request.url, 191 | request, 192 | puppeteer_request, 193 | spider, 194 | ) 195 | 196 | def _form_response( 197 | self, 198 | response_cls, 199 | response_data, 200 | url, 201 | request, 202 | puppeteer_request, 203 | spider, 204 | ): 205 | context_id = response_data.pop("contextId", puppeteer_request.context_id) 206 | page_id = response_data.pop("pageId", puppeteer_request.page_id) 207 | self.used_contexts[id(spider)].add(context_id) 208 | 209 | return response_cls( 210 | url=url, 211 | puppeteer_request=puppeteer_request, 212 | context_id=context_id, 213 | page_id=page_id, 214 | request=request, 215 | **response_data, 216 | ) 217 | 218 | def _get_response_class(self, request_action): 219 | if isinstance( 220 | request_action, (GoTo, GoForward, GoBack, Click, Scroll, FillForm) 221 | ): 222 | return PuppeteerHtmlResponse 223 | if isinstance(request_action, Screenshot): 224 | return PuppeteerScreenshotResponse 225 | if isinstance(request_action, Har): 226 | return PuppeteerHarResponse 227 | if isinstance(request_action, RecaptchaSolver): 228 | return PuppeteerRecaptchaSolverResponse 229 | if isinstance(request_action, Compose): 230 | # Response class is a last action's response class 231 | return self._get_response_class(request_action.actions[-1]) 232 | return PuppeteerJsonResponse 233 | -------------------------------------------------------------------------------- /scrapypuppeteer/middleware.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import defaultdict 3 | from typing import List, Union 4 | 5 | from scrapy import signals 6 | from scrapy.crawler import Crawler 7 | from scrapy.exceptions import IgnoreRequest, NotConfigured 8 | 9 | from scrapypuppeteer.actions import ( 10 | Click, 11 | CustomJsAction, 12 | RecaptchaSolver, 13 | Screenshot, 14 | Scroll, 15 | ) 16 | from scrapypuppeteer.browser_managers import BrowserManager 17 | from scrapypuppeteer.browser_managers.playwright_browser_manager import ( 18 | PlaywrightBrowserManager, 19 | ) 20 | from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( 21 | PyppeteerBrowserManager, 22 | ) 23 | from scrapypuppeteer.browser_managers.service_browser_manager import ( 24 | ServiceBrowserManager, 25 | ) 26 | from scrapypuppeteer.request import ActionRequest, CloseContextRequest, PuppeteerRequest 27 | from scrapypuppeteer.response import ( 28 | PuppeteerHtmlResponse, 29 | PuppeteerResponse, 30 | ) 31 | 32 | 33 | class PuppeteerServiceDownloaderMiddleware: 34 | """ 35 | This downloader middleware converts PuppeteerRequest instances to 36 | Puppeteer service API requests and then converts its responses to 37 | PuppeteerResponse instances. Additionally, it tracks all browser contexts 38 | that spider uses and performs cleanup request to service right before 39 | spider is closed. 
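A minimal configuration sketch (the middleware order value mirrors the one
used in this repository's tests; the URL is an example):

    DOWNLOADER_MIDDLEWARES = {
        "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
    }
    PUPPETEER_SERVICE_URL = "http://localhost:3000"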
40 | 
41 | Additionally, the middleware uses these meta-keys; do not use them yourself,
42 | because changing them is almost certain to break the expected behaviour:
43 | 'puppeteer_request', 'dont_obey_robotstxt', 'proxy'
44 | 
45 | Settings:
46 | 
47 | PUPPETEER_SERVICE_URL (str)
48 | Service URL, e.g. 'http://localhost:3000'
49 | 
50 | PUPPETEER_INCLUDE_HEADERS (bool|list[str])
51 | Determines which request headers will be sent to the remote site by the puppeteer service.
52 | Either True (all headers), False (no headers) or a list of header names.
53 | May be overridden per request.
54 | By default, only cookies are sent.
55 | 
56 | PUPPETEER_INCLUDE_META (bool)
57 | Determines whether to forward the meta attached to the request by the user.
58 | Defaults to False.
59 | """
60 | 
61 | SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL"
62 | INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS"
63 | SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META"
64 | DEFAULT_INCLUDE_HEADERS = ["Cookie"]  # TODO send them separately
65 | 
66 | EXECUTION_METHOD_SETTING = "EXECUTION_METHOD"
67 | 
68 | service_logger = logging.getLogger(__name__)
69 | 
70 | def __init__(
71 | self,
72 | crawler: Crawler,
73 | service_url: str,
74 | include_headers: Union[bool, List[str]],
75 | include_meta: bool,
76 | browser_manager: BrowserManager,
77 | ):
78 | self.service_base_url = service_url
79 | self.include_headers = include_headers
80 | self.include_meta = include_meta
81 | self.crawler = crawler
82 | self.used_contexts = defaultdict(set)
83 | self.browser_manager = browser_manager
84 | 
85 | @classmethod
86 | def from_crawler(cls, crawler):
87 | service_url = crawler.settings.get(cls.SERVICE_URL_SETTING)
88 | if cls.INCLUDE_HEADERS_SETTING in crawler.settings:
89 | try:
90 | include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING)
91 | except ValueError:
92 | include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING)
93 | else:
94 | include_headers = cls.DEFAULT_INCLUDE_HEADERS
95 | include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False)
96 | 
97 | execution_method = crawler.settings.get(
98 | cls.EXECUTION_METHOD_SETTING, "PUPPETEER"
99 | ).lower()
100 | 
101 | if execution_method == "pyppeteer":
102 | browser_manager = PyppeteerBrowserManager()
103 | elif execution_method == "puppeteer":
104 | browser_manager = ServiceBrowserManager(
105 | service_url, include_meta, include_headers, crawler
106 | )
107 | elif execution_method == "playwright":
108 | browser_manager = PlaywrightBrowserManager()
109 | else:
110 | raise NameError("Wrong EXECUTION_METHOD")
111 | 
112 | middleware = cls(
113 | crawler, service_url, include_headers, include_meta, browser_manager
114 | )
115 | crawler.signals.connect(
116 | middleware.browser_manager.close_used_contexts, signal=signals.spider_idle
117 | )
118 | return middleware
119 | 
120 | def process_request(self, request, spider):
121 | return self.browser_manager.process_request(request)
122 | 
123 | def process_response(self, request, response, spider):
124 | return self.browser_manager.process_response(self, request, response, spider)
125 | 
126 | 
127 | class PuppeteerRecaptchaDownloaderMiddleware:
128 | """
129 | This middleware is supposed to solve recaptcha on the page automatically.
130 | If there is no captcha on the page, the middleware does nothing,
131 | so your 2captcha balance will remain the same.
132 | It can submit recaptcha if a "submit button" is provided.
133 | It will not "submit" the captcha if there is no submit selector.
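For example, a submit selector for Google's recaptcha demo page (the same
pair used in the RECAPTCHA_SUBMIT_SELECTORS description below) can be
configured as:

    RECAPTCHA_SUBMIT_SELECTORS = {
        'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit',
    }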
134 | 
135 | If you want to turn recaptcha solving off for a particular request, provide
136 | the meta-key 'dont_recaptcha' with a True value. The middleware will then pass
137 | the request through untouched.
138 | 
139 | Additionally, the middleware uses these meta-keys; do not use them yourself,
140 | because changing them is almost certain to break the expected behaviour:
141 | '_captcha_submission', '_captcha_solving'
142 | 
143 | Settings:
144 | 
145 | RECAPTCHA_ACTIVATION: bool = True - enables the middleware (if False, NotConfigured is raised)
146 | RECAPTCHA_SOLVING: bool = True - whether to solve captchas automatically
147 | RECAPTCHA_SUBMIT_SELECTORS: str | dict = {} - a dictionary mapping domains to
148 | these domains' submit selectors, e.g.
149 | 'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit'
150 | it can also be shortened to
151 | 'ecaptcha/api2/de': '#recaptcha-demo-submit'
152 | besides strings, you can use Click actions with the required parameters:
153 | 'ogle.com/recaptcha': Click('#recaptcha-demo-submit')
154 | In general, a domain is any unique identifying substring of the web-page URL.
155 | If there is no button to submit the recaptcha, map the domain to an empty string.
156 | This setting can also be a single string; in that case the middleware will only
157 | click the button matching this selector.
158 | This setting can also be omitted. In that case every crawled web-page is assumed
159 | to have no submit button, or you submit it manually yourself.
160 | """
161 | 
162 | MIDDLEWARE_ACTIVATION_SETTING = "RECAPTCHA_ACTIVATION"
163 | RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING"
164 | SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS"
165 | 
166 | def __init__(self, recaptcha_solving: bool, submit_selectors: dict):
167 | self.submit_selectors = submit_selectors
168 | self.recaptcha_solving = recaptcha_solving
169 | self._page_responses = dict()
170 | self._page_closing = set()
171 | 
172 | @classmethod
173 | def from_crawler(cls, crawler: Crawler):
174 | activation = crawler.settings.get(cls.MIDDLEWARE_ACTIVATION_SETTING, True)
175 | if not activation:
176 | raise NotConfigured
177 | recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True)
178 | 
179 | try:
180 | submit_selectors = crawler.settings.getdict(
181 | cls.SUBMIT_SELECTORS_SETTING, dict()
182 | )
183 | except ValueError:
184 | submit_selectors = {
185 | "": crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, "")
186 | }
187 | except Exception as exception:
188 | raise ValueError(
189 | f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}"
190 | )
191 | 
192 | for key in submit_selectors.keys():
193 | submit_selector = submit_selectors[key]
194 | if isinstance(submit_selector, str):
195 | submit_selectors[key] = Click(selector=submit_selector)
196 | elif not isinstance(submit_selector, Click):
197 | raise TypeError(
198 | "Submit selector must be str or Click, "
199 | f"but {type(submit_selector)} provided"
200 | )
201 | return cls(recaptcha_solving, submit_selectors)
202 | 
203 | @staticmethod
204 | def is_recaptcha_producing_action(action) -> bool:
205 | return not isinstance(
206 | action,
207 | (Screenshot, Scroll, CustomJsAction, RecaptchaSolver),
208 | )
209 | 
210 | def process_request(self, request, **_):
211 | if request.meta.get("dont_recaptcha", False):
212 | return None
213 | 
214 | # Check whether the page should be closed after the action
215 | if isinstance(request, PuppeteerRequest):
216 | if self.is_recaptcha_producing_action(request.action): 217 | 
if request.close_page and not request.meta.get( 218 | "_captcha_submission", False 219 | ): 220 | request.close_page = False 221 | request.dont_filter = True 222 | self._page_closing.add(request) 223 | return request 224 | 225 | def process_response(self, request, response, spider): 226 | if not isinstance( 227 | response, PuppeteerResponse 228 | ): # We only work with PuppeteerResponses 229 | return response 230 | 231 | puppeteer_request = response.puppeteer_request 232 | if puppeteer_request.meta.get("dont_recaptcha", False): # Skip such responses 233 | return response 234 | 235 | if puppeteer_request.meta.pop( 236 | "_captcha_submission", False 237 | ): # Submitted captcha 238 | return self.__gen_response(response) 239 | 240 | if puppeteer_request.meta.pop("_captcha_solving", False): 241 | # RECaptchaSolver was called by recaptcha middleware 242 | return self._submit_recaptcha(request, response, spider) 243 | 244 | if not self.is_recaptcha_producing_action(puppeteer_request.action): 245 | # No recaptcha after these actions 246 | return response 247 | 248 | # Any puppeteer response besides PuppeteerRecaptchaSolverResponse 249 | return self._solve_recaptcha(request, response) 250 | 251 | def _solve_recaptcha(self, request, response): 252 | self._page_responses[response.page_id] = ( 253 | response # Saving main response to return it later 254 | ) 255 | 256 | recaptcha_solver = RecaptchaSolver( 257 | solve_recaptcha=self.recaptcha_solving, 258 | close_on_empty=self.__is_closing(response, remove_request=False), 259 | navigation_options={"waitUntil": "domcontentloaded"}, 260 | ) 261 | return response.follow( 262 | recaptcha_solver, 263 | callback=request.callback, 264 | cb_kwargs=request.cb_kwargs, 265 | errback=request.errback, 266 | meta={"_captcha_solving": True}, 267 | close_page=False, 268 | ) 269 | 270 | def _submit_recaptcha(self, request, response, spider): 271 | if not response.puppeteer_request.action.solve_recaptcha: 272 | spider.log( 273 | message=f"Found {len(response.recaptcha_data['captchas'])} captcha " 274 | f"but did not solve due to argument", 275 | level=logging.INFO, 276 | ) 277 | return self.__gen_response(response) 278 | # Click "submit button"? 
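# Note: a domain key matches when it is a substring of the response URL
# (``domain in response.url`` below), so full hosts and the abbreviated
# fragments shown in the class docstring select a submit action alike.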
279 | if response.recaptcha_data["captchas"] and self.submit_selectors: 280 | # We need to click "submit button" 281 | for domain, submitting in self.submit_selectors.items(): 282 | if domain in response.url: 283 | if not submitting.selector: 284 | return self.__gen_response(response) 285 | return response.follow( 286 | action=submitting, 287 | callback=request.callback, 288 | cb_kwargs=request.cb_kwargs, 289 | errback=request.errback, 290 | close_page=self.__is_closing(response), 291 | meta={"_captcha_submission": True}, 292 | ) 293 | raise IgnoreRequest( 294 | "No submit selector found to click on the page but captcha found" 295 | ) 296 | return self.__gen_response(response) 297 | 298 | def __gen_response(self, response): 299 | main_response_data = dict() 300 | main_response_data["page_id"] = ( 301 | None if self.__is_closing(response) else response.puppeteer_request.page_id 302 | ) 303 | 304 | main_response = self._page_responses.pop(response.page_id) 305 | 306 | if isinstance(main_response, PuppeteerHtmlResponse): 307 | if isinstance(response.puppeteer_request.action, RecaptchaSolver): 308 | main_response_data["body"] = response.html 309 | elif isinstance(response.puppeteer_request.action, Click): 310 | main_response_data["body"] = response.body 311 | 312 | return main_response.replace(**main_response_data) 313 | 314 | def __is_closing(self, response, remove_request: bool = True) -> bool: 315 | main_request = self._page_responses[response.page_id].puppeteer_request 316 | close_page = main_request in self._page_closing 317 | if close_page and remove_request: 318 | self._page_closing.remove(main_request) 319 | return close_page 320 | -------------------------------------------------------------------------------- /scrapypuppeteer/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List, Tuple, Union 3 | 4 | from scrapy.http import Headers, Request 5 | 6 | from scrapypuppeteer.actions import Compose, GoTo, PuppeteerServiceAction 7 | 8 | 9 | class ActionRequest(Request): 10 | """ 11 | Request with puppeteer action parameter and 12 | beautified representation. 13 | """ 14 | 15 | attributes: Tuple[str, ...] = Request.attributes + ("action",) 16 | """ 17 | A tuple of :class:`str` objects containing the name of all public 18 | attributes of the class that are also keyword parameters of the 19 | ``__init__`` method. 20 | """ 21 | 22 | def __init__(self, url: str, action: Union[str, PuppeteerServiceAction], **kwargs): 23 | self.action = action 24 | super().__init__(url, **kwargs) 25 | 26 | def __repr__(self): 27 | return f"<{self.action.endpoint.upper()} {self.meta.get('puppeteer_request', self).url}>" 28 | 29 | def __str__(self): 30 | return self.__repr__() 31 | 32 | 33 | class PuppeteerRequest(ActionRequest): 34 | """ 35 | Request to be executed in browser with puppeteer. 36 | """ 37 | 38 | attributes: Tuple[str, ...] = ActionRequest.attributes + ( 39 | "context_id", 40 | "page_id", 41 | "close_page", 42 | "include_headers", 43 | ) 44 | """ 45 | A tuple of :class:`str` objects containing the name of all public 46 | attributes of the class that are also keyword parameters of the 47 | ``__init__`` method. 
48 | 49 | Currently used by :meth:`PuppeteerRequest.replace` 50 | """ 51 | 52 | def __init__( 53 | self, 54 | action: Union[str, PuppeteerServiceAction], 55 | context_id: str = None, 56 | page_id: str = None, 57 | close_page: bool = True, 58 | include_headers: Union[bool, List[str]] = None, 59 | har_recording: bool = False, 60 | **kwargs, 61 | ): 62 | """ 63 | 64 | :param action: URL or browser action 65 | :param context_id: puppeteer browser context id; if None (default), 66 | new incognito context will be created 67 | :param page_id: puppeteer browser page id; if None (default), new 68 | page will be opened in given context 69 | :param close_page: whether to close page after request completion; 70 | set to False, if you want to continue interacting 71 | with the page 72 | :param include_headers: determines which headers will be sent to remote 73 | site by puppeteer: either True (all headers), 74 | False (no headers), list of header names 75 | or None (default, let middleware decide) 76 | :param kwargs: 77 | """ 78 | url = kwargs.pop("url", None) 79 | if isinstance(action, str): 80 | url = action 81 | navigation_options = kwargs.pop("navigation_options", None) 82 | wait_options = kwargs.pop("wait_options", None) 83 | action = GoTo( 84 | url, 85 | navigation_options=navigation_options, 86 | wait_options=wait_options, 87 | har_recording=har_recording, 88 | ) 89 | elif isinstance(action, GoTo): 90 | url = action.url 91 | elif isinstance(action, Compose): 92 | if isinstance(action.actions[0], GoTo): 93 | url = action.actions[0].url 94 | elif not isinstance(action, PuppeteerServiceAction): 95 | raise TypeError( 96 | f"Undefined browser action: `{type(action)}`. `Expected PuppeteerServiceAction`" 97 | ) 98 | if url is None: 99 | raise ValueError( 100 | "Request is not a goto-containing request and does not follow a response" 101 | ) 102 | super().__init__(url, action, **kwargs) 103 | self.context_id = context_id 104 | self.page_id = page_id 105 | self.close_page = close_page 106 | self.include_headers = include_headers 107 | 108 | 109 | class CloseContextRequest(Request): 110 | """ 111 | This request is used to close the browser contexts. 112 | 113 | The response for this request is a regular Scrapy HTMLResponse. 114 | """ 115 | 116 | attributes: Tuple[str, ...] = Request.attributes + ("contexts",) 117 | 118 | def __init__(self, contexts: List, **kwargs): 119 | """ 120 | :param contexts: list of puppeteer contexts to close. 121 | 122 | :param kwargs: arguments of scrapy.Request. 123 | """ 124 | self.contexts = contexts 125 | self.is_valid_url = False 126 | 127 | if "url" in kwargs: 128 | self.is_valid_url = True 129 | url = kwargs.pop("url", "://") # Incorrect url. 
To be replaced in middleware
130 | 
131 | kwargs["method"] = "POST"
132 | kwargs["headers"] = Headers({"Content-Type": "application/json"})
133 | kwargs["body"] = json.dumps(self.contexts)
134 | 
135 | super().__init__(url, **kwargs)
136 | 
137 | def __repr__(self):
138 | return f"<CLOSE CONTEXT {len(self.contexts)} contexts>"
139 | 
140 | def __str__(self):
141 | return self.__repr__()
142 | --------------------------------------------------------------------------------
/scrapypuppeteer/response.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Generator, Tuple, Union
3 | 
4 | import parsel
5 | from scrapy.exceptions import ScrapyDeprecationWarning
6 | from scrapy.http import HtmlResponse, TextResponse
7 | from scrapy.http.response.text import _url_from_selector
8 | from scrapy.link import Link
9 | 
10 | from scrapypuppeteer import PuppeteerRequest
11 | from scrapypuppeteer.actions import Compose, GoTo, PuppeteerServiceAction
12 | 
13 | 
14 | class PuppeteerResponse(TextResponse):
15 | attributes: Tuple[str, ...] = TextResponse.attributes + (
16 | "url",
17 | "puppeteer_request",
18 | "context_id",
19 | "page_id",
20 | )
21 | """
22 | A tuple of :class:`str` objects containing the name of all public
23 | attributes of the class that are also keyword parameters of the
24 | ``__init__`` method.
25 | 
26 | Currently used by :meth:`PuppeteerResponse.replace`.
27 | """
28 | 
29 | def __init__(
30 | self,
31 | url: str,
32 | puppeteer_request: PuppeteerRequest,
33 | context_id: str,
34 | page_id: str,
35 | **kwargs,
36 | ):
37 | self.puppeteer_request = puppeteer_request
38 | self.context_id = context_id
39 | self.page_id = page_id
40 | super().__init__(url, **kwargs)
41 | 
42 | def follow(
43 | self,
44 | action: Union[str, parsel.Selector, Link, PuppeteerServiceAction],
45 | close_page=True,
46 | accumulate_meta: bool = False,
47 | **kwargs,
48 | ) -> PuppeteerRequest:
49 | """
50 | Execute action on the same browser page.
51 | 
52 | :param action: URL (possibly relative) or browser action.
53 | :param close_page: whether to close page after request completion
54 | :param accumulate_meta: whether to accumulate meta from response
55 | :param kwargs:
56 | :return:
57 | """
58 | page_id = None if self.puppeteer_request.close_page else self.page_id
59 | if isinstance(action, str):
60 | action = self.urljoin(action)
61 | elif isinstance(action, parsel.Selector):
62 | action = self.urljoin(_url_from_selector(action))
63 | elif isinstance(action, Link):
64 | action = self.urljoin(action.url)
65 | elif isinstance(action, GoTo):
66 | action.url = self.urljoin(action.url)
67 | else:
68 | kwargs["url"] = self.url
69 | kwargs["dont_filter"] = True
70 | if accumulate_meta:
71 | kwargs["meta"] = {**self.meta, **kwargs.pop("meta", {})}
72 | return PuppeteerRequest(
73 | action,
74 | context_id=self.context_id,
75 | page_id=page_id,
76 | close_page=close_page,
77 | **kwargs,
78 | )
79 | 
80 | def follow_all(
81 | self,
82 | actions=None,
83 | close_page: bool = True,
84 | accumulate_meta: bool = False,
85 | css=None,
86 | xpath=None,
87 | **kwargs,
88 | ) -> Generator[PuppeteerRequest, None, None]:
89 | """
90 | Execute actions in the same context but in other browser pages.
91 | Only one of `actions`, `css`, or `xpath` must be specified.
92 | Note that the original page from which the method was called remains unaffected. 
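For example, opening every matched link in its own page while leaving the
current one open (a sketch): ``yield from response.follow_all(css="a.article", close_page=True)``.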
93 | 94 | :param actions: iterable of PuppeteerActions or selectors 95 | :param close_page: whether to close page after request completion 96 | :param accumulate_meta: whether to accumulate meta from response 97 | :param css: selector 98 | :param xpath: selector 99 | :return: Iterable[PuppeteerRequest] 100 | """ 101 | 102 | arguments = [x for x in (actions, css, xpath) if x is not None] 103 | if len(arguments) != 1: 104 | raise ValueError( 105 | "Please supply exactly one of the following arguments: actions, css, xpath" 106 | ) 107 | if not actions: 108 | if css: 109 | actions = self.css(css) 110 | if xpath: 111 | actions = self.xpath(xpath) 112 | else: 113 | # Ban any PuppeteerAction except GoTo and GoTo-like Compose 114 | for action in actions: 115 | if isinstance(action, PuppeteerServiceAction): 116 | if isinstance(action, Compose): 117 | action = action.actions[0] 118 | if not isinstance(action, GoTo): 119 | raise TypeError(f"Expected GoTo, got {type(action)}") 120 | 121 | page_id = self.page_id 122 | for action in actions: 123 | self.page_id = None # Substitution of page_id in order to create new page 124 | try: 125 | next_request = self.follow( 126 | action, 127 | close_page=close_page, 128 | accumulate_meta=accumulate_meta, 129 | **kwargs, 130 | ) 131 | finally: # To save the original state of response 132 | self.page_id = page_id 133 | yield next_request 134 | 135 | 136 | class PuppeteerHtmlResponse(PuppeteerResponse, HtmlResponse): 137 | """ 138 | scrapy.TextResponse capturing state of a page in browser. 139 | Additionally, exposes received html and cookies via corresponding attributes. 140 | """ 141 | 142 | attributes: Tuple[str, ...] = tuple( 143 | set(PuppeteerResponse.attributes + HtmlResponse.attributes) 144 | ) + ("html", "cookies") 145 | """ 146 | A tuple of :class:`str` objects containing the name of all public 147 | attributes of the class that are also keyword parameters of the 148 | ``__init__`` method. 149 | 150 | Currently used by :meth:`PuppeteerResponse.replace`. 151 | """ 152 | 153 | def __init__(self, url, puppeteer_request, context_id, page_id, **kwargs): 154 | self.html = kwargs.pop("html") 155 | self.cookies = kwargs.pop("cookies") 156 | kwargs.setdefault("body", self.html) 157 | kwargs.setdefault("encoding", "utf-8") 158 | kwargs.setdefault("headers", {}).setdefault("Content-Type", "text/html") 159 | super().__init__(url, puppeteer_request, context_id, page_id, **kwargs) 160 | 161 | 162 | class PuppeteerScreenshotResponse(PuppeteerResponse): 163 | """ 164 | Response for Screenshot action. 165 | Screenshot is available via self.screenshot as base64 encoded string. 166 | """ 167 | 168 | attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("screenshot",) 169 | 170 | def __init__(self, url, puppeteer_request, context_id, page_id, **kwargs): 171 | self.screenshot = kwargs.pop("screenshot") 172 | super().__init__(url, puppeteer_request, context_id, page_id, **kwargs) 173 | 174 | 175 | class PuppeteerHarResponse(PuppeteerResponse): 176 | """ 177 | Response for Har action. 178 | Har is available via self.har. 179 | """ 180 | 181 | attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("har",) 182 | 183 | def __init__(self, url, puppeteer_request, context_id, page_id, **kwargs): 184 | self.har = kwargs.pop("har") 185 | super().__init__(url, puppeteer_request, context_id, page_id, **kwargs) 186 | 187 | 188 | class PuppeteerJsonResponse(PuppeteerResponse): 189 | """ 190 | Response for CustomJsAction. 191 | Result is available via self.data object. 
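For example, if the custom JavaScript function returns ``{"title": ...}``,
a spider can read it as ``response.data["title"]`` (a sketch, assuming the
service passes the function's return value through unchanged).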
192 | """
193 | 
194 | attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("data",)
195 | 
196 | def __init__(self, url, puppeteer_request, context_id, page_id, data, **kwargs):
197 | kwargs["headers"] = {"Content-Type": "application/json"}
198 | self.data = data
199 | super().__init__(url, puppeteer_request, context_id, page_id, **kwargs)
200 | 
201 | def to_html(self) -> PuppeteerHtmlResponse:
202 | """
203 | Tries to convert a PuppeteerJsonResponse to a PuppeteerHtmlResponse.
204 | For this, self.data must be a dict.
205 | It must have an "html" key with a string containing the page content
206 | and a "cookies" key with a list of cookies or None.
207 | 
208 | If the .data dict lacks these keys, a KeyError is raised.
209 | """
210 | if not isinstance(self.data, dict):
211 | raise TypeError(
212 | "PuppeteerJsonResponse's .data property must be a dict "
213 | "to convert it to a PuppeteerHtmlResponse."
214 | )
215 | 
216 | kwargs = dict()
217 | for attr in PuppeteerResponse.attributes:
218 | kwargs[attr] = getattr(self, attr)
219 | kwargs["html"] = self.data["html"]
220 | kwargs["body"] = kwargs["html"]
221 | kwargs["cookies"] = self.data["cookies"]
222 | kwargs["headers"].update({"Content-Type": ["text/html"]})
223 | kwargs["encoding"] = "utf-8"
224 | 
225 | return PuppeteerHtmlResponse(**kwargs)
226 | 
227 | 
228 | class PuppeteerRecaptchaSolverResponse(PuppeteerJsonResponse, PuppeteerHtmlResponse):
229 | """
230 | Response for RecaptchaSolver.
231 | Result is available via self.recaptcha_data and via self.data["recaptcha_data"]
232 | (deprecated, to be removed in future versions).
233 | Visit
234 | https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object
235 | for information about the result object.
236 | """
237 | 
238 | attributes: Tuple[str, ...] = tuple(
239 | set(PuppeteerHtmlResponse.attributes + PuppeteerJsonResponse.attributes)
240 | ) + ("recaptcha_data",)
241 | 
242 | @property
243 | def data(self):
244 | warnings.warn(
245 | "self.data['recaptcha_data'] is deprecated and scheduled for removal in future versions. 
" 246 | "Use self.recaptcha_data instead.", 247 | ScrapyDeprecationWarning, 248 | stacklevel=2, 249 | ) 250 | return self._data 251 | 252 | @data.setter 253 | def data(self, value): 254 | self._data = value 255 | 256 | def __init__( 257 | self, url, puppeteer_request, context_id, page_id, recaptcha_data, **kwargs 258 | ): 259 | kwargs["headers"] = {"Content-Type": "application/json"} 260 | self._data = {"recaptcha_data": recaptcha_data} 261 | self.recaptcha_data = recaptcha_data 262 | super().__init__( 263 | url, puppeteer_request, context_id, page_id, self._data, **kwargs 264 | ) 265 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import find_packages, setup 4 | 5 | 6 | def read_long_description(file_path): 7 | with open(file_path, "r") as file: 8 | return file.read() 9 | 10 | 11 | setup( 12 | name="scrapy-puppeteer-client", 13 | version="0.3.9", 14 | description="A library to use Puppeteer-managed browser in Scrapy spiders", 15 | long_description=read_long_description("README.md"), 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/ispras/scrapy-puppeteer", 18 | author="MODIS @ ISP RAS", 19 | maintainer="Maksim Varlamov", 20 | maintainer_email="varlamov@ispras.ru", 21 | packages=find_packages(), 22 | install_requires=["scrapy>=2.6", "pyppeteer", "syncer", "bs4", "playwright"], 23 | python_requires=">=3.6", 24 | license="BSD", 25 | classifiers=[ 26 | "Development Status :: 3 - Alpha", 27 | "Programming Language :: Python :: 3", 28 | "Programming Language :: Python :: 3.6", 29 | "Programming Language :: Python :: 3.7", 30 | "Programming Language :: Python :: 3.8", 31 | "Programming Language :: Python :: 3.9", 32 | "Programming Language :: Python :: 3.10", 33 | "Programming Language :: Python :: 3.11", 34 | "Framework :: Scrapy", 35 | "Intended Audience :: Developers", 36 | "Operating System :: OS Independent", 37 | "License :: OSI Approved :: BSD License", 38 | ], 39 | ) 40 | -------------------------------------------------------------------------------- /tests/actions/constants.py: -------------------------------------------------------------------------------- 1 | from itertools import combinations 2 | from random import randint 3 | 4 | URLS = ("https://some_url.com", "not_url/not_url") 5 | WAIT_UNTIL = ("load", "domcontentloaded", "networkidle0") 6 | WAIT_OPTS = [None] 7 | SELECTORS = ("nothing", "tr.td::attr(something)") 8 | CLICK_OPTS = [None] 9 | HAR_RECORDING = [None] 10 | 11 | 12 | def __gen_nav_opts(): 13 | options = [None] 14 | for opt_num in range(1, 5): 15 | for comb in combinations(WAIT_UNTIL, opt_num): 16 | timeout = randint(0, 100) * 1000 17 | options.append( 18 | { 19 | "timeout": timeout, 20 | "waitUntil": list(comb), 21 | } 22 | ) 23 | return options 24 | 25 | 26 | NAV_OPTS = __gen_nav_opts() 27 | -------------------------------------------------------------------------------- /tests/actions/test_actions.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | 3 | from constants import CLICK_OPTS, HAR_RECORDING, NAV_OPTS, SELECTORS, URLS, WAIT_OPTS 4 | from pytest import mark 5 | 6 | from scrapypuppeteer.actions import Click, GoBack, GoForward, GoTo, Scroll 7 | 8 | 9 | def _gen_goto(): 10 | for url, nav_opt, wait_opt, har_recording in product( 11 | URLS, NAV_OPTS, WAIT_OPTS, HAR_RECORDING 12 | ): 13 | 
--------------------------------------------------------------------------------
/tests/actions/test_actions.py:
--------------------------------------------------------------------------------
1 | from itertools import product
2 | 
3 | from constants import CLICK_OPTS, HAR_RECORDING, NAV_OPTS, SELECTORS, URLS, WAIT_OPTS
4 | from pytest import mark
5 | 
6 | from scrapypuppeteer.actions import Click, GoBack, GoForward, GoTo, Scroll
7 | 
8 | 
9 | def _gen_goto():
10 |     for url, nav_opt, wait_opt, har_recording in product(
11 |         URLS, NAV_OPTS, WAIT_OPTS, HAR_RECORDING
12 |     ):
13 |         expected = {
14 |             "url": url,
15 |             "navigationOptions": nav_opt,
16 |             "waitOptions": wait_opt,
17 |             "harRecording": har_recording,
18 |         }
19 |         yield url, nav_opt, wait_opt, har_recording, expected
20 | 
21 | 
22 | def _gen_back_forward():
23 |     for nav_opt, wait_opt in product(NAV_OPTS, WAIT_OPTS):
24 |         expected = {
25 |             "navigationOptions": nav_opt,
26 |             "waitOptions": wait_opt,
27 |         }
28 |         yield nav_opt, wait_opt, expected
29 | 
30 | 
31 | def _gen_click():
32 |     for selector, click_opt, nav_opt, wait_opt in product(
33 |         SELECTORS, CLICK_OPTS, NAV_OPTS, WAIT_OPTS
34 |     ):
35 |         expected = {
36 |             "selector": selector,
37 |             "clickOptions": click_opt,
38 |             "waitOptions": wait_opt,
39 |             "navigationOptions": nav_opt,
40 |         }
41 |         yield selector, click_opt, nav_opt, wait_opt, expected
42 | 
43 | 
44 | def _gen_scroll():
45 |     for selector, wait_opt in product(SELECTORS, WAIT_OPTS):
46 |         expected = {"selector": selector, "waitOptions": wait_opt}
47 |         yield selector, wait_opt, expected
48 | 
49 | 
50 | @mark.parametrize(
51 |     "url, navigation_options, wait_options, har_recording, expected", _gen_goto()
52 | )
53 | def test_goto(url, navigation_options, wait_options, har_recording, expected):
54 |     action = GoTo(url, navigation_options, wait_options, har_recording)
55 |     assert action.payload() == expected
56 | 
57 | 
58 | @mark.parametrize("navigation_options, wait_options, expected", _gen_back_forward())
59 | def test_go_forward(navigation_options, wait_options, expected):
60 |     action = GoForward(navigation_options, wait_options)
61 |     assert action.payload() == expected
62 | 
63 | 
64 | @mark.parametrize("navigation_options, wait_options, expected", _gen_back_forward())
65 | def test_go_back(navigation_options, wait_options, expected):
66 |     action = GoBack(navigation_options, wait_options)
67 |     assert action.payload() == expected
68 | 
69 | 
70 | @mark.parametrize(
71 |     "selector, click_options, navigation_options, wait_options, expected", _gen_click()
72 | )
73 | def test_click(selector, click_options, navigation_options, wait_options, expected):
74 |     action = Click(selector, click_options, wait_options, navigation_options)
75 |     assert action.payload() == expected
76 | 
77 | 
78 | @mark.parametrize("selector, wait_options, expected", _gen_scroll())
79 | def test_scroll(selector, wait_options, expected):
80 |     action = Scroll(selector, wait_options)
81 |     assert action.payload() == expected
82 | 
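
As a concrete illustration of the payload contract these tests check, a single
Click case could be written out by hand like this (selector and options are
arbitrary example values, passed positionally as in test_click above):

    from scrapypuppeteer.actions import Click

    action = Click("a.next", None, None, {"timeout": 3000, "waitUntil": ["load"]})
    assert action.payload() == {
        "selector": "a.next",
        "clickOptions": None,
        "waitOptions": None,
        "navigationOptions": {"timeout": 3000, "waitUntil": ["load"]},
    }
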
--------------------------------------------------------------------------------
/tests/middleware/test_middleware.py:
--------------------------------------------------------------------------------
1 | from scrapy.utils.test import get_crawler
2 | from twisted.internet import defer
3 | from twisted.trial.unittest import TestCase
4 | 
5 | from tests.mockserver import MockServer
6 | from tests.spiders import (
7 |     ClickSpider,
8 |     CustomJsActionSpider,
9 |     GoBackForwardSpider,
10 |     GoToSpider,
11 |     RecaptchaSolverSpider,
12 |     ScreenshotSpider,
13 | )
14 | 
15 | 
16 | class PuppeteerCrawlTest(TestCase):
17 |     SETTINGS = {
18 |         "DOWNLOADER_MIDDLEWARES": {
19 |             "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042
20 |         },
21 |         "PUPPETEER_SERVICE_URL": None,
22 |     }
23 | 
24 |     def setUp(self):
25 |         self.mockserver = MockServer()
26 |         self.mockserver.__enter__()
27 |         self.SETTINGS["PUPPETEER_SERVICE_URL"] = self.mockserver.http_address
28 | 
29 |     def tearDown(self):
30 |         self.mockserver.__exit__(None, None, None)
31 | 
32 |     def _start_testing(self, spider_cls, expected):
33 |         crawler = get_crawler(spider_cls, self.SETTINGS)
34 |         yield crawler.crawl(mockserver=self.mockserver)
35 |         self.assertEqual(expected, len(crawler.spider.urls_visited))
36 | 
37 |     @defer.inlineCallbacks
38 |     def test_goto(self):
39 |         yield from self._start_testing(GoToSpider, 1)
40 | 
41 |     @defer.inlineCallbacks
42 |     def test_back_forward(self):
43 |         yield from self._start_testing(GoBackForwardSpider, 1)
44 | 
45 |     @defer.inlineCallbacks
46 |     def test_click(self):
47 |         yield from self._start_testing(ClickSpider, 1)
48 | 
49 |     @defer.inlineCallbacks
50 |     def test_screenshot(self):
51 |         yield from self._start_testing(ScreenshotSpider, 1)
52 | 
53 |     @defer.inlineCallbacks
54 |     def test_custom_js_action(self):
55 |         yield from self._start_testing(CustomJsActionSpider, 1)
56 | 
57 |     @defer.inlineCallbacks
58 |     def test_recaptcha_solver(self):
59 |         yield from self._start_testing(RecaptchaSolverSpider, 1)
60 | 
--------------------------------------------------------------------------------
/tests/middleware/view.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy import Request
3 | 
4 | 
5 | class ViewSpider(scrapy.Spider):
6 |     name = "view"
7 | 
8 |     start_urls = ["https://www.google.com/recaptcha/api2/demo"]
9 | 
10 |     custom_settings = {}
11 | 
12 |     def start_requests(self):
13 |         for url in self.start_urls:
14 |             yield Request(url, callback=self.parse, errback=self.errback)
15 | 
16 |     def parse(self, response, **kwargs):
17 |         self.log("WE ARE PARSING RESPONSE!")
18 |         self.log(response)
19 |         self.log(response.body)
20 |         self.log("WE HAVE PARSED RESPONSE!")
21 | 
22 |     def errback(self, failure):
23 |         self.log("We are in error processing!")
24 |         self.log(failure)
25 | 
--------------------------------------------------------------------------------
/tests/mockserver.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 | from base64 import b64encode
5 | from json import dumps
6 | from pathlib import Path
7 | from secrets import token_hex
8 | from subprocess import PIPE, Popen
9 | from typing import Dict
10 | 
11 | from scrapy.utils.python import to_bytes
12 | from twisted.internet import reactor
13 | from twisted.internet.protocol import ServerFactory
14 | from twisted.internet.task import deferLater
15 | from twisted.web import resource
16 | from twisted.web.server import NOT_DONE_YET, Site
17 | 
18 | 
19 | def get_arg(request, name, default=None, arg_type=None):
20 |     if name in request.args:
21 |         value = request.args[name][0]
22 |         if arg_type is not None:
23 |             value = arg_type(value)
24 |         return value
25 |     return default
26 | 
27 | 
28 | def get_mockserver_env() -> Dict[str, str]:
29 |     """Return an OS environment dict suitable to run mockserver processes."""
30 | 
31 |     tests_path = Path(__file__).parent.parent
32 |     python_path = str(tests_path) + os.pathsep + os.environ.get("PYTHONPATH", "")
33 |     env = os.environ.copy()
34 |     env["PYTHONPATH"] = python_path
35 |     return env
36 | 
37 | 
38 | class LeafResource(resource.Resource):
39 |     isLeaf = True
40 | 
41 |     def render_POST(self, request):
42 |         page_id = get_arg(request, b"pageId", default=None, arg_type=str)
43 |         context_id = get_arg(request, b"contextId", default=None, arg_type=str)
44 |         close_page = get_arg(request, b"closePage", default=0, arg_type=bool)
45 | 
46 |         request.setHeader(b"Content-Type", b"application/json")
47 | 
48 |         self.defer_request(
49 |             request, 0, self.render_request, request, page_id, context_id, close_page
50 |         )
51 |         return NOT_DONE_YET
52 | 
53 |     @staticmethod
54 |     def defer_request(request, delay, render_func, *args, **kwargs):
55 |         def _cancel_request(_):
56 |             # silence CancelledError
57 |             d.addErrback(lambda _: None)
58 |             d.cancel()
59 | 
60 |         d = deferLater(reactor, delay, render_func, *args, **kwargs)
61 |         request.notifyFinish().addErrback(_cancel_request)
62 |         return d
63 | 
64 |     def render_request(self, request, page_id, context_id, close_page):
65 |         request.write(
66 |             to_bytes(dumps(self._form_response(page_id, context_id, close_page)))
67 |         )
68 |         request.finish()
69 | 
70 |     def _form_response(self, page_id, context_id, close_page):
71 |         raise NotImplementedError
72 | 
73 | 
74 | class GoTo(LeafResource):
75 |     def _form_response(self, page_id, context_id, close_page):
76 |         html = """
77 | 
78 |         """
79 |         return {
80 |             "contextId": token_hex(20),
81 |             "pageId": token_hex(20),
82 |             "html": html,
83 |             "cookies": None,
84 |         }
85 | 
86 | 
87 | class GoForward(LeafResource):
88 |     def _form_response(self, page_id, context_id, close_page):
89 |         html = """
90 |         went forward
91 |         """
92 |         return {
93 |             "contextId": context_id,
94 |             "pageId": page_id,
95 |             "html": html,
96 |             "cookies": None,
97 |         }
98 | 
99 | 
100 | class Back(LeafResource):
101 |     def _form_response(self, page_id, context_id, close_page):
102 |         html = """
103 |         went back
104 |         """
105 |         return {
106 |             "contextId": context_id,
107 |             "pageId": page_id,
108 |             "html": html,
109 |             "cookies": None,
110 |         }
111 | 
112 | 
113 | class Click(LeafResource):
114 |     def _form_response(self, page_id, context_id, close_page):
115 |         html = """
116 |         clicked
117 |         """
118 |         return {
119 |             "contextId": context_id,
120 |             "pageId": page_id,
121 |             "html": html,
122 |             "cookies": None,
123 |         }
124 | 
125 | 
126 | class Screenshot(LeafResource):
127 |     def _form_response(self, page_id, context_id, close_page):
128 |         with open("./tests/scrapy_logo.png", "rb") as image:
129 |             return {
130 |                 "contextId": context_id,
131 |                 "pageId": page_id,
132 |                 "screenshot": b64encode(image.read()).decode(),
133 |             }
134 | 
135 | 
136 | class RecaptchaSolver(LeafResource):
137 |     def _form_response(self, page_id, context_id, close_page):
138 |         html = """
139 |         there is recaptcha on the page!
140 |         """
141 |         return {
142 |             "contextId": context_id,
143 |             "pageId": page_id,
144 |             "html": html,
145 |             "cookies": None,
146 |             "recaptcha_data": {
147 |                 "captchas": [1],  # 1 captcha
148 |                 "some_other_fields": [],
149 |             },
150 |         }
151 | 
152 | 
153 | class CustomJsAction(LeafResource):
154 |     def _form_response(self, page_id, context_id, close_page):
155 |         return {
156 |             "contextId": context_id,
157 |             "pageId": page_id,
158 |             "data": {"field": "Hello!"},
159 |         }
160 | 
161 | 
162 | class CloseContext(LeafResource):
163 |     def render_request(self, request, page_id, context_id, close_page):
164 |         request.finish()
165 | 
166 | 
167 | class Root(resource.Resource):
168 |     def __init__(self):
169 |         resource.Resource.__init__(self)
170 |         self.putChild(b"goto", GoTo())
171 |         self.putChild(b"forward", GoForward())
172 |         self.putChild(b"back", Back())
173 |         self.putChild(b"click", Click())
174 |         self.putChild(b"screenshot", Screenshot())
175 |         self.putChild(b"action", CustomJsAction())
176 |         self.putChild(b"recaptcha_solver", RecaptchaSolver())
177 |         self.putChild(b"close_context", CloseContext())
178 | 
179 |     def getChild(self, name, request):
180 |         return self
181 | 
182 | 
183 | class MockServer:
184 |     def __enter__(self):
185 |         self.proc = Popen(
186 |             [sys.executable, "-u", "-m", "tests.mockserver", "-t", "http"],
187 |             stdout=PIPE,
188 |             env=get_mockserver_env(),
189 |         )
190 |         self.http_address = self.proc.stdout.readline().strip().decode("ascii")
191 | 
192 |         return self
193 | 
194 |     def __exit__(self, exc_type, exc_value, traceback):
195 |         self.proc.kill()
196 |         self.proc.communicate()
197 | 
198 |     def url(self, path):
199 |         host = self.http_address.replace("0.0.0.0", "127.0.0.1")
200 |         return host + path
201 | 
202 | 
203 | def main():
204 |     parser = argparse.ArgumentParser()
205 |     parser.add_argument("-t", "--type", type=str, choices=("http",), default="http")
206 |     args = parser.parse_args()
207 | 
208 |     if args.type == "http":
209 |         root = Root()
210 |         factory: ServerFactory = Site(root)
211 |         http_port = reactor.listenTCP(0, factory)
212 | 
213 |         def print_listening():
214 |             http_host = http_port.getHost()
215 |             http_address = f"http://{http_host.host}:{http_host.port}"
216 |             print(http_address)
217 | 
218 |         reactor.callWhenRunning(print_listening)
219 |         reactor.run()
220 | 
221 | 
222 | if __name__ == "__main__":
223 |     main()
224 | 
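
A minimal sketch of using MockServer on its own, e.g. when debugging the test
suite locally (the endpoint path is one of those registered in Root above):

    from tests.mockserver import MockServer

    with MockServer() as server:
        # http_address is read from the child process's stdout;
        # url() swaps 0.0.0.0 for a connectable loopback address
        print(server.url("/goto"))
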
--------------------------------------------------------------------------------
/tests/scrapy_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ispras/scrapy-puppeteer/f666232c98a34bbfcaf21aabff51cab54627e62e/tests/scrapy_logo.png
--------------------------------------------------------------------------------
/tests/spiders.py:
--------------------------------------------------------------------------------
1 | from scrapy import Spider
2 | 
3 | from scrapypuppeteer import PuppeteerRequest
4 | from scrapypuppeteer.actions import (
5 |     Click,
6 |     CustomJsAction,
7 |     GoBack,
8 |     GoForward,
9 |     GoTo,
10 |     RecaptchaSolver,
11 |     Screenshot,
12 | )
13 | 
14 | 
15 | class MockServerSpider(Spider):
16 |     def __init__(self, mockserver=None, *args, **kwargs):
17 |         super().__init__(*args, **kwargs)
18 |         self.mockserver = mockserver
19 | 
20 | 
21 | class MetaSpider(MockServerSpider):
22 |     name = "meta"
23 | 
24 |     def __init__(self, *args, **kwargs):
25 |         super().__init__(*args, **kwargs)
26 |         self.meta = {}
27 | 
28 |     def closed(self, reason):
29 |         self.meta["close_reason"] = reason
30 | 
31 |     @staticmethod
32 |     def errback(failure):
33 |         print(failure)
34 | 
35 | 
36 | class GoToSpider(MetaSpider):
37 |     name = "goto"
38 | 
39 |     def __init__(self, *args, **kwargs):
40 |         super().__init__(*args, **kwargs)
41 |         self.urls_visited = []
42 | 
43 |     def start_requests(self):
44 |         yield PuppeteerRequest(
45 |             GoTo("https://some_url.com"),
46 |             callback=self.parse,
47 |             errback=self.errback,
48 |             close_page=False,
49 |         )
50 | 
51 |     def parse(self, response, **kwargs):
52 |         body = b"""
53 | 
54 |         """
55 |         if response.body == body:
56 |             self.urls_visited.append(response.url)
57 | 
58 | 
59 | class ClickSpider(MetaSpider):
60 |     name = "click"
61 | 
62 |     def __init__(self, *args, **kwargs):
63 |         super().__init__(*args, **kwargs)
64 |         self.urls_visited = []
65 | 
66 |     def start_requests(self):
67 |         yield PuppeteerRequest(
68 |             GoTo("https://some_url.com"),
69 |             callback=self.click,
70 |             errback=self.errback,
71 |             close_page=False,
72 |         )
73 | 
74 |     def click(self, response, **kwargs):
75 |         yield response.follow(
76 |             Click("the_selector"),
77 |             callback=self.parse,
78 |             errback=self.errback,
79 |             close_page=False,
80 |         )
81 | 
82 |     def parse(self, response, **kwargs):
83 |         body = b"""
84 |         clicked
85 |         """
86 |         if response.body == body:
87 |             self.urls_visited.append(response.url)
88 | 
89 | 
90 | class ScreenshotSpider(MetaSpider):
91 |     name = "screenshot"
92 | 
93 |     def __init__(self, *args, **kwargs):
94 |         super().__init__(*args, **kwargs)
95 |         self.urls_visited = []
96 | 
97 |     def start_requests(self):
98 |         yield PuppeteerRequest(
99 |             GoTo("https://some_url.com"),
100 |             callback=self.screenshot,
101 |             errback=self.errback,
102 |             close_page=False,
103 |         )
104 | 
105 |     def screenshot(self, response, **kwargs):
106 |         yield response.follow(
107 |             Screenshot(), callback=self.parse, errback=self.errback, close_page=False
108 |         )
109 | 
110 |     def parse(self, response, **kwargs):
111 |         from base64 import b64encode
112 | 
113 |         with open("./tests/scrapy_logo.png", "rb") as image:
114 |             if b64encode(image.read()).decode() == response.screenshot:
115 |                 self.urls_visited.append(response.url)
116 | 
117 | 
118 | class CustomJsActionSpider(MetaSpider):
119 |     name = "custom_js_action"
120 | 
121 |     def __init__(self, *args, **kwargs):
122 |         super().__init__(*args, **kwargs)
123 |         self.urls_visited = []
124 | 
125 |     def start_requests(self):
126 |         yield PuppeteerRequest(
127 |             GoTo("https://some_url.com"),
128 |             callback=self.action,
129 |             errback=self.errback,
130 |             close_page=False,
131 |         )
132 | 
133 |     def action(self, response, **kwargs):
134 |         js_function = """
135 |         some js function
136 |         """
137 |         yield response.follow(
138 |             CustomJsAction(js_function),
139 |             callback=self.parse,
140 |             errback=self.errback,
141 |             close_page=False,
142 |         )
143 | 
144 |     def parse(self, response, **kwargs):
145 |         response_data = {"field": "Hello!"}
146 |         if response.data == response_data:
147 |             self.urls_visited.append(response.url)
148 | 
149 | 
150 | class GoBackForwardSpider(MetaSpider):
151 |     name = "go_back_forward"
152 | 
153 |     def __init__(self, *args, **kwargs):
154 |         super().__init__(*args, **kwargs)
155 |         self.urls_visited = []
156 | 
157 |     def start_requests(self):
158 |         yield PuppeteerRequest(
159 |             GoTo("https://some_url.com"),
160 |             callback=self.go_next,
161 |             errback=self.errback,
162 |             close_page=False,
163 |         )
164 | 
165 |     def go_next(self, response, **kwargs):
166 |         yield response.follow(
167 |             GoTo("/article"),
168 |             callback=self.go_back,
169 |             errback=self.errback,
170 |             close_page=False,
171 |         )
172 | 
173 |     def go_back(self, response, **kwargs):
174 |         yield response.follow(
175 |             GoBack(), callback=self.go_forward, errback=self.errback, close_page=False
176 |         )
177 | 
178 |     def go_forward(self, response, **kwargs):
179 |         body = b"""
180 |         went back
181 |         """
182 | 
183 |         assert response.body == body
184 |         yield response.follow(
185 |             GoForward(), callback=self.parse, errback=self.errback, close_page=False
186 |         )
187 | 
188 |     def parse(self, response, **kwargs):
189 |         body = b"""
190 |         went forward
191 |         """
192 |         if response.body == body:
193 |             self.urls_visited.append(response.url)
194 | 
195 | 
196 | class RecaptchaSolverSpider(MetaSpider):
197 |     name = "recaptcha_solver"
198 | 
199 |     def __init__(self, *args, **kwargs):
200 |         super().__init__(*args, **kwargs)
201 |         self.urls_visited = []
202 | 
203 |     def start_requests(self):
204 |         yield PuppeteerRequest(
205 |             GoTo("https://some_url.com/with_captcha"),
206 |             callback=self.solve_recaptcha,
207 |             errback=self.errback,
208 |             close_page=False,
209 |         )
210 | 
211 |     def solve_recaptcha(self, response, **kwargs):
212 |         yield response.follow(
213 |             RecaptchaSolver(solve_recaptcha=True),
214 |             callback=self.parse,
215 |             errback=self.errback,
216 |             close_page=False,
217 |         )
218 | 
219 |     def parse(self, response, **kwargs):
220 |         if response.data["recaptcha_data"]["captchas"] == [
221 |             1
222 |         ] and response.recaptcha_data["captchas"] == [1]:
223 |             self.urls_visited.append(response.url)
224 | 
--------------------------------------------------------------------------------
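
To run spiders like these against a real scrapy-puppeteer service rather than the
mock, a project's settings would need roughly the following sketch, mirroring the
SETTINGS dict in tests/middleware/test_middleware.py (the service URL below is a
placeholder for wherever the service is actually listening):

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042
    }
    PUPPETEER_SERVICE_URL = "http://localhost:3000"
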