├── .bumpversion.cfg ├── .github └── workflows │ ├── checks.yml │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── changelog.md └── faq.md ├── examples ├── .gitignore ├── books.py ├── books │ └── .gitignore ├── contexts.py ├── download.py ├── events.py ├── exception_errback.py ├── exception_middleware.py ├── headers.py ├── init_page.py ├── max_pages.py ├── post.py ├── scroll.py └── storage.py ├── pylintrc ├── pyproject.toml ├── scrapy_playwright ├── __init__.py ├── _utils.py ├── handler.py ├── headers.py ├── memusage.py └── page.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── launch_chromium_server.js ├── mockserver.py ├── site │ ├── data │ │ ├── quotes1.json │ │ ├── quotes2.json │ │ └── quotes3.json │ ├── files │ │ └── mancha.pdf │ ├── gallery.html │ ├── index.html │ ├── lorem_ipsum.html │ ├── redirect.html │ ├── scroll.html │ └── static │ │ ├── bootstrap.min.css │ │ ├── img │ │ ├── ales-krivec-ZMZHcvIVgbg-unsplash.jpg │ │ ├── elyssa-fahndrich-MF16lGb95WY-unsplash.jpg │ │ └── nathan-dumlao-RCfalHrnFAs-unsplash.jpg │ │ ├── jquery.js │ │ └── main.css ├── tests_asyncio │ ├── __init__.py │ ├── test_browser.py │ ├── test_browser_contexts.py │ ├── test_extensions.py │ ├── test_headers.py │ ├── test_page_methods.py │ ├── test_playwright_requests.py │ ├── test_settings.py │ └── test_utils.py └── tests_twisted │ ├── __init__.py │ └── test_mixed_requests.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.0.43 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:scrapy_playwright/__init__.py] 7 | -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | name: Checks 2 | on: [push, pull_request, workflow_dispatch] 3 | 4 | jobs: 5 | checks: 6 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 7 | runs-on: ubuntu-latest 8 | timeout-minutes: 5 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | include: 13 | - env: 14 | TOXENV: bandit 15 | - env: 16 | TOXENV: black 17 | - env: 18 | TOXENV: flake8 19 | - env: 20 | TOXENV: typing 21 | - env: 22 | TOXENV: pylint 23 | 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - name: Set up Python 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: 3.11 31 | 32 | - name: Run check 33 | env: ${{ matrix.env }} 34 | run: | 35 | pip install -U pip 36 | pip install -U tox 37 | tox 38 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 5 10 | 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: 3.11 18 | 19 | - name: Publish to PyPI 20 | run: | 21 | pip install --upgrade pip 22 | pip install --upgrade setuptools wheel twine 23 | python setup.py sdist bdist_wheel 24 | export TWINE_USERNAME=__token__ 25 | export TWINE_PASSWORD=${{ secrets.PYPI_TOKEN }} 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [push, pull_request, workflow_dispatch] 3 | 4 | jobs: 5 | tests: 6 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 7 | runs-on: ${{ matrix.os }} 8 | timeout-minutes: 20 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | os: [ubuntu-22.04] 13 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 14 | include: 15 | - os: macos-14 16 | python-version: "3.12" 17 | - os: windows-2022 18 | python-version: "3.12" 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Set up node 29 | uses: actions/setup-node@v4 30 | with: 31 | node-version: 18 32 | 33 | - name: Install tox 34 | run: pip install tox 35 | 36 | - name: Run asyncio tests 37 | run: tox -e py 38 | 39 | - name: Run twisted tests 40 | run: tox -e py-twisted 41 | 42 | - name: Upload coverage report (Linux) 43 | if: runner.os == 'Linux' 44 | env: 45 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 46 | run: | 47 | curl -Os https://uploader.codecov.io/latest/linux/codecov 48 | chmod +x codecov 49 | ./codecov 50 | 51 | - name: Upload coverage report (macOS) 52 | if: runner.os == 'macOS' 53 | env: 54 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 55 | run: | 56 | curl -Os https://uploader.codecov.io/latest/macos/codecov 57 | chmod +x codecov 58 | ./codecov 59 | 60 | - name: Upload coverage report (Windows) 61 | if: runner.os == 'Windows' 62 | env: 63 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 64 | run: | 65 | $ProgressPreference = 'SilentlyContinue' 66 | Invoke-WebRequest -Uri https://uploader.codecov.io/latest/windows/codecov.exe -Outfile codecov.exe 67 | .\codecov.exe 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .~lock* 3 | .DS_Store 4 | .mypy_cache/ 5 | *.egg-info/ 6 | .tox/ 7 | build/ 8 | dist/ 9 | examples/*.png 10 | pip-wheel-metadata/ 11 | 12 | # coverage 13 | .coverage 14 | .coverage.* 15 | htmlcov/ 16 | coverage.xml 17 | coverage-*.xml 18 | coverage-asyncio/ 19 | coverage-twisted/ 20 | 21 | # nodejs stuff 22 | node_modules/ 23 | package-lock.json 24 | package.json 25 | 26 | .idea 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Eugenio Lacuesta 2 | 3 | Redistribution and use in source and binary forms, with or without modification, 4 | are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation and/or 11 | other materials provided with the distribution. 12 | 13 | 3. Neither the name of the copyright holder nor the names of its contributors 14 | may be used to endorse or promote products derived from this software without 15 | specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 21 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 24 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | # scrapy-playwright changelog 2 | 3 | 4 | ### [v0.0.43](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.43) (2025-02-22) 5 | 6 | * Only register request and response loggers when needed (#336) 7 | 8 | 9 | ### [v0.0.42](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.42) (2024-11-06) 10 | 11 | * Allow custom PageMethod callbacks (#318) 12 | * Fix download errors caused by Content-Encoding header (#322) 13 | 14 | 15 | ### [v0.0.41](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.41) (2024-08-13) 16 | 17 | * Keyword arguments for PLAYWRIGHT_PROCESS_REQUEST_HEADERS, pass additional Request data (#303). 18 | Deprecated positional argument handling for the function passed to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS 19 | setting, arguments should now be handled by keyword. 20 | * Retry to create page on browser crash (#305) 21 | * Fix typo in log message (#312) 22 | 23 | 24 | ### [v0.0.40](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.40) (2024-07-16) 25 | 26 | * Enforce asyncio reactor in all platforms (#298) 27 | * Allow multiple handlers in separate thread (#299) 28 | 29 | 30 | ### [v0.0.39](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.39) (2024-07-11) 31 | 32 | * Return proper status and headers for downloads (#293) 33 | * Restart on browser crash (#295) 34 | * Override method and/or body only for the first matching request (#297) 35 | 36 | 37 | ### [v0.0.38](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.38) (2024-07-06) 38 | 39 | * Fix freezing on responses with status 204 (#292) 40 | * Connect to remote browser using BrowserType.connect (#283) 41 | 42 | 43 | ### [v0.0.37](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.37) (2024-07-03) 44 | 45 | * Improve Windows concurrency (#286) 46 | 47 | 48 | ### [v0.0.36](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.36) (2024-06-24) 49 | 50 | * Windows support (#276) 51 | 52 | 53 | ### [v0.0.35](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.35) (2024-06-01) 54 | 55 | * Update exception message check 56 | 57 | 58 | ### [v0.0.34](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.34) (2024-01-01) 59 | 60 | * Update dev status classifier to 4 - beta 61 | * Official Python 3.12 support (#254) 62 | * Custom memusage extension (#257) 63 | 64 | 65 | ### [v0.0.33](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.33) (2023-10-19) 66 | 67 | * Handle downloads as binary responses (#228) 68 | 69 | 70 | ### [v0.0.32](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.32) (2023-09-04) 71 | 72 | * Connect to browser using CDP (#227) 73 | 74 | 75 | ### [v0.0.31](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.31) (2023-08-28) 76 | 77 | * Do not fail when getting referer header for debug log messages (#225) 78 | * Do not override headers with values from asset requests (#226) 79 | 80 | 81 | ### [v0.0.30](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.30) (2023-08-17) 82 | 83 | * Fix page_init_callback duplication (#222) 84 | * Bump minimum Python version from 3.7 to 3.8 (#223) 85 | 86 | 87 | ### [v0.0.29](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.29) (2023-08-11) 88 | 89 | * Set exc_info=True for warning log records (#219) 90 | * Invoke page_init_callback after setting route (#205) 91 | 92 | 93 | ### [v0.0.28](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.28) (2023-08-05) 94 | 95 | * Retry page.content if necessary (#218) 96 | 97 | 98 | ### [v0.0.27](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.27) (2023-07-24) 99 | 100 | * Override method only for navigation requests (#177) 101 | * Pass spider argument to _create_browser_context (#212) 102 | * await AsyncPlaywright.stop on close (#214) 103 | 104 | 105 | ### [v0.0.26](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.26) (2023-02-01) 106 | 107 | * Fix logging (pass extra args instead of updating log record factory) 108 | * Miscellaneous adjustments (naming, typing, etc) 109 | 110 | 111 | ### [v0.0.25](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.25) (2023-01-24) 112 | 113 | * Set spider attribute on log records 114 | 115 | 116 | ### [v0.0.24](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.24) (2022-12-04) 117 | 118 | * Fix request method override 119 | 120 | 121 | ### [v0.0.23](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.23) (2022-11-27) 122 | 123 | * Set redirect request metadata 124 | 125 | 126 | ### [v0.0.22](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.22) (2022-10-09) 127 | 128 | * Remove deprecated code (`PageCoroutine` class, `playwright_page_coroutines` request meta key, 129 | `use_playwright_headers` function). 130 | * `playwright_page_init_callback` meta key (page initialization callback) 131 | 132 | 133 | ### [v0.0.21](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.21) (2022-08-08) 134 | 135 | * Fixed TypeError exception when getting server IP address 136 | 137 | 138 | ### [v0.0.20](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.20) (2022-08-03) 139 | 140 | * Don't raise exceptions if `Page.goto` returns `None` 141 | 142 | 143 | ### [v0.0.19](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.19) (2022-07-17) 144 | 145 | * Add support for `Page.goto` keyword arguments (`playwright_page_goto_kwargs` request meta key) 146 | 147 | 148 | ### [v0.0.18](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.18) (2022-06-18) 149 | 150 | * Always override request headers 151 | 152 | 153 | ### [v0.0.17](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.17) (2022-05-22) 154 | 155 | * Support for persistent contexts 156 | * Limit concurrent context count (`PLAYWRIGHT_MAX_CONTEXTS` setting) 157 | 158 | 159 | ### [v0.0.16](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16) (2022-05-14) 160 | 161 | * Use new headers API introduced in Playwright 1.15 (bump required Playwright version) 162 | * Deprecate `scrapy_playwright.headers.use_playwright_headers`, set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead 163 | 164 | 165 | ### [v0.0.15](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.15) (2022-05-08) 166 | 167 | * Remove deprecated `PLAYWRIGHT_CONTEXT_ARGS` setting 168 | * Warn on failed requests 169 | * `PLAYWRIGHT_ABORT_REQUEST` setting: accept coroutine functions 170 | * `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` setting: accept sync functions to process headers 171 | * Set `playwright_page` request meta key early 172 | 173 | 174 | ### [v0.0.14](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14) (2022-03-26) 175 | 176 | * Renamed `scrapy_playwright.page.PageCoroutine` to `scrapy_playwright.page.PageMethod` 177 | (`PageCoroutine` is now deprecated). Also deprecated the `playwright_page_coroutines` 178 | Request meta key in favor of `playwright_page_methods`. 179 | 180 | 181 | ### [v0.0.13](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13) (2022-03-24) 182 | 183 | * PageCoroutine checks 184 | * Fix encoding detection 185 | * Ability to abort requests via setting 186 | 187 | 188 | ### [v0.0.12](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.12) (2022-03-15) 189 | 190 | * Avoid exceptions during cleanup when the browser could not start 191 | * Warn when non PageCoroutine objects are passed to Request.meta.playwright_page_coroutines 192 | 193 | 194 | ### [v0.0.11](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.11) (2022-03-12) 195 | 196 | * Set the maximum amount of pages per context 197 | * Response.ip_address attribute 198 | * Response security details 199 | 200 | 201 | ### [v0.0.10](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.10) (2022-03-02) 202 | 203 | * Fix response encoding detection 204 | 205 | 206 | ### [v0.0.9](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.9) (2022-01-27) 207 | 208 | * Ability to process request headers 209 | 210 | 211 | ### [v0.0.8](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.8) (2022-01-13) 212 | 213 | * Fix PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT setting (allow zero value) 214 | 215 | 216 | ### [v0.0.7](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.7) (2021-10-20) 217 | 218 | * Log all requests/responses (debug level) 219 | 220 | 221 | ### [v0.0.6](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.6) (2021-10-19) 222 | 223 | * Page event handlers 224 | * Python 3.10 support 225 | * Doc fixes 226 | * Override User-Agent header 227 | 228 | 229 | ### [v0.0.5](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.5) (2021-08-20) 230 | 231 | * Improve garbage collection by removing unnecessary reference 232 | 233 | ### [v0.0.4](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.4) (2021-07-16) 234 | 235 | * Add support for multiple browser contexts ([#13](https://github.com/scrapy-plugins/scrapy-playwright/pull/13)) 236 | * Deprecate `PLAYWRIGHT_CONTEXT_ARGS` setting in favor of `PLAYWRIGHT_CONTEXTS` 237 | 238 | 239 | ### [v0.0.3](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.3) (2021-02-22) 240 | 241 | * Snake case (requires playwright-python >= [v1.8.0a1](https://github.com/microsoft/playwright-python/releases/tag/v1.8.0a1)) 242 | 243 | 244 | ### [v0.0.2](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.2) (2021-01-13) 245 | 246 | * `PLAYWRIGHT_CONTEXT_ARGS` setting (ability to pass keyword arguments to the browser context) 247 | 248 | ### [v0.0.1](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.1) (2020-12-18) 249 | 250 | Initial public release. 251 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | 4 | ## How to use scrapy-playwright with the [CrawlSpider](https://docs.scrapy.org/en/latest/topics/spiders.html#crawlspider)? 5 | 6 | By specifying a `process_request` method that modifies requests in-place in your 7 | [crawling rules](https://docs.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Rule). 8 | For instance: 9 | 10 | ```python 11 | def set_playwright_true(request, response): 12 | request.meta["playwright"] = True 13 | return request 14 | 15 | class MyCrawlSpider(CrawlSpider): 16 | ... 17 | rules = ( 18 | Rule( 19 | link_extractor=LinkExtractor(...), 20 | callback="parse_item", 21 | follow=False, 22 | process_request=set_playwright_true, 23 | ), 24 | ) 25 | ``` 26 | 27 | 28 | ## How to download all requests using scrapy-playwright? 29 | 30 | If you want all requests to be processed by Playwright and don't want to repeat 31 | yourself, or you're using a generic spider that doesn't support request 32 | customization (e.g. `scrapy.spiders.SitemapSpider`), you can use a middleware 33 | to edit the `meta` attribute for all requests. 34 | 35 | Depending on your project and the interactions with other components, you might 36 | decide to use a 37 | [spider middleware](https://docs.scrapy.org/en/latest/topics/spider-middleware.html) 38 | or a 39 | [downloader middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html). 40 | 41 | Spider middleware example: 42 | 43 | ```python 44 | class PlaywrightSpiderMiddleware: 45 | def process_spider_output(self, response, result, spider): 46 | for obj in result: 47 | if isinstance(obj, scrapy.Request): 48 | obj.meta.setdefault("playwright", True) 49 | yield obj 50 | ``` 51 | 52 | Downloader middleware example: 53 | 54 | ```python 55 | class PlaywrightDownloaderMiddleware: 56 | def process_request(self, request, spider): 57 | request.meta.setdefault("playwright", True) 58 | return None 59 | ``` 60 | 61 | 62 | ## How to increase the allowed memory size for the browser? 63 | 64 | If you're seeing messages such as `JavaScript heap out of memory`, there's a 65 | chance you're falling into the scope of 66 | https://github.com/microsoft/playwright/issues/6319. As a workaround, it's 67 | possible to increase the amount of memory allowed for the Node.js process by 68 | specifying a value for the the `--max-old-space-size` V8 option in the 69 | `NODE_OPTIONS` environment variable, e.g.: 70 | 71 | ``` 72 | $ export NODE_OPTIONS=--max-old-space-size=SIZE # in megabytes 73 | ``` 74 | 75 | Sources & further reading: 76 | * https://github.com/scrapy-plugins/scrapy-playwright/issues/19#issuecomment-886211045 77 | * https://github.com/npm/npm/issues/12238#issuecomment-367147962 78 | * https://medium.com/the-node-js-collection/node-options-has-landed-in-8-x-5fba57af703d 79 | * https://nodejs.org/dist/latest-v8.x/docs/api/cli.html#cli_node_options_options 80 | * https://nodejs.org/api/cli.html#cli_max_old_space_size_size_in_megabytes 81 | -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | *.pdf 3 | -------------------------------------------------------------------------------- /examples/books.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | from pathlib import Path 4 | from typing import Generator, Optional 5 | 6 | from playwright.async_api import Page 7 | from scrapy import Spider 8 | from scrapy.http.response import Response 9 | 10 | 11 | class BooksSpider(Spider): 12 | """Extract all books, save screenshots.""" 13 | 14 | name = "books" 15 | custom_settings = { 16 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 17 | "DOWNLOAD_HANDLERS": { 18 | # "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 19 | "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 20 | }, 21 | "CONCURRENT_REQUESTS": 32, 22 | "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 4, 23 | "CLOSESPIDER_ITEMCOUNT": 100, 24 | "FEEDS": { 25 | "books.json": {"format": "json", "encoding": "utf-8", "indent": 4}, 26 | }, 27 | } 28 | start_urls = ["http://books.toscrape.com"] 29 | 30 | def __init__(self, name=None, **kwargs): 31 | super().__init__(name, **kwargs) 32 | logging.getLogger("scrapy.core.engine").setLevel(logging.WARNING) 33 | logging.getLogger("scrapy.core.scraper").setLevel(logging.WARNING) 34 | 35 | def parse(self, response: Response, current_page: Optional[int] = None) -> Generator: 36 | page_count = response.css(".pager .current::text").re_first(r"Page \d+ of (\d+)") 37 | page_count = int(page_count) 38 | for page in range(2, page_count + 1): 39 | yield response.follow(f"/catalogue/page-{page}.html", cb_kwargs={"current_page": page}) 40 | 41 | current_page = current_page or 1 42 | for book in response.css("article.product_pod a"): 43 | yield response.follow( 44 | book, 45 | callback=self.parse_book, 46 | meta={ 47 | "playwright": True, 48 | "playwright_include_page": True, 49 | "playwright_context": f"page-{current_page}", 50 | }, 51 | ) 52 | 53 | async def parse_book(self, response: Response) -> dict: 54 | url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest() 55 | page: Page = response.meta["playwright_page"] 56 | await page.screenshot( 57 | path=Path(__file__).parent / "books" / f"{url_sha256}.png", full_page=True 58 | ) 59 | await page.close() 60 | return { 61 | "url": response.url, 62 | "title": response.css("h1::text").get(), 63 | "price": response.css("p.price_color::text").get(), 64 | "breadcrumbs": response.css(".breadcrumb a::text").getall(), 65 | "image": f"books/{url_sha256}.png", 66 | } 67 | -------------------------------------------------------------------------------- /examples/books/.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | -------------------------------------------------------------------------------- /examples/contexts.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from playwright.async_api import Page 4 | from scrapy import Spider, Request 5 | 6 | 7 | class MultipleContextsSpider(Spider): 8 | """Handle multiple browser contexts""" 9 | 10 | name = "contexts" 11 | custom_settings = { 12 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 13 | "DOWNLOAD_HANDLERS": { 14 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 15 | # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 16 | }, 17 | "PLAYWRIGHT_MAX_CONTEXTS": 6, 18 | "PLAYWRIGHT_CONTEXTS": { 19 | "first": { 20 | "storage_state": { 21 | "cookies": [ 22 | { 23 | "url": "https://example.org", 24 | "name": "context", 25 | "value": "first", 26 | }, 27 | ], 28 | }, 29 | }, 30 | "second": { 31 | "storage_state": { 32 | "cookies": [ 33 | { 34 | "url": "https://example.org", 35 | "name": "context", 36 | "value": "second", 37 | }, 38 | ], 39 | }, 40 | }, 41 | "persistent": { 42 | "user_data_dir": str(Path.home() / "playwright-persistent-context"), 43 | "java_script_enabled": False, 44 | }, 45 | }, 46 | } 47 | 48 | def start_requests(self): 49 | # using existing contexts 50 | for ctx_name in self.custom_settings["PLAYWRIGHT_CONTEXTS"].keys(): 51 | yield Request( 52 | url="https://example.org", 53 | meta={ 54 | "playwright": True, 55 | "playwright_context": ctx_name, 56 | "playwright_include_page": True, 57 | }, 58 | dont_filter=True, 59 | ) 60 | # create a new context 61 | yield Request( 62 | url="https://example.org", 63 | meta={ 64 | "playwright": True, 65 | "playwright_context": "third", 66 | "playwright_context_kwargs": { 67 | "storage_state": { 68 | "cookies": [ 69 | { 70 | "url": "https://example.org", 71 | "name": "context", 72 | "value": "third", 73 | }, 74 | ], 75 | }, 76 | }, 77 | "playwright_include_page": True, 78 | }, 79 | dont_filter=True, 80 | ) 81 | # default context 82 | yield Request( 83 | url="https://example.org", 84 | meta={"playwright": True, "playwright_include_page": True}, 85 | dont_filter=True, 86 | ) 87 | # each request on a different context 88 | for i in range(20): 89 | yield Request( 90 | url=f"https://example.org?foo={i}", 91 | meta={ 92 | "playwright": True, 93 | "playwright_context": f"context-{i}", 94 | "playwright_include_page": True, 95 | }, 96 | dont_filter=True, 97 | ) 98 | 99 | async def parse(self, response, **kwargs): 100 | page: Page = response.meta["playwright_page"] 101 | context_name = response.meta["playwright_context"] 102 | storage_state = await page.context.storage_state() 103 | await page.close() 104 | await page.context.close() 105 | return { 106 | "url": response.url, 107 | "context": context_name, 108 | "cookies": storage_state["cookies"], 109 | } 110 | -------------------------------------------------------------------------------- /examples/download.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from scrapy import Spider, Request 4 | 5 | 6 | class DownloadSpider(Spider): 7 | name = "download" 8 | custom_settings = { 9 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 10 | "DOWNLOAD_HANDLERS": { 11 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 12 | # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 13 | }, 14 | } 15 | 16 | def start_requests(self): 17 | yield Request(url="https://example.org", meta={"playwright": True}) 18 | yield Request( 19 | url="https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", 20 | meta={"playwright": True}, 21 | ) 22 | 23 | def parse(self, response, **kwargs): 24 | if filename := response.meta.get("playwright_suggested_filename"): 25 | (Path(__file__).parent / filename).write_bytes(response.body) 26 | yield { 27 | "url": response.url, 28 | "response_cls": response.__class__.__name__, 29 | "first_bytes": response.body[:60], 30 | "filename": filename, 31 | } 32 | -------------------------------------------------------------------------------- /examples/events.py: -------------------------------------------------------------------------------- 1 | from playwright.async_api import Dialog, Response as PlaywrightResponse 2 | from scrapy import Spider, Request 3 | from scrapy_playwright.page import PageMethod 4 | 5 | 6 | class EventsSpider(Spider): 7 | """Handle page events.""" 8 | 9 | name = "events" 10 | custom_settings = { 11 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 12 | "DOWNLOAD_HANDLERS": { 13 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 14 | # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 15 | }, 16 | } 17 | 18 | def start_requests(self): 19 | yield Request( 20 | url="https://example.org", 21 | meta={ 22 | "playwright": True, 23 | "playwright_page_methods": [ 24 | PageMethod("evaluate", "alert('foobar');"), 25 | ], 26 | "playwright_page_event_handlers": { 27 | "dialog": self.handle_dialog, 28 | "response": "handle_response", 29 | }, 30 | }, 31 | ) 32 | 33 | async def handle_dialog(self, dialog: Dialog) -> None: 34 | self.logger.info(f"Handled dialog with message: {dialog.message}") 35 | await dialog.dismiss() 36 | 37 | async def handle_response(self, response: PlaywrightResponse) -> None: 38 | self.logger.info(f"Received response with URL {response.url}") 39 | 40 | def parse(self, response, **kwargs): 41 | return {"url": response.url} 42 | -------------------------------------------------------------------------------- /examples/exception_errback.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from scrapy import Spider, Request 4 | 5 | 6 | class HandleExceptionInErrbackSpider(Spider): 7 | """Handle exceptions in the Playwright downloader, such as TimeoutError""" 8 | 9 | name = "awesome" 10 | custom_settings = { 11 | "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000, # milliseconds 12 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 13 | "DOWNLOAD_HANDLERS": { 14 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 15 | # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 16 | }, 17 | "RETRY_TIMES": 0, 18 | } 19 | 20 | def start_requests(self): 21 | yield Request( 22 | url="https://httpbin.org/delay/10", 23 | meta={"playwright": True}, 24 | errback=self.errback, 25 | ) 26 | 27 | def errback(self, failure): 28 | logging.info( 29 | "Handling failure in errback, request=%r, exception=%r", failure.request, failure.value 30 | ) 31 | -------------------------------------------------------------------------------- /examples/exception_middleware.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | 4 | from scrapy import Spider, Request 5 | from scrapy_playwright.page import PageMethod 6 | 7 | 8 | class HandleTimeoutMiddleware: 9 | def process_exception(self, request, exception, spider): 10 | new_url = "https://httpbin.org/get" 11 | logging.info( 12 | "Caught exception: %s for request %s, recovering to %s", 13 | exception.__class__, 14 | request, 15 | new_url, 16 | ) 17 | return Request( 18 | url=new_url, 19 | meta={ 20 | "playwright": True, 21 | "playwright_page_methods": [ 22 | PageMethod( 23 | "screenshot", path=Path(__file__).parent / "recovered.png", full_page=True 24 | ), 25 | ], 26 | }, 27 | ) 28 | 29 | 30 | class HandleExceptionInMiddlewareSpider(Spider): 31 | """Handle exceptions in the Playwright downloader, such as TimeoutError""" 32 | 33 | name = "awesome" 34 | custom_settings = { 35 | "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000, # milliseconds 36 | "DOWNLOADER_MIDDLEWARES": { 37 | HandleTimeoutMiddleware: 100, 38 | }, 39 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 40 | "DOWNLOAD_HANDLERS": { 41 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 42 | # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 43 | }, 44 | "RETRY_TIMES": 0, 45 | } 46 | 47 | def start_requests(self): 48 | yield Request( 49 | url="https://httpbin.org/delay/10", 50 | meta={"playwright": True}, 51 | ) 52 | 53 | def parse(self, response, **kwargs): 54 | logging.info("Received response for %s", response.url) 55 | yield {"url": response.url} 56 | -------------------------------------------------------------------------------- /examples/headers.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from scrapy import Spider, Request 5 | from scrapy_playwright.page import PageMethod 6 | 7 | 8 | class HeadersSpider(Spider): 9 | """Control how requests headers are handled via the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting. 10 | 11 | If PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None, neither USER_AGENT nor cookies will be sent to the 12 | website, comment out PLAYWRIGHT_PROCESS_REQUEST_HEADERS to sent them. 13 | """ 14 | 15 | name = "headers" 16 | custom_settings = { 17 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 18 | "DOWNLOAD_HANDLERS": { 19 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 20 | # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 21 | }, 22 | "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None, 23 | "USER_AGENT": "Overridden user agent", 24 | } 25 | 26 | def start_requests(self): 27 | yield Request( 28 | url="https://httpbin.org/headers", 29 | meta={ 30 | "playwright": True, 31 | "playwright_page_methods": [ 32 | PageMethod( 33 | "screenshot", path=Path(__file__).parent / "headers.png", full_page=True 34 | ), 35 | ], 36 | }, 37 | cookies={"foo": "bar"}, 38 | ) 39 | 40 | def parse(self, response, **kwargs): 41 | headers = json.loads(response.css("pre::text").get())["headers"] 42 | yield {"url": response.url, "headers": headers} 43 | -------------------------------------------------------------------------------- /examples/init_page.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import scrapy 4 | 5 | 6 | async def init_page(page, request): 7 | await page.set_extra_http_headers({"Asdf": "Qwerty"}) 8 | 9 | 10 | class InitPageSpider(scrapy.Spider): 11 | """A spider that initializes pages upon creation.""" 12 | 13 | name = "init_page" 14 | custom_settings = { 15 | "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None, # needed to keep playwright headers 16 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 17 | "DOWNLOAD_HANDLERS": { 18 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 19 | }, 20 | } 21 | 22 | def start_requests(self): 23 | yield scrapy.Request( 24 | url="https://httpbin.org/headers", 25 | meta={ 26 | "playwright": True, 27 | "playwright_page_init_callback": init_page, 28 | }, 29 | ) 30 | 31 | def parse(self, response, **kwargs): 32 | json_str = response.css("pre::text").get() 33 | print(json_str) 34 | return {"data": json.loads(json_str)} 35 | -------------------------------------------------------------------------------- /examples/max_pages.py: -------------------------------------------------------------------------------- 1 | from playwright.async_api import Page 2 | from scrapy import Spider, Request 3 | 4 | 5 | class MaxPagesPerContextContextsSpider(Spider): 6 | """Limit pages by context""" 7 | 8 | name = "contexts" 9 | custom_settings = { 10 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 11 | "DOWNLOAD_HANDLERS": { 12 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 13 | # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 14 | }, 15 | "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 2, 16 | "PLAYWRIGHT_CONTEXTS": { 17 | "a": {"java_script_enabled": True}, 18 | "b": {"java_script_enabled": True}, 19 | }, 20 | } 21 | 22 | def start_requests(self): 23 | for _ in range(20): 24 | yield Request( 25 | url="https://httpbin.org/status?n=404", 26 | meta={ 27 | "playwright": True, 28 | "playwright_context": "a", 29 | "playwright_include_page": True, 30 | }, 31 | dont_filter=True, 32 | errback=self.errback, 33 | ) 34 | for i in range(20): 35 | yield Request( 36 | url=f"https://httpbin.org/get?a={i}", 37 | meta={"playwright": True, "playwright_context": "a"}, 38 | ) 39 | for i in range(20): 40 | yield Request( 41 | url=f"https://httpbin.org/get?b={i}", 42 | meta={"playwright": True, "playwright_context": "b"}, 43 | ) 44 | 45 | def parse(self, response, **kwargs): 46 | return {"url": response.url} 47 | 48 | async def errback(self, failure): 49 | page: Page = failure.request.meta["playwright_page"] 50 | await page.close() 51 | -------------------------------------------------------------------------------- /examples/post.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from scrapy import Spider, FormRequest 4 | from scrapy_playwright.page import PageMethod 5 | 6 | 7 | class PostSpider(Spider): 8 | """Send data using the POST verb.""" 9 | 10 | name = "post" 11 | custom_settings = { 12 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 13 | "DOWNLOAD_HANDLERS": { 14 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 15 | # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 16 | }, 17 | } 18 | 19 | def start_requests(self): 20 | yield FormRequest( 21 | url="https://httpbin.org/post", 22 | formdata={"foo": "bar"}, 23 | meta={ 24 | "playwright": True, 25 | "playwright_page_methods": [ 26 | PageMethod( 27 | "screenshot", path=Path(__file__).parent / "post.png", full_page=True 28 | ), 29 | ], 30 | }, 31 | ) 32 | 33 | def parse(self, response, **kwargs): 34 | yield {"url": response.url} 35 | -------------------------------------------------------------------------------- /examples/scroll.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from scrapy import Spider, Request 4 | from scrapy_playwright.page import PageMethod 5 | 6 | 7 | class ScrollSpider(Spider): 8 | """Scroll down on an infinite-scroll page.""" 9 | 10 | name = "scroll" 11 | custom_settings = { 12 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 13 | "DOWNLOAD_HANDLERS": { 14 | # "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 15 | "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 16 | }, 17 | "LOG_LEVEL": "INFO", 18 | } 19 | 20 | def start_requests(self): 21 | yield Request( 22 | url="http://quotes.toscrape.com/scroll", 23 | cookies={"foo": "bar", "asdf": "qwerty"}, 24 | meta={ 25 | "playwright": True, 26 | "playwright_page_methods": [ 27 | PageMethod("wait_for_selector", "div.quote"), 28 | PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"), 29 | PageMethod("wait_for_selector", "div.quote:nth-child(11)"), # 10 per page 30 | PageMethod( 31 | "screenshot", path=Path(__file__).parent / "scroll.png", full_page=True 32 | ), 33 | ], 34 | }, 35 | ) 36 | 37 | def parse(self, response, **kwargs): 38 | return {"url": response.url, "count": len(response.css("div.quote"))} 39 | -------------------------------------------------------------------------------- /examples/storage.py: -------------------------------------------------------------------------------- 1 | from playwright.async_api import Page 2 | from scrapy import Spider, Request 3 | from scrapy_playwright.page import PageMethod 4 | 5 | 6 | class StorageSpider(Spider): 7 | """Set and get storage state, get the server's IP address.""" 8 | 9 | name = "storage" 10 | custom_settings = { 11 | "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", 12 | "DOWNLOAD_HANDLERS": { 13 | "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 14 | # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", 15 | }, 16 | } 17 | 18 | def start_requests(self): 19 | yield Request( 20 | url="https://example.org", 21 | meta={ 22 | "playwright": True, 23 | "playwright_include_page": True, 24 | "playwright_page_methods": [ 25 | PageMethod("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"), 26 | ], 27 | }, 28 | ) 29 | 30 | async def parse(self, response, **kwargs): 31 | page: Page = response.meta["playwright_page"] 32 | storage_state = await page.context.storage_state() 33 | await page.close() 34 | return { 35 | "url": response.url, 36 | "storage_state": storage_state, 37 | "ip_address": response.ip_address, 38 | } 39 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable= 3 | attribute-defined-outside-init, 4 | broad-except, 5 | invalid-name, 6 | missing-class-docstring, 7 | missing-function-docstring, 8 | missing-module-docstring, 9 | too-few-public-methods, 10 | too-many-arguments, 11 | too-many-instance-attributes, 12 | # tests 13 | duplicate-code, 14 | import-outside-toplevel, 15 | protected-access, 16 | too-many-public-methods, 17 | unnecessary-dunder-call, 18 | 19 | 20 | [FORMAT] 21 | expected-line-ending-format=LF 22 | max-line-length=99 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 99 3 | -------------------------------------------------------------------------------- /scrapy_playwright/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.43" 2 | -------------------------------------------------------------------------------- /scrapy_playwright/_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import platform 4 | import threading 5 | from typing import Awaitable, Dict, Iterator, Optional, Tuple, Union 6 | 7 | import scrapy 8 | from playwright.async_api import Error, Page, Request, Response 9 | from scrapy.http.headers import Headers 10 | from scrapy.settings import Settings 11 | from scrapy.utils.python import to_unicode 12 | from twisted.internet.defer import Deferred 13 | from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding 14 | 15 | 16 | logger = logging.getLogger("scrapy-playwright") 17 | 18 | 19 | async def _maybe_await(obj): 20 | if isinstance(obj, Awaitable): 21 | return await obj 22 | return obj 23 | 24 | 25 | def _possible_encodings(headers: Headers, text: str) -> Iterator[str]: 26 | if headers.get("content-type"): 27 | content_type = to_unicode(headers["content-type"]) 28 | yield http_content_type_encoding(content_type) 29 | yield html_body_declared_encoding(text) 30 | 31 | 32 | def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]: 33 | for encoding in filter(None, _possible_encodings(headers, text)): 34 | try: 35 | body = text.encode(encoding) 36 | except UnicodeEncodeError: 37 | pass 38 | else: 39 | return body, encoding 40 | return text.encode("utf-8"), "utf-8" # fallback 41 | 42 | 43 | def _is_safe_close_error(error: Error) -> bool: 44 | """ 45 | Taken almost verbatim from 46 | https://github.com/microsoft/playwright-python/blob/v1.20.0/playwright/_impl/_helper.py#L234-L238 47 | """ 48 | message = str(error) 49 | return message.endswith("Browser has been closed") or message.endswith( 50 | "Target page, context or browser has been closed" 51 | ) 52 | 53 | 54 | _NAVIGATION_ERROR_MSG = ( 55 | "Unable to retrieve content because the page is navigating and changing the content." 56 | ) 57 | 58 | 59 | async def _get_page_content( 60 | page: Page, 61 | spider: scrapy.Spider, 62 | context_name: str, 63 | scrapy_request_url: str, 64 | scrapy_request_method: str, 65 | ) -> str: 66 | """Wrapper around Page.content to retry if necessary. 67 | Arguments other than the page are only for logging. 68 | """ 69 | try: 70 | return await page.content() 71 | except Error as err: 72 | if _NAVIGATION_ERROR_MSG in err.message: 73 | logger.debug( 74 | "Retrying to get content from page '%s', error: '%s'", 75 | page.url, 76 | _NAVIGATION_ERROR_MSG, 77 | extra={ 78 | "spider": spider, 79 | "context_name": context_name, 80 | "scrapy_request_url": scrapy_request_url, 81 | "scrapy_request_method": scrapy_request_method, 82 | "playwright_page_url": page.url, 83 | }, 84 | ) 85 | return await page.content() 86 | raise 87 | 88 | 89 | def _get_float_setting(settings: Settings, key: str) -> Optional[float]: 90 | try: 91 | return float(settings[key]) 92 | except Exception: 93 | return None 94 | 95 | 96 | async def _get_header_value( 97 | resource: Union[Request, Response], 98 | header_name: str, 99 | ) -> Optional[str]: 100 | try: 101 | return await resource.header_value(header_name) 102 | except Exception: 103 | return None 104 | 105 | 106 | class _ThreadedLoopAdapter: 107 | """Utility class to start an asyncio event loop in a new thread and redirect coroutines. 108 | This allows to run Playwright in a different loop than the Scrapy crawler, allowing to 109 | use ProactorEventLoop which is supported by Playwright on Windows. 110 | """ 111 | 112 | _loop: asyncio.AbstractEventLoop 113 | _thread: threading.Thread 114 | _coro_queue: asyncio.Queue = asyncio.Queue() 115 | _stop_events: Dict[int, asyncio.Event] = {} 116 | 117 | @classmethod 118 | async def _handle_coro(cls, coro, future) -> None: 119 | try: 120 | future.set_result(await coro) 121 | except Exception as exc: 122 | future.set_exception(exc) 123 | 124 | @classmethod 125 | async def _process_queue(cls) -> None: 126 | while any(not ev.is_set() for ev in cls._stop_events.values()): 127 | coro, future = await cls._coro_queue.get() 128 | asyncio.create_task(cls._handle_coro(coro, future)) 129 | cls._coro_queue.task_done() 130 | 131 | @classmethod 132 | def _deferred_from_coro(cls, coro) -> Deferred: 133 | future: asyncio.Future = asyncio.Future() 134 | asyncio.run_coroutine_threadsafe(cls._coro_queue.put((coro, future)), cls._loop) 135 | return scrapy.utils.defer.deferred_from_coro(future) 136 | 137 | @classmethod 138 | def start(cls, caller_id: int) -> None: 139 | cls._stop_events[caller_id] = asyncio.Event() 140 | if not getattr(cls, "_loop", None): 141 | policy = asyncio.DefaultEventLoopPolicy() 142 | if platform.system() == "Windows": 143 | policy = asyncio.WindowsProactorEventLoopPolicy() # type: ignore[attr-defined] 144 | cls._loop = policy.new_event_loop() 145 | asyncio.set_event_loop(cls._loop) 146 | 147 | if not getattr(cls, "_thread", None): 148 | cls._thread = threading.Thread(target=cls._loop.run_forever, daemon=True) 149 | cls._thread.start() 150 | logger.info("Started loop on separate thread: %s", cls._loop) 151 | asyncio.run_coroutine_threadsafe(cls._process_queue(), cls._loop) 152 | 153 | @classmethod 154 | def stop(cls, caller_id: int) -> None: 155 | """Wait until all handlers are closed to stop the event loop and join the thread.""" 156 | cls._stop_events[caller_id].set() 157 | if all(ev.is_set() for ev in cls._stop_events.values()): 158 | asyncio.run_coroutine_threadsafe(cls._coro_queue.join(), cls._loop) 159 | cls._loop.call_soon_threadsafe(cls._loop.stop) 160 | cls._thread.join() 161 | -------------------------------------------------------------------------------- /scrapy_playwright/handler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | import logging 4 | import platform 5 | import warnings 6 | from contextlib import suppress 7 | from dataclasses import dataclass, field as dataclass_field 8 | from functools import partial 9 | from ipaddress import ip_address 10 | from time import time 11 | from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union 12 | 13 | from playwright._impl._errors import TargetClosedError 14 | from playwright.async_api import ( 15 | BrowserContext, 16 | BrowserType, 17 | Download as PlaywrightDownload, 18 | Error as PlaywrightError, 19 | Page, 20 | Playwright as AsyncPlaywright, 21 | PlaywrightContextManager, 22 | Request as PlaywrightRequest, 23 | Response as PlaywrightResponse, 24 | Route, 25 | ) 26 | from scrapy import Spider, signals 27 | from scrapy.core.downloader.handlers.http import HTTPDownloadHandler 28 | from scrapy.crawler import Crawler 29 | from scrapy.exceptions import NotSupported, ScrapyDeprecationWarning 30 | from scrapy.http import Request, Response 31 | from scrapy.http.headers import Headers 32 | from scrapy.responsetypes import responsetypes 33 | from scrapy.settings import Settings 34 | from scrapy.utils.defer import deferred_from_coro 35 | from scrapy.utils.misc import load_object 36 | from scrapy.utils.reactor import verify_installed_reactor 37 | from twisted.internet.defer import Deferred, inlineCallbacks 38 | 39 | from scrapy_playwright.headers import use_scrapy_headers 40 | from scrapy_playwright.page import PageMethod 41 | from scrapy_playwright._utils import ( 42 | _ThreadedLoopAdapter, 43 | _encode_body, 44 | _get_float_setting, 45 | _get_header_value, 46 | _get_page_content, 47 | _is_safe_close_error, 48 | _maybe_await, 49 | ) 50 | 51 | 52 | __all__ = ["ScrapyPlaywrightDownloadHandler"] 53 | 54 | 55 | PlaywrightHandler = TypeVar("PlaywrightHandler", bound="ScrapyPlaywrightDownloadHandler") 56 | 57 | 58 | logger = logging.getLogger("scrapy-playwright") 59 | 60 | 61 | DEFAULT_BROWSER_TYPE = "chromium" 62 | DEFAULT_CONTEXT_NAME = "default" 63 | PERSISTENT_CONTEXT_PATH_KEY = "user_data_dir" 64 | 65 | 66 | @dataclass 67 | class BrowserContextWrapper: 68 | context: BrowserContext 69 | semaphore: asyncio.Semaphore 70 | persistent: bool 71 | 72 | 73 | @dataclass 74 | class Download: 75 | body: bytes = b"" 76 | url: str = "" 77 | suggested_filename: str = "" 78 | exception: Optional[Exception] = None 79 | response_status: int = 200 80 | headers: dict = dataclass_field(default_factory=dict) 81 | 82 | def __bool__(self) -> bool: 83 | return bool(self.body) or bool(self.exception) 84 | 85 | 86 | @dataclass 87 | class Config: 88 | cdp_url: Optional[str] 89 | cdp_kwargs: dict 90 | connect_url: Optional[str] 91 | connect_kwargs: dict 92 | browser_type_name: str 93 | launch_options: dict 94 | max_pages_per_context: int 95 | max_contexts: Optional[int] 96 | startup_context_kwargs: dict 97 | navigation_timeout: Optional[float] 98 | restart_disconnected_browser: bool 99 | target_closed_max_retries: int = 3 100 | use_threaded_loop: bool = False 101 | 102 | @classmethod 103 | def from_settings(cls, settings: Settings) -> "Config": 104 | if settings.get("PLAYWRIGHT_CDP_URL") and settings.get("PLAYWRIGHT_CONNECT_URL"): 105 | msg = "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported" 106 | logger.error(msg) 107 | raise NotSupported(msg) 108 | cfg = cls( 109 | cdp_url=settings.get("PLAYWRIGHT_CDP_URL"), 110 | cdp_kwargs=settings.getdict("PLAYWRIGHT_CDP_KWARGS") or {}, 111 | connect_url=settings.get("PLAYWRIGHT_CONNECT_URL"), 112 | connect_kwargs=settings.getdict("PLAYWRIGHT_CONNECT_KWARGS") or {}, 113 | browser_type_name=settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE, 114 | launch_options=settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {}, 115 | max_pages_per_context=settings.getint("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"), 116 | max_contexts=settings.getint("PLAYWRIGHT_MAX_CONTEXTS") or None, 117 | startup_context_kwargs=settings.getdict("PLAYWRIGHT_CONTEXTS"), 118 | navigation_timeout=_get_float_setting( 119 | settings, "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" 120 | ), 121 | restart_disconnected_browser=settings.getbool( 122 | "PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER", default=True 123 | ), 124 | use_threaded_loop=platform.system() == "Windows" 125 | or settings.getbool("_PLAYWRIGHT_THREADED_LOOP", False), 126 | ) 127 | cfg.cdp_kwargs.pop("endpoint_url", None) 128 | cfg.connect_kwargs.pop("ws_endpoint", None) 129 | if not cfg.max_pages_per_context: 130 | cfg.max_pages_per_context = settings.getint("CONCURRENT_REQUESTS") 131 | if (cfg.cdp_url or cfg.connect_url) and cfg.launch_options: 132 | logger.warning("Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS") 133 | return cfg 134 | 135 | 136 | class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler): 137 | playwright_context_manager: Optional[PlaywrightContextManager] = None 138 | playwright: Optional[AsyncPlaywright] = None 139 | 140 | def __init__(self, crawler: Crawler) -> None: 141 | super().__init__(settings=crawler.settings, crawler=crawler) 142 | verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor") 143 | crawler.signals.connect(self._engine_started, signals.engine_started) 144 | self.stats = crawler.stats 145 | self.config = Config.from_settings(crawler.settings) 146 | 147 | if self.config.use_threaded_loop: 148 | _ThreadedLoopAdapter.start(id(self)) 149 | 150 | self.browser_launch_lock = asyncio.Lock() 151 | self.context_launch_lock = asyncio.Lock() 152 | self.context_wrappers: Dict[str, BrowserContextWrapper] = {} 153 | if self.config.max_contexts: 154 | self.context_semaphore = asyncio.Semaphore(value=self.config.max_contexts) 155 | 156 | # headers 157 | if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings: 158 | if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None: 159 | self.process_request_headers = None 160 | else: 161 | self.process_request_headers = load_object( 162 | crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] 163 | ) 164 | else: 165 | self.process_request_headers = use_scrapy_headers 166 | 167 | self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None 168 | if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"): 169 | self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"]) 170 | 171 | @classmethod 172 | def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler: 173 | return cls(crawler) 174 | 175 | def _deferred_from_coro(self, coro: Awaitable) -> Deferred: 176 | if self.config.use_threaded_loop: 177 | return _ThreadedLoopAdapter._deferred_from_coro(coro) 178 | return deferred_from_coro(coro) 179 | 180 | def _engine_started(self) -> Deferred: 181 | """Launch the browser. Use the engine_started signal as it supports returning deferreds.""" 182 | return self._deferred_from_coro(self._launch()) 183 | 184 | async def _launch(self) -> None: 185 | """Launch Playwright manager and configured startup context(s).""" 186 | logger.info("Starting download handler") 187 | self.playwright_context_manager = PlaywrightContextManager() 188 | self.playwright = await self.playwright_context_manager.start() 189 | self.browser_type: BrowserType = getattr(self.playwright, self.config.browser_type_name) 190 | if self.config.startup_context_kwargs: 191 | logger.info("Launching %i startup context(s)", len(self.config.startup_context_kwargs)) 192 | await asyncio.gather( 193 | *[ 194 | self._create_browser_context(name=name, context_kwargs=kwargs) 195 | for name, kwargs in self.config.startup_context_kwargs.items() 196 | ] 197 | ) 198 | self._set_max_concurrent_context_count() 199 | logger.info("Startup context(s) launched") 200 | self.stats.set_value("playwright/page_count", self._get_total_page_count()) 201 | 202 | async def _maybe_launch_browser(self) -> None: 203 | async with self.browser_launch_lock: 204 | if not hasattr(self, "browser"): 205 | logger.info("Launching browser %s", self.browser_type.name) 206 | self.browser = await self.browser_type.launch(**self.config.launch_options) 207 | logger.info("Browser %s launched", self.browser_type.name) 208 | self.stats.inc_value("playwright/browser_count") 209 | self.browser.on("disconnected", self._browser_disconnected_callback) 210 | 211 | async def _maybe_connect_remote_devtools(self) -> None: 212 | async with self.browser_launch_lock: 213 | if not hasattr(self, "browser"): 214 | logger.info("Connecting using CDP: %s", self.config.cdp_url) 215 | self.browser = await self.browser_type.connect_over_cdp( 216 | self.config.cdp_url, **self.config.cdp_kwargs 217 | ) 218 | logger.info("Connected using CDP: %s", self.config.cdp_url) 219 | self.stats.inc_value("playwright/browser_count") 220 | self.browser.on("disconnected", self._browser_disconnected_callback) 221 | 222 | async def _maybe_connect_remote(self) -> None: 223 | async with self.browser_launch_lock: 224 | if not hasattr(self, "browser"): 225 | logger.info("Connecting to remote Playwright") 226 | self.browser = await self.browser_type.connect( 227 | self.config.connect_url, **self.config.connect_kwargs 228 | ) 229 | logger.info("Connected to remote Playwright") 230 | self.stats.inc_value("playwright/browser_count") 231 | self.browser.on("disconnected", self._browser_disconnected_callback) 232 | 233 | async def _create_browser_context( 234 | self, 235 | name: str, 236 | context_kwargs: Optional[dict], 237 | spider: Optional[Spider] = None, 238 | ) -> BrowserContextWrapper: 239 | """Create a new context, also launching a local browser or connecting 240 | to a remote one if necessary. 241 | """ 242 | if hasattr(self, "context_semaphore"): 243 | await self.context_semaphore.acquire() 244 | context_kwargs = context_kwargs or {} 245 | persistent = remote = False 246 | if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY): 247 | context = await self.browser_type.launch_persistent_context(**context_kwargs) 248 | persistent = True 249 | elif self.config.cdp_url: 250 | await self._maybe_connect_remote_devtools() 251 | context = await self.browser.new_context(**context_kwargs) 252 | remote = True 253 | elif self.config.connect_url: 254 | await self._maybe_connect_remote() 255 | context = await self.browser.new_context(**context_kwargs) 256 | remote = True 257 | else: 258 | await self._maybe_launch_browser() 259 | context = await self.browser.new_context(**context_kwargs) 260 | 261 | context.on( 262 | "close", self._make_close_browser_context_callback(name, persistent, remote, spider) 263 | ) 264 | self.stats.inc_value("playwright/context_count") 265 | self.stats.inc_value(f"playwright/context_count/persistent/{persistent}") 266 | self.stats.inc_value(f"playwright/context_count/remote/{remote}") 267 | logger.debug( 268 | "Browser context started: '%s' (persistent=%s, remote=%s)", 269 | name, 270 | persistent, 271 | remote, 272 | extra={ 273 | "spider": spider, 274 | "context_name": name, 275 | "persistent": persistent, 276 | "remote": remote, 277 | }, 278 | ) 279 | if self.config.navigation_timeout is not None: 280 | context.set_default_navigation_timeout(self.config.navigation_timeout) 281 | self.context_wrappers[name] = BrowserContextWrapper( 282 | context=context, 283 | semaphore=asyncio.Semaphore(value=self.config.max_pages_per_context), 284 | persistent=persistent, 285 | ) 286 | self._set_max_concurrent_context_count() 287 | return self.context_wrappers[name] 288 | 289 | async def _create_page(self, request: Request, spider: Spider) -> Page: 290 | """Create a new page in a context, also creating a new context if necessary.""" 291 | context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME) 292 | # this block needs to be locked because several attempts to launch a context 293 | # with the same name could happen at the same time from different requests 294 | async with self.context_launch_lock: 295 | ctx_wrapper = self.context_wrappers.get(context_name) 296 | if ctx_wrapper is None: 297 | ctx_wrapper = await self._create_browser_context( 298 | name=context_name, 299 | context_kwargs=request.meta.get("playwright_context_kwargs"), 300 | spider=spider, 301 | ) 302 | 303 | await ctx_wrapper.semaphore.acquire() 304 | page = await ctx_wrapper.context.new_page() 305 | self.stats.inc_value("playwright/page_count") 306 | total_page_count = self._get_total_page_count() 307 | logger.debug( 308 | "[Context=%s] New page created, page count is %i (%i for all contexts)", 309 | context_name, 310 | len(ctx_wrapper.context.pages), 311 | total_page_count, 312 | extra={ 313 | "spider": spider, 314 | "context_name": context_name, 315 | "context_page_count": len(ctx_wrapper.context.pages), 316 | "total_page_count": total_page_count, 317 | "scrapy_request_url": request.url, 318 | "scrapy_request_method": request.method, 319 | }, 320 | ) 321 | self._set_max_concurrent_page_count() 322 | if self.config.navigation_timeout is not None: 323 | page.set_default_navigation_timeout(self.config.navigation_timeout) 324 | 325 | page.on("close", self._make_close_page_callback(context_name)) 326 | page.on("crash", self._make_close_page_callback(context_name)) 327 | page.on("request", self._increment_request_stats) 328 | page.on("response", self._increment_response_stats) 329 | if logger.getEffectiveLevel() <= logging.DEBUG: 330 | page.on("request", _make_request_logger(context_name, spider)) 331 | page.on("response", _make_response_logger(context_name, spider)) 332 | 333 | return page 334 | 335 | def _get_total_page_count(self): 336 | return sum(len(ctx.context.pages) for ctx in self.context_wrappers.values()) 337 | 338 | def _set_max_concurrent_page_count(self): 339 | count = self._get_total_page_count() 340 | current_max_count = self.stats.get_value("playwright/page_count/max_concurrent") 341 | if current_max_count is None or count > current_max_count: 342 | self.stats.set_value("playwright/page_count/max_concurrent", count) 343 | 344 | def _set_max_concurrent_context_count(self): 345 | current_max_count = self.stats.get_value("playwright/context_count/max_concurrent") 346 | if current_max_count is None or len(self.context_wrappers) > current_max_count: 347 | self.stats.set_value( 348 | "playwright/context_count/max_concurrent", len(self.context_wrappers) 349 | ) 350 | 351 | @inlineCallbacks 352 | def close(self) -> Deferred: 353 | logger.info("Closing download handler") 354 | yield super().close() 355 | yield self._deferred_from_coro(self._close()) 356 | if self.config.use_threaded_loop: 357 | _ThreadedLoopAdapter.stop(id(self)) 358 | 359 | async def _close(self) -> None: 360 | with suppress(TargetClosedError): 361 | await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()]) 362 | self.context_wrappers.clear() 363 | if hasattr(self, "browser"): 364 | logger.info("Closing browser") 365 | await self.browser.close() 366 | if self.playwright_context_manager: 367 | await self.playwright_context_manager.__aexit__() 368 | if self.playwright: 369 | await self.playwright.stop() 370 | 371 | def download_request(self, request: Request, spider: Spider) -> Deferred: 372 | if request.meta.get("playwright"): 373 | return self._deferred_from_coro(self._download_request(request, spider)) 374 | return super().download_request(request, spider) 375 | 376 | async def _download_request(self, request: Request, spider: Spider) -> Response: 377 | counter = 0 378 | while True: 379 | try: 380 | return await self._download_request_with_retry(request=request, spider=spider) 381 | except TargetClosedError as ex: 382 | counter += 1 383 | if counter > self.config.target_closed_max_retries: 384 | raise ex 385 | logger.debug( 386 | "Target closed, retrying to create page for %s", 387 | request, 388 | extra={ 389 | "spider": spider, 390 | "scrapy_request_url": request.url, 391 | "scrapy_request_method": request.method, 392 | "exception": ex, 393 | }, 394 | ) 395 | 396 | async def _download_request_with_retry(self, request: Request, spider: Spider) -> Response: 397 | page = request.meta.get("playwright_page") 398 | if not isinstance(page, Page) or page.is_closed(): 399 | page = await self._create_page(request=request, spider=spider) 400 | context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME) 401 | 402 | _attach_page_event_handlers( 403 | page=page, request=request, spider=spider, context_name=context_name 404 | ) 405 | 406 | # We need to identify the Playwright request that matches the Scrapy request 407 | # in order to override method and body if necessary. 408 | # Checking the URL and Request.is_navigation_request() is not enough, e.g. 409 | # requests produced by submitting forms can produce false positives. 410 | # Let's track only the first request that matches the above conditions. 411 | initial_request_done = asyncio.Event() 412 | 413 | await page.unroute("**") 414 | await page.route( 415 | "**", 416 | self._make_request_handler( 417 | context_name=context_name, 418 | method=request.method, 419 | url=request.url, 420 | headers=request.headers, 421 | body=request.body, 422 | encoding=request.encoding, 423 | spider=spider, 424 | initial_request_done=initial_request_done, 425 | ), 426 | ) 427 | 428 | await _maybe_execute_page_init_callback( 429 | page=page, request=request, context_name=context_name, spider=spider 430 | ) 431 | 432 | try: 433 | return await self._download_request_with_page(request, page, spider) 434 | except Exception as ex: 435 | if not request.meta.get("playwright_include_page") and not page.is_closed(): 436 | logger.warning( 437 | "Closing page due to failed request: %s exc_type=%s exc_msg=%s", 438 | request, 439 | type(ex), 440 | str(ex), 441 | extra={ 442 | "spider": spider, 443 | "context_name": context_name, 444 | "scrapy_request_url": request.url, 445 | "scrapy_request_method": request.method, 446 | "exception": ex, 447 | }, 448 | exc_info=True, 449 | ) 450 | await page.close() 451 | self.stats.inc_value("playwright/page_count/closed") 452 | raise 453 | 454 | async def _download_request_with_page( 455 | self, request: Request, page: Page, spider: Spider 456 | ) -> Response: 457 | # set this early to make it available in errbacks even if something fails 458 | if request.meta.get("playwright_include_page"): 459 | request.meta["playwright_page"] = page 460 | 461 | start_time = time() 462 | response, download = await self._get_response_and_download(request, page, spider) 463 | if isinstance(response, PlaywrightResponse): 464 | await _set_redirect_meta(request=request, response=response) 465 | headers = Headers(await response.all_headers()) 466 | headers.pop("Content-Encoding", None) 467 | elif not download: 468 | logger.warning( 469 | "Navigating to %s returned None, the response" 470 | " will have empty headers and status 200", 471 | request, 472 | extra={ 473 | "spider": spider, 474 | "context_name": request.meta.get("playwright_context"), 475 | "scrapy_request_url": request.url, 476 | "scrapy_request_method": request.method, 477 | }, 478 | ) 479 | headers = Headers() 480 | 481 | await self._apply_page_methods(page, request, spider) 482 | body_str = await _get_page_content( 483 | page=page, 484 | spider=spider, 485 | context_name=request.meta.get("playwright_context"), 486 | scrapy_request_url=request.url, 487 | scrapy_request_method=request.method, 488 | ) 489 | request.meta["download_latency"] = time() - start_time 490 | 491 | server_ip_address = None 492 | if response is not None: 493 | request.meta["playwright_security_details"] = await response.security_details() 494 | with suppress(KeyError, TypeError, ValueError): 495 | server_addr = await response.server_addr() 496 | server_ip_address = ip_address(server_addr["ipAddress"]) 497 | 498 | if download and download.exception: 499 | raise download.exception 500 | 501 | if not request.meta.get("playwright_include_page"): 502 | await page.close() 503 | self.stats.inc_value("playwright/page_count/closed") 504 | 505 | if download: 506 | request.meta["playwright_suggested_filename"] = download.suggested_filename 507 | respcls = responsetypes.from_args(url=download.url, body=download.body) 508 | download_headers = Headers(download.headers) 509 | download_headers.pop("Content-Encoding", None) 510 | return respcls( 511 | url=download.url, 512 | status=download.response_status, 513 | headers=download_headers, 514 | body=download.body, 515 | request=request, 516 | flags=["playwright"], 517 | ) 518 | 519 | body, encoding = _encode_body(headers=headers, text=body_str) 520 | respcls = responsetypes.from_args(headers=headers, url=page.url, body=body) 521 | return respcls( 522 | url=page.url, 523 | status=response.status if response is not None else 200, 524 | headers=headers, 525 | body=body, 526 | request=request, 527 | flags=["playwright"], 528 | encoding=encoding, 529 | ip_address=server_ip_address, 530 | ) 531 | 532 | async def _get_response_and_download( 533 | self, request: Request, page: Page, spider: Spider 534 | ) -> Tuple[Optional[PlaywrightResponse], Optional[Download]]: 535 | response: Optional[PlaywrightResponse] = None 536 | download: Download = Download() # updated in-place in _handle_download 537 | download_started = asyncio.Event() 538 | download_ready = asyncio.Event() 539 | 540 | async def _handle_download(dwnld: PlaywrightDownload) -> None: 541 | download_started.set() 542 | self.stats.inc_value("playwright/download_count") 543 | try: 544 | if failure := await dwnld.failure(): 545 | raise RuntimeError(f"Failed to download {dwnld.url}: {failure}") 546 | download.body = (await dwnld.path()).read_bytes() 547 | download.url = dwnld.url 548 | download.suggested_filename = dwnld.suggested_filename 549 | except Exception as ex: 550 | download.exception = ex 551 | finally: 552 | download_ready.set() 553 | 554 | async def _handle_response(response: PlaywrightResponse) -> None: 555 | download.response_status = response.status 556 | download.headers = await response.all_headers() 557 | download_started.set() 558 | 559 | page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {} 560 | page_goto_kwargs.pop("url", None) 561 | page.on("download", _handle_download) 562 | page.on("response", _handle_response) 563 | try: 564 | response = await page.goto(url=request.url, **page_goto_kwargs) 565 | except PlaywrightError as err: 566 | if not ( 567 | self.config.browser_type_name in ("firefox", "webkit") 568 | and "Download is starting" in err.message 569 | or self.config.browser_type_name == "chromium" 570 | and "net::ERR_ABORTED" in err.message 571 | ): 572 | raise 573 | 574 | logger.debug( 575 | "Navigating to %s failed", 576 | request.url, 577 | extra={ 578 | "spider": spider, 579 | "context_name": request.meta.get("playwright_context"), 580 | "scrapy_request_url": request.url, 581 | "scrapy_request_method": request.method, 582 | }, 583 | ) 584 | await download_started.wait() 585 | 586 | if download.response_status == 204: 587 | raise err 588 | 589 | logger.debug( 590 | "Waiting on download to finish for %s", 591 | request.url, 592 | extra={ 593 | "spider": spider, 594 | "context_name": request.meta.get("playwright_context"), 595 | "scrapy_request_url": request.url, 596 | "scrapy_request_method": request.method, 597 | }, 598 | ) 599 | await download_ready.wait() 600 | finally: 601 | page.remove_listener("download", _handle_download) 602 | page.remove_listener("response", _handle_response) 603 | 604 | return response, download if download else None 605 | 606 | async def _apply_page_methods(self, page: Page, request: Request, spider: Spider) -> None: 607 | context_name = request.meta.get("playwright_context") 608 | page_methods = request.meta.get("playwright_page_methods") or () 609 | if isinstance(page_methods, dict): 610 | page_methods = page_methods.values() 611 | for pm in page_methods: 612 | if isinstance(pm, PageMethod): 613 | try: 614 | if callable(pm.method): 615 | method = partial(pm.method, page) 616 | else: 617 | method = getattr(page, pm.method) 618 | except AttributeError as ex: 619 | logger.warning( 620 | "Ignoring %r: could not find method", 621 | pm, 622 | extra={ 623 | "spider": spider, 624 | "context_name": context_name, 625 | "scrapy_request_url": request.url, 626 | "scrapy_request_method": request.method, 627 | "exception": ex, 628 | }, 629 | exc_info=True, 630 | ) 631 | else: 632 | pm.result = await _maybe_await(method(*pm.args, **pm.kwargs)) 633 | await page.wait_for_load_state(timeout=self.config.navigation_timeout) 634 | else: 635 | logger.warning( 636 | "Ignoring %r: expected PageMethod, got %r", 637 | pm, 638 | type(pm), 639 | extra={ 640 | "spider": spider, 641 | "context_name": context_name, 642 | "scrapy_request_url": request.url, 643 | "scrapy_request_method": request.method, 644 | }, 645 | ) 646 | 647 | def _increment_request_stats(self, request: PlaywrightRequest) -> None: 648 | stats_prefix = "playwright/request_count" 649 | self.stats.inc_value(stats_prefix) 650 | self.stats.inc_value(f"{stats_prefix}/resource_type/{request.resource_type}") 651 | self.stats.inc_value(f"{stats_prefix}/method/{request.method}") 652 | if request.is_navigation_request(): 653 | self.stats.inc_value(f"{stats_prefix}/navigation") 654 | 655 | def _increment_response_stats(self, response: PlaywrightResponse) -> None: 656 | stats_prefix = "playwright/response_count" 657 | self.stats.inc_value(stats_prefix) 658 | self.stats.inc_value(f"{stats_prefix}/resource_type/{response.request.resource_type}") 659 | self.stats.inc_value(f"{stats_prefix}/method/{response.request.method}") 660 | 661 | async def _browser_disconnected_callback(self) -> None: 662 | close_context_coros = [ 663 | ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values() 664 | ] 665 | self.context_wrappers.clear() 666 | with suppress(TargetClosedError): 667 | await asyncio.gather(*close_context_coros) 668 | logger.debug("Browser disconnected") 669 | if self.config.restart_disconnected_browser: 670 | del self.browser 671 | 672 | def _make_close_page_callback(self, context_name: str) -> Callable: 673 | def close_page_callback() -> None: 674 | if context_name in self.context_wrappers: 675 | self.context_wrappers[context_name].semaphore.release() 676 | 677 | return close_page_callback 678 | 679 | def _make_close_browser_context_callback( 680 | self, name: str, persistent: bool, remote: bool, spider: Optional[Spider] = None 681 | ) -> Callable: 682 | def close_browser_context_callback() -> None: 683 | self.context_wrappers.pop(name, None) 684 | if hasattr(self, "context_semaphore"): 685 | self.context_semaphore.release() 686 | logger.debug( 687 | "Browser context closed: '%s' (persistent=%s, remote=%s)", 688 | name, 689 | persistent, 690 | remote, 691 | extra={ 692 | "spider": spider, 693 | "context_name": name, 694 | "persistent": persistent, 695 | "remote": remote, 696 | }, 697 | ) 698 | 699 | return close_browser_context_callback 700 | 701 | def _make_request_handler( 702 | self, 703 | context_name: str, 704 | method: str, 705 | url: str, 706 | headers: Headers, 707 | body: Optional[bytes], 708 | encoding: str, 709 | spider: Spider, 710 | initial_request_done: asyncio.Event, 711 | ) -> Callable: 712 | async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None: 713 | """Override request headers, method and body.""" 714 | if self.abort_request: 715 | should_abort = await _maybe_await(self.abort_request(playwright_request)) 716 | if should_abort: 717 | await route.abort() 718 | logger.debug( 719 | "[Context=%s] Aborted Playwright request <%s %s>", 720 | context_name, 721 | playwright_request.method.upper(), 722 | playwright_request.url, 723 | extra={ 724 | "spider": spider, 725 | "context_name": context_name, 726 | "scrapy_request_url": url, 727 | "scrapy_request_method": method, 728 | "playwright_request_url": playwright_request.url, 729 | "playwright_request_method": playwright_request.method, 730 | }, 731 | ) 732 | self.stats.inc_value("playwright/request_count/aborted") 733 | return None 734 | 735 | overrides: dict = {} 736 | 737 | if self.process_request_headers is None: 738 | final_headers = await playwright_request.all_headers() 739 | elif (sig := inspect.signature(self.process_request_headers)) and ( 740 | "browser_type_name" in sig.parameters 741 | and "playwright_request" in sig.parameters 742 | and "scrapy_request_data" in sig.parameters 743 | ): 744 | overrides["headers"] = final_headers = await _maybe_await( 745 | self.process_request_headers( 746 | browser_type_name=self.config.browser_type_name, 747 | playwright_request=playwright_request, 748 | scrapy_request_data={ 749 | "method": method, 750 | "url": url, 751 | "headers": headers, 752 | "body": body, 753 | "encoding": encoding, 754 | }, 755 | ) 756 | ) 757 | else: 758 | warnings.warn( 759 | "Accepting positional arguments in the function passed to the" 760 | " PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function" 761 | " should accept three (3) keyword arguments instead:" 762 | " browser_type_name: str," 763 | " playwright_request: playwright.async_api.Request," 764 | " scrapy_request_data: dict", 765 | category=ScrapyDeprecationWarning, 766 | stacklevel=1, 767 | ) 768 | overrides["headers"] = final_headers = await _maybe_await( 769 | self.process_request_headers( 770 | self.config.browser_type_name, 771 | playwright_request, 772 | headers, 773 | ) 774 | ) 775 | 776 | # if the current request corresponds to the original scrapy one 777 | if ( 778 | playwright_request.url.rstrip("/") == url.rstrip("/") 779 | and playwright_request.is_navigation_request() 780 | and not initial_request_done.is_set() 781 | ): 782 | initial_request_done.set() 783 | if method.upper() != playwright_request.method.upper(): 784 | overrides["method"] = method 785 | if body: 786 | overrides["post_data"] = body.decode(encoding) 787 | # the request that reaches the callback should contain the final headers 788 | headers.clear() 789 | headers.update(final_headers) 790 | 791 | del final_headers 792 | 793 | original_playwright_method: str = playwright_request.method 794 | try: 795 | await route.continue_(**overrides) 796 | if overrides.get("method"): 797 | logger.debug( 798 | "[Context=%s] Overridden method for Playwright request" 799 | " to %s: original=%s new=%s", 800 | context_name, 801 | playwright_request.url, 802 | original_playwright_method, 803 | overrides["method"], 804 | extra={ 805 | "spider": spider, 806 | "context_name": context_name, 807 | "scrapy_request_url": url, 808 | "scrapy_request_method": method, 809 | "playwright_request_url": playwright_request.url, 810 | "playwright_request_method_original": original_playwright_method, 811 | "playwright_request_method_new": overrides["method"], 812 | }, 813 | ) 814 | except PlaywrightError as ex: 815 | if _is_safe_close_error(ex): 816 | logger.warning( 817 | "Failed processing Playwright request: <%s %s> exc_type=%s exc_msg=%s", 818 | playwright_request.method, 819 | playwright_request.url, 820 | type(ex), 821 | str(ex), 822 | extra={ 823 | "spider": spider, 824 | "context_name": context_name, 825 | "scrapy_request_url": url, 826 | "scrapy_request_method": method, 827 | "playwright_request_url": playwright_request.url, 828 | "playwright_request_method": playwright_request.method, 829 | "exception": ex, 830 | }, 831 | exc_info=True, 832 | ) 833 | else: 834 | raise 835 | 836 | return _request_handler 837 | 838 | 839 | def _attach_page_event_handlers( 840 | page: Page, request: Request, spider: Spider, context_name: str 841 | ) -> None: 842 | event_handlers = request.meta.get("playwright_page_event_handlers") or {} 843 | for event, handler in event_handlers.items(): 844 | if callable(handler): 845 | page.on(event, handler) 846 | elif isinstance(handler, str): 847 | try: 848 | page.on(event, getattr(spider, handler)) 849 | except AttributeError as ex: 850 | logger.warning( 851 | "Spider '%s' does not have a '%s' attribute," 852 | " ignoring handler for event '%s'", 853 | spider.name, 854 | handler, 855 | event, 856 | extra={ 857 | "spider": spider, 858 | "context_name": context_name, 859 | "scrapy_request_url": request.url, 860 | "scrapy_request_method": request.method, 861 | "exception": ex, 862 | }, 863 | exc_info=True, 864 | ) 865 | 866 | 867 | async def _set_redirect_meta(request: Request, response: PlaywrightResponse) -> None: 868 | """Update a Scrapy request with metadata about redirects.""" 869 | redirect_times: int = 0 870 | redirect_urls: list = [] 871 | redirect_reasons: list = [] 872 | redirected = response.request.redirected_from 873 | while redirected is not None: 874 | redirect_times += 1 875 | redirect_urls.append(redirected.url) 876 | redirected_response = await redirected.response() 877 | reason = None if redirected_response is None else redirected_response.status 878 | redirect_reasons.append(reason) 879 | redirected = redirected.redirected_from 880 | if redirect_times: 881 | request.meta["redirect_times"] = redirect_times 882 | request.meta["redirect_urls"] = list(reversed(redirect_urls)) 883 | request.meta["redirect_reasons"] = list(reversed(redirect_reasons)) 884 | 885 | 886 | async def _maybe_execute_page_init_callback( 887 | page: Page, 888 | request: Request, 889 | context_name: str, 890 | spider: Spider, 891 | ) -> None: 892 | page_init_callback = request.meta.get("playwright_page_init_callback") 893 | if page_init_callback: 894 | try: 895 | page_init_callback = load_object(page_init_callback) 896 | await page_init_callback(page, request) 897 | except Exception as ex: 898 | logger.warning( 899 | "[Context=%s] Page init callback exception for %s exc_type=%s exc_msg=%s", 900 | context_name, 901 | repr(request), 902 | type(ex), 903 | str(ex), 904 | extra={ 905 | "spider": spider, 906 | "context_name": context_name, 907 | "scrapy_request_url": request.url, 908 | "scrapy_request_method": request.method, 909 | "exception": ex, 910 | }, 911 | exc_info=True, 912 | ) 913 | 914 | 915 | def _make_request_logger(context_name: str, spider: Spider) -> Callable: 916 | async def _log_request(request: PlaywrightRequest) -> None: 917 | log_args = [context_name, request.method.upper(), request.url, request.resource_type] 918 | referrer = await _get_header_value(request, "referer") 919 | if referrer: 920 | log_args.append(referrer) 921 | log_msg = "[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)" 922 | else: 923 | log_msg = "[Context=%s] Request: <%s %s> (resource type: %s)" 924 | logger.debug( 925 | log_msg, 926 | *log_args, 927 | extra={ 928 | "spider": spider, 929 | "context_name": context_name, 930 | "playwright_request_url": request.url, 931 | "playwright_request_method": request.method, 932 | "playwright_resource_type": request.resource_type, 933 | }, 934 | ) 935 | 936 | return _log_request 937 | 938 | 939 | def _make_response_logger(context_name: str, spider: Spider) -> Callable: 940 | async def _log_response(response: PlaywrightResponse) -> None: 941 | log_args = [context_name, response.status, response.url] 942 | location = await _get_header_value(response, "location") 943 | if location: 944 | log_args.append(location) 945 | log_msg = "[Context=%s] Response: <%i %s> (location: %s)" 946 | else: 947 | log_msg = "[Context=%s] Response: <%i %s>" 948 | logger.debug( 949 | log_msg, 950 | *log_args, 951 | extra={ 952 | "spider": spider, 953 | "context_name": context_name, 954 | "playwright_response_url": response.url, 955 | "playwright_response_status": response.status, 956 | }, 957 | ) 958 | 959 | return _log_response 960 | -------------------------------------------------------------------------------- /scrapy_playwright/headers.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module includes functions to process request headers. 3 | Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information. 4 | """ 5 | 6 | from typing import Dict 7 | from urllib.parse import urlparse 8 | 9 | from playwright.async_api import Request as PlaywrightRequest 10 | 11 | 12 | async def use_scrapy_headers( 13 | *, 14 | browser_type_name: str, 15 | playwright_request: PlaywrightRequest, 16 | scrapy_request_data: dict, 17 | ) -> Dict[str, str]: 18 | """Scrapy headers take precedence over Playwright headers for navigation requests. 19 | For non-navigation requests, only User-Agent is taken from the Scrapy headers.""" 20 | 21 | scrapy_headers_str = scrapy_request_data["headers"].to_unicode_dict() 22 | playwright_headers = await playwright_request.all_headers() 23 | 24 | # Scrapy's user agent has priority over Playwright's 25 | scrapy_headers_str.setdefault("user-agent", playwright_headers.get("user-agent")) 26 | 27 | if playwright_request.is_navigation_request(): 28 | # if referer header is set via playwright_page_goto_kwargs 29 | if referer := playwright_headers.get("referer"): 30 | scrapy_headers_str.setdefault("referer", referer) 31 | 32 | # otherwise it fails with playwright.helper.Error: NS_ERROR_NET_RESET 33 | if browser_type_name == "firefox": 34 | scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc 35 | 36 | return scrapy_headers_str 37 | 38 | # override user agent, for consistency with other requests 39 | if scrapy_headers_str.get("user-agent"): 40 | playwright_headers["user-agent"] = scrapy_headers_str["user-agent"] 41 | return playwright_headers 42 | -------------------------------------------------------------------------------- /scrapy_playwright/memusage.py: -------------------------------------------------------------------------------- 1 | from contextlib import suppress 2 | from importlib import import_module 3 | from typing import List 4 | 5 | from scrapy.exceptions import NotConfigured 6 | from scrapy.extensions.memusage import MemoryUsage 7 | 8 | from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler, logger 9 | 10 | 11 | _MIB_FACTOR = 1024**2 12 | 13 | 14 | class ScrapyPlaywrightMemoryUsageExtension(MemoryUsage): 15 | def __init__(self, *args, **kwargs) -> None: 16 | super().__init__(*args, **kwargs) 17 | try: 18 | self.psutil = import_module("psutil") 19 | except ImportError as exc: 20 | raise NotConfigured("The psutil module is not available") from exc 21 | 22 | def _get_main_process_ids(self) -> List[int]: 23 | try: 24 | return [ 25 | handler.playwright_context_manager._connection._transport._proc.pid 26 | for handler in self.crawler.engine.downloader.handlers._handlers.values() 27 | if isinstance(handler, ScrapyPlaywrightDownloadHandler) 28 | and handler.playwright_context_manager 29 | ] 30 | except Exception: 31 | return [] 32 | 33 | def _get_descendant_processes(self, process) -> list: 34 | children = process.children() 35 | result = children.copy() 36 | for child in children: 37 | result.extend(self._get_descendant_processes(child)) 38 | return result 39 | 40 | def _get_total_playwright_process_memory(self) -> int: 41 | process_list = [self.psutil.Process(pid) for pid in self._get_main_process_ids()] 42 | for proc in process_list.copy(): 43 | process_list.extend(self._get_descendant_processes(proc)) 44 | total_process_size = 0 45 | for proc in process_list: 46 | with suppress(Exception): # might fail if the process exited in the meantime 47 | total_process_size += proc.memory_info().rss 48 | logger.debug( 49 | "Total Playwright process memory: %i Bytes (%i MiB)", 50 | total_process_size, 51 | total_process_size / _MIB_FACTOR, 52 | ) 53 | return total_process_size 54 | 55 | def get_virtual_size(self) -> int: 56 | return super().get_virtual_size() + self._get_total_playwright_process_memory() 57 | -------------------------------------------------------------------------------- /scrapy_playwright/page.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Union 2 | 3 | 4 | __all__ = ["PageMethod"] 5 | 6 | 7 | class PageMethod: 8 | """ 9 | Represents a method to be called (and awaited if necessary) on a 10 | Playwright page, such as "click", "screenshot", "evaluate", etc. 11 | 12 | If a callable is received, it will be called with the page as its first argument. 13 | Any additional arguments are passed to the callable after the page. 14 | """ 15 | 16 | def __init__(self, method: Union[str, Callable], *args, **kwargs) -> None: 17 | self.method: Union[str, Callable] = method 18 | self.args: tuple = args 19 | self.kwargs: dict = kwargs 20 | self.result: Any = None 21 | 22 | def __str__(self) -> str: 23 | return f"<{self.__class__.__name__} for method '{self.method}'>" 24 | 25 | __repr__ = __str__ 26 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 99 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | from scrapy_playwright import __version__ 4 | 5 | 6 | with open("README.md", "r", encoding="utf-8") as fh: 7 | long_description = fh.read() 8 | 9 | 10 | setuptools.setup( 11 | name="scrapy-playwright", 12 | version=__version__, 13 | license="BSD", 14 | description="Playwright integration for Scrapy", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | author="Eugenio Lacuesta", 18 | author_email="eugenio.lacuesta@gmail.com", 19 | url="https://github.com/scrapy-plugins/scrapy-playwright", 20 | packages=["scrapy_playwright"], 21 | classifiers=[ 22 | "Development Status :: 4 - Beta", 23 | "License :: OSI Approved :: BSD License", 24 | "Programming Language :: Python", 25 | "Programming Language :: Python :: 3.8", 26 | "Programming Language :: Python :: 3.9", 27 | "Programming Language :: Python :: 3.10", 28 | "Programming Language :: Python :: 3.11", 29 | "Programming Language :: Python :: 3.12", 30 | "Framework :: Scrapy", 31 | "Intended Audience :: Developers", 32 | "Topic :: Internet :: WWW/HTTP", 33 | "Topic :: Software Development :: Libraries :: Application Frameworks", 34 | "Topic :: Software Development :: Libraries :: Python Modules", 35 | ], 36 | python_requires=">=3.8", 37 | install_requires=[ 38 | "scrapy>=2.0,!=2.4.0", 39 | "playwright>=1.15", 40 | ], 41 | ) 42 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | import logging 4 | import platform 5 | from contextlib import asynccontextmanager 6 | from functools import wraps 7 | from typing import Optional 8 | 9 | from scrapy import Request 10 | from scrapy.http.response.html import HtmlResponse 11 | from scrapy.utils.test import get_crawler 12 | 13 | 14 | logger = logging.getLogger("scrapy-playwright-tests") 15 | 16 | 17 | if platform.system() == "Windows": 18 | from scrapy_playwright._utils import _ThreadedLoopAdapter 19 | 20 | def allow_windows(test_method): 21 | """Wrap tests with the _ThreadedLoopAdapter class on Windows.""" 22 | if not inspect.iscoroutinefunction(test_method): 23 | raise RuntimeError(f"{test_method} must be an async def method") 24 | 25 | @wraps(test_method) 26 | async def wrapped(self, *args, **kwargs): 27 | caller_id = 1234 28 | _ThreadedLoopAdapter.start(caller_id) 29 | coro = test_method(self, *args, **kwargs) 30 | asyncio.run_coroutine_threadsafe(coro=coro, loop=_ThreadedLoopAdapter._loop).result() 31 | _ThreadedLoopAdapter.stop(caller_id) 32 | 33 | return wrapped 34 | 35 | else: 36 | 37 | def allow_windows(test_method): 38 | return test_method 39 | 40 | 41 | @asynccontextmanager 42 | async def make_handler(settings_dict: Optional[dict] = None): 43 | """Convenience function to obtain an initialized handler and close it gracefully""" 44 | from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler 45 | 46 | settings: dict = settings_dict or {} 47 | settings.setdefault("TELNETCONSOLE_ENABLED", False) 48 | crawler = get_crawler(settings_dict=settings) 49 | handler = ScrapyPlaywrightDownloadHandler(crawler=crawler) 50 | try: 51 | await handler._launch() 52 | except: # noqa (E722), pylint: disable=bare-except 53 | pass 54 | else: 55 | yield handler 56 | finally: 57 | await handler._close() 58 | 59 | 60 | def assert_correct_response(response: HtmlResponse, request: Request) -> None: 61 | assert isinstance(response, HtmlResponse) 62 | assert response.request is request 63 | assert response.url == request.url 64 | assert response.status == 200 65 | assert "playwright" in response.flags 66 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | import pytest 4 | 5 | 6 | @pytest.hookimpl(tryfirst=True) 7 | def pytest_configure(config): 8 | # https://twistedmatrix.com/trac/ticket/9766 9 | # https://github.com/pytest-dev/pytest-twisted/issues/80 10 | 11 | if config.getoption("reactor", "default") == "asyncio" and platform.system() == "Windows": 12 | import asyncio 13 | 14 | selector_policy = asyncio.WindowsSelectorEventLoopPolicy() 15 | asyncio.set_event_loop_policy(selector_policy) 16 | 17 | 18 | def pytest_sessionstart(session): # pylint: disable=unused-argument 19 | """ 20 | Called after the Session object has been created and before performing 21 | collection and entering the run test loop. 22 | """ 23 | from twisted.internet.asyncioreactor import install, AsyncioSelectorReactor 24 | from twisted.internet.error import ReactorAlreadyInstalledError 25 | 26 | try: 27 | install() 28 | except ReactorAlreadyInstalledError as exc: 29 | from twisted.internet import reactor 30 | 31 | if not isinstance(reactor, AsyncioSelectorReactor): 32 | raise RuntimeError(f"Wrong reactor installed: {type(reactor)}") from exc 33 | -------------------------------------------------------------------------------- /tests/launch_chromium_server.js: -------------------------------------------------------------------------------- 1 | // used to start a browser server to test the PLAYWRIGHT_CONNECT_URL setting 2 | // usage: 3 | // node launch_browser_server.js PORT WS_PATH 4 | 5 | const { chromium } = require('playwright'); // Or 'webkit' or 'firefox'. 6 | 7 | (async () => { 8 | const browserServer = await chromium.launchServer({ 9 | host: 'localhost', 10 | port: process.argv[2], 11 | wsPath: process.argv[3] 12 | }); 13 | console.log(browserServer.wsEndpoint()) 14 | })(); 15 | -------------------------------------------------------------------------------- /tests/mockserver.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import sys 4 | import time 5 | from http.server import HTTPServer, BaseHTTPRequestHandler 6 | from pathlib import Path 7 | from subprocess import Popen, PIPE 8 | from threading import Thread 9 | from typing import Optional 10 | from urllib.parse import urljoin, urlparse, parse_qs 11 | 12 | 13 | class StaticMockServer: 14 | """A web server that serves the contents of the sibling "site" directory. 15 | To be used as a context manager: 16 | 17 | with StaticMockServer() as server: 18 | url = server.urljoin("/index.html") 19 | ... 20 | """ 21 | 22 | def __enter__(self): 23 | self.proc = Popen( 24 | [sys.executable, "-u", "-m", "http.server", "0", "--bind", "127.0.0.1"], 25 | stdout=PIPE, 26 | cwd=str(Path(__file__).absolute().parent / "site"), 27 | ) 28 | self.address, self.port = re.search( 29 | r"^Serving HTTP on (\d+\.\d+\.\d+\.\d+) port (\d+)", 30 | self.proc.stdout.readline().strip().decode("ascii"), 31 | ).groups() 32 | return self 33 | 34 | def __exit__(self, exc_type, exc_value, traceback): 35 | self.proc.kill() 36 | self.proc.communicate() 37 | 38 | def urljoin(self, url): 39 | return urljoin(f"http://{self.address}:{self.port}", url) 40 | 41 | 42 | class _RequestHandler(BaseHTTPRequestHandler): 43 | def do_POST(self) -> None: 44 | """Echo back the request body""" 45 | content_length = int(self.headers.get("Content-Length") or 0) 46 | body_bytes = b"Request body: " + self.rfile.read(content_length) 47 | self.send_response(200) 48 | self.send_header("Content-Length", str(len(body_bytes))) 49 | self.end_headers() 50 | self.wfile.write(body_bytes) 51 | 52 | def do_GET(self) -> None: 53 | parsed_path = urlparse(self.path) 54 | query_string = {key: values[0] for key, values in parse_qs(parsed_path.query).items()} 55 | 56 | if delay := int(query_string.get("delay") or 0): 57 | print(f"Sleeping {delay} seconds on path {parsed_path.path}...") 58 | time.sleep(delay) 59 | 60 | if parsed_path.path == "/headers": 61 | self._send_json(dict(self.headers)) 62 | elif parsed_path.path == "/status/204": 63 | self.send_response(204) 64 | self.end_headers() 65 | elif parsed_path.path == "/redirect2": 66 | self.send_response(302) 67 | self.send_header("Content-Length", "0") 68 | self.send_header("Location", "/redirect") 69 | self.end_headers() 70 | elif parsed_path.path == "/redirect": 71 | self.send_response(301) 72 | self.send_header("Content-Length", "0") 73 | self.send_header("Location", "/headers") 74 | self.end_headers() 75 | elif parsed_path.path == "/mancha.pdf": 76 | body_bytes = (Path(__file__).absolute().parent / "site/files/mancha.pdf").read_bytes() 77 | content_length_multiplier = int(query_string.get("content_length_multiplier") or 1) 78 | self.send_response(200) 79 | self.send_header("Content-Type", "application/pdf") 80 | self.send_header("Content-Disposition", 'attachment; filename="mancha.pdf"') 81 | self.send_header("Content-Length", str(len(body_bytes) * content_length_multiplier)) 82 | self.end_headers() 83 | self.wfile.write(body_bytes) 84 | else: 85 | self._send_json({"error": "unknown path"}, status=404) 86 | 87 | def _send_json(self, body: dict, status: int = 200) -> None: 88 | body_bytes = json.dumps(body, indent=2).encode("utf8") 89 | self.send_response(status) 90 | self.send_header("Content-Length", str(len(body_bytes))) 91 | self.send_header("Content-Type", "application/json") 92 | self.end_headers() 93 | self.wfile.write(body_bytes) 94 | 95 | 96 | class MockServer: 97 | """A context manager web server using the _RequestHandler class to handle requests.""" 98 | 99 | def __enter__(self): 100 | self.httpd = HTTPServer(("127.0.0.1", 0), _RequestHandler) 101 | self.address, self.port = self.httpd.server_address 102 | self.thread = Thread(target=self.httpd.serve_forever) 103 | self.thread.start() 104 | return self 105 | 106 | def __exit__(self, exc_type, exc_value, traceback): 107 | self.httpd.shutdown() 108 | self.thread.join() 109 | 110 | def urljoin(self, url: Optional[str] = None) -> str: 111 | return urljoin(f"http://{self.address}:{self.port}", url) 112 | 113 | 114 | if __name__ == "__main__": 115 | with MockServer() as server: 116 | print(f"Listening at http://{server.address}:{server.port}") 117 | while True: 118 | pass 119 | -------------------------------------------------------------------------------- /tests/site/data/quotes1.json: -------------------------------------------------------------------------------- 1 | { 2 | "has_next": true, 3 | "page": 1, 4 | "quotes": [ 5 | { 6 | "author": { 7 | "goodreads_link": "/author/show/9810.Albert_Einstein", 8 | "name": "Albert Einstein", 9 | "slug": "Albert-Einstein" 10 | }, 11 | "tags": [ 12 | "change", 13 | "deep-thoughts", 14 | "thinking", 15 | "world" 16 | ], 17 | "text": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d" 18 | }, 19 | { 20 | "author": { 21 | "goodreads_link": "/author/show/1077326.J_K_Rowling", 22 | "name": "J.K. Rowling", 23 | "slug": "J-K-Rowling" 24 | }, 25 | "tags": [ 26 | "abilities", 27 | "choices" 28 | ], 29 | "text": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d" 30 | }, 31 | { 32 | "author": { 33 | "goodreads_link": "/author/show/9810.Albert_Einstein", 34 | "name": "Albert Einstein", 35 | "slug": "Albert-Einstein" 36 | }, 37 | "tags": [ 38 | "inspirational", 39 | "life", 40 | "live", 41 | "miracle", 42 | "miracles" 43 | ], 44 | "text": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d" 45 | }, 46 | { 47 | "author": { 48 | "goodreads_link": "/author/show/1265.Jane_Austen", 49 | "name": "Jane Austen", 50 | "slug": "Jane-Austen" 51 | }, 52 | "tags": [ 53 | "aliteracy", 54 | "books", 55 | "classic", 56 | "humor" 57 | ], 58 | "text": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d" 59 | }, 60 | { 61 | "author": { 62 | "goodreads_link": "/author/show/82952.Marilyn_Monroe", 63 | "name": "Marilyn Monroe", 64 | "slug": "Marilyn-Monroe" 65 | }, 66 | "tags": [ 67 | "be-yourself", 68 | "inspirational" 69 | ], 70 | "text": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d" 71 | }, 72 | { 73 | "author": { 74 | "goodreads_link": "/author/show/9810.Albert_Einstein", 75 | "name": "Albert Einstein", 76 | "slug": "Albert-Einstein" 77 | }, 78 | "tags": [ 79 | "adulthood", 80 | "success", 81 | "value" 82 | ], 83 | "text": "\u201cTry not to become a man of success. Rather become a man of value.\u201d" 84 | }, 85 | { 86 | "author": { 87 | "goodreads_link": "/author/show/7617.Andr_Gide", 88 | "name": "Andr\u00e9 Gide", 89 | "slug": "Andre-Gide" 90 | }, 91 | "tags": [ 92 | "life", 93 | "love" 94 | ], 95 | "text": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d" 96 | }, 97 | { 98 | "author": { 99 | "goodreads_link": "/author/show/3091287.Thomas_A_Edison", 100 | "name": "Thomas A. Edison", 101 | "slug": "Thomas-A-Edison" 102 | }, 103 | "tags": [ 104 | "edison", 105 | "failure", 106 | "inspirational", 107 | "paraphrased" 108 | ], 109 | "text": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d" 110 | }, 111 | { 112 | "author": { 113 | "goodreads_link": "/author/show/44566.Eleanor_Roosevelt", 114 | "name": "Eleanor Roosevelt", 115 | "slug": "Eleanor-Roosevelt" 116 | }, 117 | "tags": [ 118 | "misattributed-eleanor-roosevelt" 119 | ], 120 | "text": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d" 121 | }, 122 | { 123 | "author": { 124 | "goodreads_link": "/author/show/7103.Steve_Martin", 125 | "name": "Steve Martin", 126 | "slug": "Steve-Martin" 127 | }, 128 | "tags": [ 129 | "humor", 130 | "obvious", 131 | "simile" 132 | ], 133 | "text": "\u201cA day without sunshine is like, you know, night.\u201d" 134 | } 135 | ], 136 | "tag": null, 137 | "top_ten_tags": [ 138 | [ 139 | "love", 140 | 14 141 | ], 142 | [ 143 | "inspirational", 144 | 13 145 | ], 146 | [ 147 | "life", 148 | 13 149 | ], 150 | [ 151 | "humor", 152 | 12 153 | ], 154 | [ 155 | "books", 156 | 11 157 | ], 158 | [ 159 | "reading", 160 | 7 161 | ], 162 | [ 163 | "friendship", 164 | 5 165 | ], 166 | [ 167 | "friends", 168 | 4 169 | ], 170 | [ 171 | "truth", 172 | 4 173 | ], 174 | [ 175 | "simile", 176 | 3 177 | ] 178 | ] 179 | } -------------------------------------------------------------------------------- /tests/site/data/quotes2.json: -------------------------------------------------------------------------------- 1 | { 2 | "has_next": true, 3 | "page": 2, 4 | "quotes": [ 5 | { 6 | "author": { 7 | "goodreads_link": "/author/show/82952.Marilyn_Monroe", 8 | "name": "Marilyn Monroe", 9 | "slug": "Marilyn-Monroe" 10 | }, 11 | "tags": [ 12 | "friends", 13 | "heartbreak", 14 | "inspirational", 15 | "life", 16 | "love", 17 | "sisters" 18 | ], 19 | "text": "\u201cThis life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go of them. Also remember, sisters make the best friends in the world. As for lovers, well, they'll come and go too. And baby, I hate to say it, most of them - actually pretty much all of them are going to break your heart, but you can't give up because if you give up, you'll never find your soulmate. You'll never find that half who makes you whole and that goes for everything. Just because you fail once, doesn't mean you're gonna fail at everything. Keep trying, hold on, and always, always, always believe in yourself, because if you don't, then who will, sweetie? So keep your head high, keep your chin up, and most importantly, keep smiling, because life's a beautiful thing and there's so much to smile about.\u201d" 20 | }, 21 | { 22 | "author": { 23 | "goodreads_link": "/author/show/1077326.J_K_Rowling", 24 | "name": "J.K. Rowling", 25 | "slug": "J-K-Rowling" 26 | }, 27 | "tags": [ 28 | "courage", 29 | "friends" 30 | ], 31 | "text": "\u201cIt takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.\u201d" 32 | }, 33 | { 34 | "author": { 35 | "goodreads_link": "/author/show/9810.Albert_Einstein", 36 | "name": "Albert Einstein", 37 | "slug": "Albert-Einstein" 38 | }, 39 | "tags": [ 40 | "simplicity", 41 | "understand" 42 | ], 43 | "text": "\u201cIf you can't explain it to a six year old, you don't understand it yourself.\u201d" 44 | }, 45 | { 46 | "author": { 47 | "goodreads_link": "/author/show/25241.Bob_Marley", 48 | "name": "Bob Marley", 49 | "slug": "Bob-Marley" 50 | }, 51 | "tags": [ 52 | "love" 53 | ], 54 | "text": "\u201cYou may not be her first, her last, or her only. She loved before she may love again. But if she loves you now, what else matters? She's not perfect\u2014you aren't either, and the two of you may never be perfect together but if she can make you laugh, cause you to think twice, and admit to being human and making mistakes, hold onto her and give her the most you can. She may not be thinking about you every second of the day, but she will give you a part of her that she knows you can break\u2014her heart. So don't hurt her, don't change her, don't analyze and don't expect more than she can give. Smile when she makes you happy, let her know when she makes you mad, and miss her when she's not there.\u201d" 55 | }, 56 | { 57 | "author": { 58 | "goodreads_link": "/author/show/61105.Dr_Seuss", 59 | "name": "Dr. Seuss", 60 | "slug": "Dr-Seuss" 61 | }, 62 | "tags": [ 63 | "fantasy" 64 | ], 65 | "text": "\u201cI like nonsense, it wakes up the brain cells. Fantasy is a necessary ingredient in living.\u201d" 66 | }, 67 | { 68 | "author": { 69 | "goodreads_link": "/author/show/4.Douglas_Adams", 70 | "name": "Douglas Adams", 71 | "slug": "Douglas-Adams" 72 | }, 73 | "tags": [ 74 | "life", 75 | "navigation" 76 | ], 77 | "text": "\u201cI may not have gone where I intended to go, but I think I have ended up where I needed to be.\u201d" 78 | }, 79 | { 80 | "author": { 81 | "goodreads_link": "/author/show/1049.Elie_Wiesel", 82 | "name": "Elie Wiesel", 83 | "slug": "Elie-Wiesel" 84 | }, 85 | "tags": [ 86 | "activism", 87 | "apathy", 88 | "hate", 89 | "indifference", 90 | "inspirational", 91 | "love", 92 | "opposite", 93 | "philosophy" 94 | ], 95 | "text": "\u201cThe opposite of love is not hate, it's indifference. The opposite of art is not ugliness, it's indifference. The opposite of faith is not heresy, it's indifference. And the opposite of life is not death, it's indifference.\u201d" 96 | }, 97 | { 98 | "author": { 99 | "goodreads_link": "/author/show/1938.Friedrich_Nietzsche", 100 | "name": "Friedrich Nietzsche", 101 | "slug": "Friedrich-Nietzsche" 102 | }, 103 | "tags": [ 104 | "friendship", 105 | "lack-of-friendship", 106 | "lack-of-love", 107 | "love", 108 | "marriage", 109 | "unhappy-marriage" 110 | ], 111 | "text": "\u201cIt is not a lack of love, but a lack of friendship that makes unhappy marriages.\u201d" 112 | }, 113 | { 114 | "author": { 115 | "goodreads_link": "/author/show/1244.Mark_Twain", 116 | "name": "Mark Twain", 117 | "slug": "Mark-Twain" 118 | }, 119 | "tags": [ 120 | "books", 121 | "contentment", 122 | "friends", 123 | "friendship", 124 | "life" 125 | ], 126 | "text": "\u201cGood friends, good books, and a sleepy conscience: this is the ideal life.\u201d" 127 | }, 128 | { 129 | "author": { 130 | "goodreads_link": "/author/show/276029.Allen_Saunders", 131 | "name": "Allen Saunders", 132 | "slug": "Allen-Saunders" 133 | }, 134 | "tags": [ 135 | "fate", 136 | "life", 137 | "misattributed-john-lennon", 138 | "planning", 139 | "plans" 140 | ], 141 | "text": "\u201cLife is what happens to us while we are making other plans.\u201d" 142 | } 143 | ], 144 | "tag": null, 145 | "top_ten_tags": [ 146 | [ 147 | "love", 148 | 14 149 | ], 150 | [ 151 | "inspirational", 152 | 13 153 | ], 154 | [ 155 | "life", 156 | 13 157 | ], 158 | [ 159 | "humor", 160 | 12 161 | ], 162 | [ 163 | "books", 164 | 11 165 | ], 166 | [ 167 | "reading", 168 | 7 169 | ], 170 | [ 171 | "friendship", 172 | 5 173 | ], 174 | [ 175 | "friends", 176 | 4 177 | ], 178 | [ 179 | "truth", 180 | 4 181 | ], 182 | [ 183 | "simile", 184 | 3 185 | ] 186 | ] 187 | } -------------------------------------------------------------------------------- /tests/site/data/quotes3.json: -------------------------------------------------------------------------------- 1 | { 2 | "has_next": false, 3 | "page": 3, 4 | "quotes": [ 5 | { 6 | "author": { 7 | "goodreads_link": "/author/show/4026.Pablo_Neruda", 8 | "name": "Pablo Neruda", 9 | "slug": "Pablo-Neruda" 10 | }, 11 | "tags": [ 12 | "love", 13 | "poetry" 14 | ], 15 | "text": "\u201cI love you without knowing how, or when, or from where. I love you simply, without problems or pride: I love you in this way because I do not know any other way of loving but this, in which there is no I or you, so intimate that your hand upon my chest is my hand, so intimate that when I fall asleep your eyes close.\u201d" 16 | }, 17 | { 18 | "author": { 19 | "goodreads_link": "/author/show/12080.Ralph_Waldo_Emerson", 20 | "name": "Ralph Waldo Emerson", 21 | "slug": "Ralph-Waldo-Emerson" 22 | }, 23 | "tags": [ 24 | "happiness" 25 | ], 26 | "text": "\u201cFor every minute you are angry you lose sixty seconds of happiness.\u201d" 27 | }, 28 | { 29 | "author": { 30 | "goodreads_link": "/author/show/838305.Mother_Teresa", 31 | "name": "Mother Teresa", 32 | "slug": "Mother-Teresa" 33 | }, 34 | "tags": [ 35 | "attributed-no-source" 36 | ], 37 | "text": "\u201cIf you judge people, you have no time to love them.\u201d" 38 | }, 39 | { 40 | "author": { 41 | "goodreads_link": "/author/show/2014.Garrison_Keillor", 42 | "name": "Garrison Keillor", 43 | "slug": "Garrison-Keillor" 44 | }, 45 | "tags": [ 46 | "humor", 47 | "religion" 48 | ], 49 | "text": "\u201cAnyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.\u201d" 50 | }, 51 | { 52 | "author": { 53 | "goodreads_link": "/author/show/4427.Jim_Henson", 54 | "name": "Jim Henson", 55 | "slug": "Jim-Henson" 56 | }, 57 | "tags": [ 58 | "humor" 59 | ], 60 | "text": "\u201cBeauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.\u201d" 61 | }, 62 | { 63 | "author": { 64 | "goodreads_link": "/author/show/61105.Dr_Seuss", 65 | "name": "Dr. Seuss", 66 | "slug": "Dr-Seuss" 67 | }, 68 | "tags": [ 69 | "comedy", 70 | "life", 71 | "yourself" 72 | ], 73 | "text": "\u201cToday you are You, that is truer than true. There is no one alive who is Youer than You.\u201d" 74 | }, 75 | { 76 | "author": { 77 | "goodreads_link": "/author/show/9810.Albert_Einstein", 78 | "name": "Albert Einstein", 79 | "slug": "Albert-Einstein" 80 | }, 81 | "tags": [ 82 | "children", 83 | "fairy-tales" 84 | ], 85 | "text": "\u201cIf you want your children to be intelligent, read them fairy tales. If you want them to be more intelligent, read them more fairy tales.\u201d" 86 | }, 87 | { 88 | "author": { 89 | "goodreads_link": "/author/show/1077326.J_K_Rowling", 90 | "name": "J.K. Rowling", 91 | "slug": "J-K-Rowling" 92 | }, 93 | "tags": [], 94 | "text": "\u201cIt is impossible to live without failing at something, unless you live so cautiously that you might as well not have lived at all - in which case, you fail by default.\u201d" 95 | }, 96 | { 97 | "author": { 98 | "goodreads_link": "/author/show/9810.Albert_Einstein", 99 | "name": "Albert Einstein", 100 | "slug": "Albert-Einstein" 101 | }, 102 | "tags": [ 103 | "imagination" 104 | ], 105 | "text": "\u201cLogic will get you from A to Z; imagination will get you everywhere.\u201d" 106 | }, 107 | { 108 | "author": { 109 | "goodreads_link": "/author/show/25241.Bob_Marley", 110 | "name": "Bob Marley", 111 | "slug": "Bob-Marley" 112 | }, 113 | "tags": [ 114 | "music" 115 | ], 116 | "text": "\u201cOne good thing about music, when it hits you, you feel no pain.\u201d" 117 | } 118 | ], 119 | "tag": null, 120 | "top_ten_tags": [ 121 | [ 122 | "love", 123 | 14 124 | ], 125 | [ 126 | "inspirational", 127 | 13 128 | ], 129 | [ 130 | "life", 131 | 13 132 | ], 133 | [ 134 | "humor", 135 | 12 136 | ], 137 | [ 138 | "books", 139 | 11 140 | ], 141 | [ 142 | "reading", 143 | 7 144 | ], 145 | [ 146 | "friendship", 147 | 5 148 | ], 149 | [ 150 | "friends", 151 | 4 152 | ], 153 | [ 154 | "truth", 155 | 4 156 | ], 157 | [ 158 | "simile", 159 | 3 160 | ] 161 | ] 162 | } 163 | -------------------------------------------------------------------------------- /tests/site/files/mancha.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/site/files/mancha.pdf -------------------------------------------------------------------------------- /tests/site/gallery.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Gallery 5 | 6 | 7 | 8 | 9 | 10 |
11 |

Gallery

12 | 23 |
24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/site/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Awesome site 5 | 6 | 7 | 8 | 9 | 10 |
11 |

Awesome site

12 |

Lorem Ipsum

13 |

Infinite Scroll

14 |
15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/site/lorem_ipsum.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Lorem Ipsum 5 | 6 | 7 | 8 | 9 |
10 |

Lorem ipsum dolor sit amet, consectetur adipiscing elit.

11 |
12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/site/redirect.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Page should redirect 5 | 6 | 7 | 8 | 9 | 10 | 11 |

You should not see this because you are immediately redirected.

12 | 13 | 14 | -------------------------------------------------------------------------------- /tests/site/scroll.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Quotes to Scrape 6 | 7 | 8 | 9 | 10 |
11 |
12 |
13 |
14 |
15 |
16 |
Loading...
17 |
18 | 19 | 20 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /tests/site/static/img/ales-krivec-ZMZHcvIVgbg-unsplash.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/site/static/img/ales-krivec-ZMZHcvIVgbg-unsplash.jpg -------------------------------------------------------------------------------- /tests/site/static/img/elyssa-fahndrich-MF16lGb95WY-unsplash.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/site/static/img/elyssa-fahndrich-MF16lGb95WY-unsplash.jpg -------------------------------------------------------------------------------- /tests/site/static/img/nathan-dumlao-RCfalHrnFAs-unsplash.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/site/static/img/nathan-dumlao-RCfalHrnFAs-unsplash.jpg -------------------------------------------------------------------------------- /tests/site/static/main.css: -------------------------------------------------------------------------------- 1 | /* Custom page CSS */ 2 | body { 3 | font-family: sans-serif; 4 | } 5 | 6 | .container .text-muted { 7 | margin: 20px 0; 8 | } 9 | 10 | .tags-box { 11 | text-align: right; 12 | } 13 | 14 | .tags-box h2 { 15 | margin-top: 0px; 16 | } 17 | 18 | .tag-item { 19 | display: block; 20 | margin: 4px; 21 | } 22 | 23 | .quote { 24 | padding: 10px; 25 | margin-bottom: 30px; 26 | border: 1px solid #333333; 27 | border-radius: 5px; 28 | box-shadow: 2px 2px 3px #333333; 29 | } 30 | 31 | .quote small.author { 32 | font-weight: bold; 33 | color: #3677E8; 34 | } 35 | 36 | .quote span.text { 37 | display: block; 38 | margin-bottom: 5px; 39 | font-size: large; 40 | font-style: italic; 41 | } 42 | 43 | .quote .tags { 44 | margin-top: 10px; 45 | } 46 | 47 | .tag { 48 | padding: 2px 5px; 49 | border-radius: 5px; 50 | color: white; 51 | font-size: small; 52 | background-color: #7CA3E6; 53 | } 54 | 55 | a.tag:hover { 56 | text-decoration: none; 57 | } 58 | 59 | /* Sticky footer styles */ 60 | html { 61 | position: relative; 62 | min-height: 100%; 63 | } 64 | 65 | body { 66 | /* Margin bottom by footer height */ 67 | margin-bottom: 60px; 68 | } 69 | 70 | .footer { 71 | position: absolute; 72 | bottom: 0; 73 | width: 100%; 74 | /* Set the fixed height of the footer here */ 75 | height: 6em; 76 | background-color: #f5f5f5; 77 | } 78 | 79 | .error { 80 | color: red; 81 | } 82 | 83 | .header-box { 84 | padding-bottom: 40px; 85 | } 86 | 87 | .header-box p { 88 | margin-top: 30px; 89 | float: right; 90 | } 91 | 92 | .author-details { 93 | width: 80%; 94 | } 95 | 96 | .author-description { 97 | text-align: justify; 98 | margin-bottom: 20px; 99 | } 100 | 101 | ul.pager { 102 | margin-bottom: 100px; 103 | } 104 | 105 | .copyright { 106 | text-align: center; 107 | } 108 | 109 | .sh-red { 110 | color: #cc0b0f; 111 | } 112 | -------------------------------------------------------------------------------- /tests/tests_asyncio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/tests_asyncio/__init__.py -------------------------------------------------------------------------------- /tests/tests_asyncio/test_browser.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | import platform 5 | import random 6 | import re 7 | import signal 8 | import subprocess 9 | import time 10 | import uuid 11 | from contextlib import asynccontextmanager 12 | from pathlib import Path 13 | from threading import Thread 14 | from typing import Tuple 15 | from unittest import IsolatedAsyncioTestCase 16 | 17 | import psutil 18 | import pytest 19 | from playwright._impl._errors import TargetClosedError 20 | from playwright.async_api import async_playwright 21 | from scrapy import Request, Spider 22 | 23 | from tests import allow_windows, make_handler, assert_correct_response 24 | from tests.mockserver import StaticMockServer 25 | 26 | 27 | async def _run_chromium_devtools() -> Tuple[subprocess.Popen, str]: 28 | """Run a Chromium instance in a separate process, return the process 29 | object and a string with its devtools endpoint. 30 | """ 31 | async with async_playwright() as playwright: 32 | proc = subprocess.Popen( # pylint: disable=consider-using-with 33 | [playwright.chromium.executable_path, "--headless", "--remote-debugging-port=0"], 34 | text=True, 35 | stdout=subprocess.PIPE, 36 | stderr=subprocess.PIPE, 37 | ) 38 | devtools_url = None 39 | while devtools_url is None: 40 | line = proc.stderr.readline().strip() # type: ignore 41 | if not line: 42 | time.sleep(0.2) 43 | continue 44 | print("browser output:", line) 45 | if match := re.match(r"^DevTools listening on (.+)$", line): 46 | devtools_url = match.group(1) 47 | print("devtools_url:", devtools_url) 48 | return proc, devtools_url 49 | 50 | 51 | def _run_chromium_browser_server() -> Tuple[subprocess.Popen, str]: 52 | """Start a Playwright server in a separate process, return the process 53 | object and a string with its websocket endpoint. 54 | Pass fixed port and ws path as arguments instead of allowing Playwright 55 | to choose, for some reason I was unable to capture stdout/stderr :shrug: 56 | """ 57 | port = str(random.randint(60_000, 63_000)) 58 | ws_path = str(uuid.uuid4()) 59 | launch_server_script_path = str(Path(__file__).parent.parent / "launch_chromium_server.js") 60 | command = ["node", launch_server_script_path, port, ws_path] 61 | proc = subprocess.Popen(command) # pylint: disable=consider-using-with 62 | return proc, f"ws://localhost:{port}/{ws_path}" 63 | 64 | 65 | @asynccontextmanager 66 | async def remote_chromium(with_devtools_protocol: bool = True): 67 | """Launch a remote browser that lasts while in the context.""" 68 | proc = url = None 69 | try: 70 | if with_devtools_protocol: 71 | proc, url = await _run_chromium_devtools() 72 | else: 73 | proc, url = _run_chromium_browser_server() 74 | await asyncio.sleep(1) # allow some time for the browser to start 75 | except Exception: 76 | pass 77 | else: 78 | print(f"Browser URL: {url}") 79 | yield url 80 | finally: 81 | if proc: 82 | proc.kill() 83 | proc.communicate() 84 | 85 | 86 | class TestBrowserRemoteChromium(IsolatedAsyncioTestCase): 87 | @pytest.fixture(autouse=True) 88 | def inject_fixtures(self, caplog): 89 | caplog.set_level(logging.DEBUG) 90 | self._caplog = caplog 91 | 92 | @allow_windows 93 | async def test_connect_devtools(self): 94 | async with remote_chromium(with_devtools_protocol=True) as devtools_url: 95 | settings_dict = { 96 | "PLAYWRIGHT_BROWSER_TYPE": "chromium", 97 | "PLAYWRIGHT_CDP_URL": devtools_url, 98 | "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True}, 99 | } 100 | async with make_handler(settings_dict) as handler: 101 | with StaticMockServer() as server: 102 | req = Request(server.urljoin("/index.html"), meta={"playwright": True}) 103 | resp = await handler._download_request(req, Spider("foo")) 104 | assert_correct_response(resp, req) 105 | assert ( 106 | "scrapy-playwright", 107 | logging.WARNING, 108 | "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS", 109 | ) in self._caplog.record_tuples 110 | 111 | @allow_windows 112 | async def test_connect(self): 113 | async with remote_chromium(with_devtools_protocol=False) as browser_url: 114 | settings_dict = { 115 | "PLAYWRIGHT_BROWSER_TYPE": "chromium", 116 | "PLAYWRIGHT_CONNECT_URL": browser_url, 117 | "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True}, 118 | } 119 | async with make_handler(settings_dict) as handler: 120 | with StaticMockServer() as server: 121 | req = Request(server.urljoin("/index.html"), meta={"playwright": True}) 122 | resp = await handler._download_request(req, Spider("foo")) 123 | assert_correct_response(resp, req) 124 | assert ( 125 | "scrapy-playwright", 126 | logging.INFO, 127 | "Connecting to remote Playwright", 128 | ) in self._caplog.record_tuples 129 | assert ( 130 | "scrapy-playwright", 131 | logging.INFO, 132 | "Connected to remote Playwright", 133 | ) in self._caplog.record_tuples 134 | assert ( 135 | "scrapy-playwright", 136 | logging.WARNING, 137 | "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS", 138 | ) in self._caplog.record_tuples 139 | 140 | 141 | class TestBrowserReconnectChromium(IsolatedAsyncioTestCase): 142 | @pytest.fixture(autouse=True) 143 | def inject_fixtures(self, caplog): 144 | caplog.set_level(logging.DEBUG) 145 | self._caplog = caplog 146 | 147 | @staticmethod 148 | def kill_chrome(): 149 | for proc in psutil.process_iter(["pid", "name"]): 150 | if proc.info["name"].lower() in ("chrome", "chromium"): 151 | os.kill(proc.info["pid"], signal.SIGKILL) 152 | 153 | @allow_windows 154 | async def test_browser_closed_restart(self): 155 | spider = Spider("foo") 156 | async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler: 157 | with StaticMockServer() as server: 158 | req1 = Request( 159 | server.urljoin("/index.html"), 160 | meta={"playwright": True, "playwright_include_page": True}, 161 | ) 162 | resp1 = await handler._download_request(req1, spider) 163 | page = resp1.meta["playwright_page"] 164 | await page.context.browser.close() 165 | req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True}) 166 | resp2 = await handler._download_request(req2, spider) 167 | assert_correct_response(resp1, req1) 168 | assert_correct_response(resp2, req2) 169 | assert ( 170 | self._caplog.record_tuples.count( 171 | ( 172 | "scrapy-playwright", 173 | logging.DEBUG, 174 | "Browser disconnected", 175 | ) 176 | ) 177 | == 2 # one mid-crawl after calling Browser.close() manually, one at the end 178 | ) 179 | assert ( 180 | self._caplog.record_tuples.count( 181 | ( 182 | "scrapy-playwright", 183 | logging.INFO, 184 | "Launching browser chromium", 185 | ) 186 | ) 187 | == 2 # one at the beginning, one after calling Browser.close() manually 188 | ) 189 | 190 | @pytest.mark.skipif( 191 | platform.system() == "Windows", 192 | reason="os.kill does not work as expected on Windows", 193 | ) 194 | async def test_browser_crashed_restart(self): 195 | spider = Spider("foo") 196 | async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler: 197 | with StaticMockServer() as server: 198 | req1 = Request( 199 | server.urljoin("/index.html"), 200 | meta={"playwright": True, "playwright_include_page": True}, 201 | ) 202 | resp1 = await handler._download_request(req1, spider) 203 | thread = Thread(target=self.kill_chrome, daemon=True) 204 | thread.start() 205 | req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True}) 206 | req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True}) 207 | req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True}) 208 | resp2 = await handler._download_request(req2, spider) 209 | resp3 = await handler._download_request(req3, spider) 210 | resp4 = await handler._download_request(req4, spider) 211 | thread.join() 212 | assert_correct_response(resp1, req1) 213 | assert_correct_response(resp2, req2) 214 | assert_correct_response(resp3, req3) 215 | assert_correct_response(resp4, req4) 216 | assert ( 217 | self._caplog.record_tuples.count( 218 | ( 219 | "scrapy-playwright", 220 | logging.DEBUG, 221 | "Browser disconnected", 222 | ) 223 | ) 224 | == 2 # one mid-crawl after killing the browser process, one at the end 225 | ) 226 | assert ( 227 | self._caplog.record_tuples.count( 228 | ( 229 | "scrapy-playwright", 230 | logging.INFO, 231 | "Launching browser chromium", 232 | ) 233 | ) 234 | == 2 # one at the beginning, one after killing the broser process 235 | ) 236 | 237 | @pytest.mark.skipif( 238 | platform.system() == "Windows", 239 | reason="os.kill does not work as expected on Windows", 240 | ) 241 | async def test_browser_crashed_do_not_restart(self): 242 | spider = Spider("foo") 243 | settings_dict = { 244 | "PLAYWRIGHT_BROWSER_TYPE": "chromium", 245 | "PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER": False, 246 | } 247 | async with make_handler(settings_dict=settings_dict) as handler: 248 | with StaticMockServer() as server: 249 | await asyncio.sleep(1) # allow time for the browser to fully launch 250 | req1 = Request( 251 | server.urljoin("/index.html"), 252 | meta={"playwright": True, "playwright_include_page": True}, 253 | ) 254 | resp1 = await handler._download_request(req1, spider) 255 | assert_correct_response(resp1, req1) 256 | thread = Thread(target=self.kill_chrome, daemon=True) 257 | thread.start() 258 | req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True}) 259 | req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True}) 260 | req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True}) 261 | with pytest.raises(TargetClosedError): 262 | await handler._download_request(req2, spider) 263 | await handler._download_request(req3, spider) 264 | await handler._download_request(req4, spider) 265 | thread.join() 266 | -------------------------------------------------------------------------------- /tests/tests_asyncio/test_browser_contexts.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import platform 3 | import tempfile 4 | from pathlib import Path 5 | from unittest import IsolatedAsyncioTestCase 6 | from uuid import uuid4 7 | 8 | import pytest 9 | from playwright.async_api import Browser, TimeoutError as PlaywrightTimeoutError 10 | from scrapy import Spider, Request 11 | from scrapy_playwright.page import PageMethod 12 | 13 | from tests import allow_windows, make_handler 14 | from tests.mockserver import StaticMockServer 15 | 16 | 17 | class MixinTestCaseMultipleContexts: 18 | @allow_windows 19 | async def test_context_kwargs(self): 20 | settings_dict = { 21 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 22 | "PLAYWRIGHT_CONTEXTS": { 23 | "default": {"java_script_enabled": False}, 24 | }, 25 | } 26 | async with make_handler(settings_dict) as handler: 27 | with StaticMockServer() as server: 28 | req = Request( 29 | url=server.urljoin("/scroll.html"), 30 | meta={ 31 | "playwright": True, 32 | "playwright_page_methods": [ 33 | # cause a timeout by waiting on an element that is rendered with js 34 | PageMethod("wait_for_selector", selector="div.quote", timeout=1000), 35 | ], 36 | }, 37 | ) 38 | with pytest.raises(PlaywrightTimeoutError): 39 | await handler._download_request(req, Spider("foo")) 40 | 41 | @allow_windows 42 | async def test_contexts_max_pages(self): 43 | settings = { 44 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 45 | "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1, 46 | "PLAYWRIGHT_CONTEXTS": { 47 | "a": {"java_script_enabled": True}, 48 | "b": {"java_script_enabled": True}, 49 | }, 50 | } 51 | async with make_handler(settings) as handler: 52 | with StaticMockServer() as server: 53 | requests = [ 54 | handler._download_request( 55 | Request( 56 | server.urljoin(f"/index.html?a={i}"), 57 | meta={"playwright": True, "playwright_context": "a"}, 58 | ), 59 | Spider("foo"), 60 | ) 61 | for i in range(20) 62 | ] + [ 63 | handler._download_request( 64 | Request( 65 | server.urljoin(f"/index.html?b={i}"), 66 | meta={"playwright": True, "playwright_context": "b"}, 67 | ), 68 | Spider("foo"), 69 | ) 70 | for i in range(20) 71 | ] 72 | await asyncio.gather(*requests) 73 | 74 | assert handler.stats.get_value("playwright/page_count/max_concurrent") == 2 75 | 76 | @allow_windows 77 | async def test_max_contexts(self): 78 | def cb_close_context(task): 79 | response = task.result() 80 | asyncio.create_task(response.meta["playwright_page"].context.close()) 81 | 82 | settings = { 83 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 84 | "PLAYWRIGHT_MAX_CONTEXTS": 4, 85 | } 86 | async with make_handler(settings) as handler: 87 | with StaticMockServer() as server: 88 | tasks = [] 89 | for i in range(20): 90 | request = Request( 91 | url=server.urljoin(f"/index.html?a={i}"), 92 | callback=cb_close_context, 93 | meta={ 94 | "playwright": True, 95 | "playwright_include_page": True, 96 | "playwright_context": f"ctx-{i}", 97 | }, 98 | ) 99 | coro = handler._download_request( 100 | request=request, 101 | spider=Spider("foo"), 102 | ) 103 | # callbacks are not invoked at the download handler, call them explicitly 104 | task = asyncio.create_task(coro) 105 | task.add_done_callback(request.callback) 106 | tasks.append(task) 107 | await asyncio.gather(*tasks) 108 | 109 | assert handler.stats.get_value("playwright/context_count/max_concurrent") == 4 110 | 111 | @allow_windows 112 | async def test_contexts_startup(self): 113 | settings = { 114 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 115 | "PLAYWRIGHT_CONTEXTS": { 116 | "first": { 117 | "storage_state": { 118 | "cookies": [ 119 | { 120 | "url": "https://example.org", 121 | "name": "foo", 122 | "value": "bar", 123 | }, 124 | ], 125 | }, 126 | }, 127 | }, 128 | } 129 | async with make_handler(settings) as handler: 130 | assert len(handler.context_wrappers) == 1 131 | 132 | with StaticMockServer() as server: 133 | meta = { 134 | "playwright": True, 135 | "playwright_include_page": True, 136 | "playwright_context": "first", 137 | } 138 | req = Request(server.urljoin("/index.html"), meta=meta) 139 | resp = await handler._download_request(req, Spider("foo")) 140 | 141 | page = resp.meta["playwright_page"] 142 | storage_state = await page.context.storage_state() 143 | await page.close() 144 | await page.context.close() 145 | cookie = storage_state["cookies"][0] 146 | assert cookie["name"] == "foo" 147 | assert cookie["value"] == "bar" 148 | assert cookie["domain"] == "example.org" 149 | 150 | @allow_windows 151 | async def test_persistent_context(self): 152 | temp_dir = f"{tempfile.gettempdir()}/{uuid4()}" 153 | settings = { 154 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 155 | "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 3000, 156 | "PLAYWRIGHT_CONTEXTS": { 157 | "persistent": { 158 | "user_data_dir": temp_dir, 159 | }, 160 | }, 161 | } 162 | assert not Path(temp_dir).exists() 163 | async with make_handler(settings) as handler: 164 | assert Path(temp_dir).is_dir() 165 | assert len(handler.context_wrappers) == 1 166 | assert handler.context_wrappers["persistent"].persistent 167 | assert not hasattr(handler, "browser") 168 | 169 | @allow_windows 170 | async def test_mixed_persistent_contexts(self): 171 | temp_dir = f"{tempfile.gettempdir()}/{uuid4()}" 172 | settings = { 173 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 174 | "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 3000, 175 | "PLAYWRIGHT_CONTEXTS": { 176 | "persistent": { 177 | "user_data_dir": temp_dir, 178 | }, 179 | "non-persistent": { 180 | "java_script_enabled": False, 181 | }, 182 | }, 183 | } 184 | assert not Path(temp_dir).exists() 185 | async with make_handler(settings) as handler: 186 | assert Path(temp_dir).is_dir() 187 | assert len(handler.context_wrappers) == 2 188 | assert handler.context_wrappers["persistent"].persistent 189 | assert not handler.context_wrappers["non-persistent"].persistent 190 | assert isinstance(handler.browser, Browser) 191 | 192 | @allow_windows 193 | async def test_contexts_dynamic(self): 194 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 195 | assert len(handler.context_wrappers) == 0 196 | 197 | with StaticMockServer() as server: 198 | meta = { 199 | "playwright": True, 200 | "playwright_include_page": True, 201 | "playwright_context": "new", 202 | "playwright_context_kwargs": { 203 | "storage_state": { 204 | "cookies": [ 205 | { 206 | "url": "https://example.org", 207 | "name": "asdf", 208 | "value": "qwerty", 209 | }, 210 | ], 211 | }, 212 | }, 213 | } 214 | req = Request(server.urljoin("/index.html"), meta=meta) 215 | resp = await handler._download_request(req, Spider("foo")) 216 | 217 | assert len(handler.context_wrappers) == 1 218 | 219 | page = resp.meta["playwright_page"] 220 | storage_state = await page.context.storage_state() 221 | await page.close() 222 | cookie = storage_state["cookies"][0] 223 | assert cookie["name"] == "asdf" 224 | assert cookie["value"] == "qwerty" 225 | assert cookie["domain"] == "example.org" 226 | 227 | 228 | class TestCaseMultipleContextsChromium(IsolatedAsyncioTestCase, MixinTestCaseMultipleContexts): 229 | browser_type = "chromium" 230 | 231 | 232 | class TestCaseMultipleContextsFirefox(IsolatedAsyncioTestCase, MixinTestCaseMultipleContexts): 233 | browser_type = "firefox" 234 | 235 | 236 | @pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin") 237 | class TestCaseMultipleContextsWebkit(IsolatedAsyncioTestCase, MixinTestCaseMultipleContexts): 238 | browser_type = "webkit" 239 | -------------------------------------------------------------------------------- /tests/tests_asyncio/test_extensions.py: -------------------------------------------------------------------------------- 1 | import platform 2 | from asyncio.subprocess import Process as AsyncioProcess 3 | from unittest import IsolatedAsyncioTestCase 4 | from unittest.mock import MagicMock, patch 5 | 6 | import pytest 7 | from playwright.async_api import PlaywrightContextManager 8 | from scrapy.exceptions import NotConfigured 9 | from scrapy.extensions.memusage import MemoryUsage 10 | 11 | from scrapy_playwright.memusage import ScrapyPlaywrightMemoryUsageExtension 12 | from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler 13 | 14 | 15 | SCHEMA_PID_MAP = {"http": 123, "https": 456} 16 | 17 | 18 | def mock_crawler_with_handlers() -> dict: 19 | handlers = {} 20 | for schema, pid in SCHEMA_PID_MAP.items(): 21 | process = MagicMock() 22 | process.pid = pid 23 | handlers[schema] = MagicMock(spec=ScrapyPlaywrightDownloadHandler) 24 | handlers[schema].playwright_context_manager._connection._transport._proc = process 25 | crawler = MagicMock() 26 | crawler.engine.downloader.handlers._handlers = handlers 27 | return crawler 28 | 29 | 30 | def raise_import_error(*args, **kwargs): 31 | raise ImportError 32 | 33 | 34 | class MockMemoryInfo: 35 | rss = 999 36 | 37 | 38 | @pytest.mark.skipif( 39 | platform.system() == "Windows", 40 | reason="resource stdlib module is not available on Windows", 41 | ) 42 | @patch("scrapy.extensions.memusage.MailSender") 43 | class TestMemoryUsageExtension(IsolatedAsyncioTestCase): 44 | async def test_process_availability(self, _MailSender): 45 | """The main node process should be accessible from the context manager""" 46 | ctx_manager = PlaywrightContextManager() 47 | await ctx_manager.start() 48 | assert isinstance(ctx_manager._connection._transport._proc, AsyncioProcess) 49 | await ctx_manager.__aexit__() 50 | 51 | @patch("scrapy_playwright.memusage.import_module", side_effect=raise_import_error) 52 | async def test_psutil_not_available_extension_disabled(self, _import_module, _MailSender): 53 | crawler = MagicMock() 54 | with pytest.raises(NotConfigured): 55 | ScrapyPlaywrightMemoryUsageExtension(crawler) 56 | 57 | async def test_get_process_ids_ok(self, _MailSender): 58 | crawler = mock_crawler_with_handlers() 59 | extension = ScrapyPlaywrightMemoryUsageExtension(crawler) 60 | assert extension._get_main_process_ids() == list(SCHEMA_PID_MAP.values()) 61 | 62 | async def test_get_process_ids_error(self, _MailSender): 63 | crawler = mock_crawler_with_handlers() 64 | crawler.engine.downloader.handlers._handlers = MagicMock() 65 | crawler.engine.downloader.handlers._handlers.values.side_effect = raise_import_error 66 | extension = ScrapyPlaywrightMemoryUsageExtension(crawler) 67 | assert extension._get_main_process_ids() == [] 68 | 69 | async def test_get_descendant_processes(self, _MailSender): 70 | p1 = MagicMock() 71 | p2 = MagicMock() 72 | p3 = MagicMock() 73 | p4 = MagicMock() 74 | p2.children.return_value = [p3, p4] 75 | p1.children.return_value = [p2] 76 | crawler = MagicMock() 77 | extension = ScrapyPlaywrightMemoryUsageExtension(crawler) 78 | assert extension._get_descendant_processes(p1) == [p2, p3, p4] 79 | 80 | async def test_get_total_process_size(self, _MailSender): 81 | crawler = MagicMock() 82 | extension = ScrapyPlaywrightMemoryUsageExtension(crawler) 83 | extension.psutil = MagicMock() 84 | extension.psutil.Process.return_value.memory_info.return_value = MockMemoryInfo() 85 | extension._get_main_process_ids = MagicMock(return_value=[1, 2, 3]) 86 | expected_size = MockMemoryInfo().rss * len(extension._get_main_process_ids()) 87 | assert extension._get_total_playwright_process_memory() == expected_size 88 | 89 | async def test_get_virtual_size_sum(self, _MailSender): 90 | crawler = MagicMock() 91 | extension = ScrapyPlaywrightMemoryUsageExtension(crawler) 92 | parent_cls_extension = MemoryUsage(crawler) 93 | extension._get_total_playwright_process_memory = MagicMock(return_value=123) 94 | assert extension.get_virtual_size() == parent_cls_extension.get_virtual_size() + 123 95 | -------------------------------------------------------------------------------- /tests/tests_asyncio/test_headers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import platform 4 | import warnings 5 | from unittest import IsolatedAsyncioTestCase 6 | 7 | import pytest 8 | from scrapy import Spider, Request 9 | 10 | from tests import allow_windows, make_handler 11 | from tests.mockserver import MockServer 12 | 13 | 14 | class MixinProcessHeadersTestCase: 15 | @pytest.fixture(autouse=True) 16 | def inject_fixtures(self, caplog): 17 | caplog.set_level(logging.DEBUG) 18 | self._caplog = caplog 19 | 20 | @allow_windows 21 | async def test_user_agent(self): 22 | settings_dict = { 23 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 24 | "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, 25 | "USER_AGENT": None, 26 | } 27 | async with make_handler(settings_dict) as handler: 28 | with MockServer() as server: 29 | # if Scrapy's user agent is None, use the one from the Browser 30 | req = Request( 31 | url=server.urljoin("/headers"), 32 | meta={"playwright": True}, 33 | ) 34 | resp = await handler._download_request(req, Spider("foo")) 35 | headers = json.loads(resp.css("pre::text").get()) 36 | headers = {key.lower(): value for key, value in headers.items()} 37 | assert headers["user-agent"] == self.browser_type 38 | 39 | # if Scrapy's user agent is set to some value, use it 40 | req = Request( 41 | url=server.urljoin("/headers"), 42 | meta={"playwright": True}, 43 | headers={"User-Agent": "foobar"}, 44 | ) 45 | resp = await handler._download_request(req, Spider("foo")) 46 | headers = json.loads(resp.css("pre::text").get()) 47 | headers = {key.lower(): value for key, value in headers.items()} 48 | assert headers["user-agent"] == "foobar" 49 | 50 | @allow_windows 51 | async def test_playwright_headers(self): 52 | """Ignore Scrapy headers""" 53 | settings_dict = { 54 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 55 | "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, 56 | "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None, 57 | "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000, 58 | } 59 | async with make_handler(settings_dict) as handler: 60 | with MockServer() as server: 61 | req = Request( 62 | url=server.urljoin("/headers"), 63 | meta={"playwright": True}, 64 | headers={"User-Agent": "foobar", "Asdf": "qwerty"}, 65 | ) 66 | resp = await handler._download_request(req, Spider("foo")) 67 | headers = json.loads(resp.css("pre::text").get()) 68 | headers = {key.lower(): value for key, value in headers.items()} 69 | assert headers["user-agent"] == self.browser_type 70 | assert req.headers["user-agent"].decode("utf-8") == self.browser_type 71 | assert "asdf" not in headers 72 | assert "asdf" not in req.headers 73 | assert b"asdf" not in req.headers 74 | 75 | @allow_windows 76 | async def test_use_custom_headers_ok(self): 77 | """Custom header processing function""" 78 | 79 | async def important_headers( 80 | browser_type_name, # pylint: disable=unused-argument 81 | playwright_request, # pylint: disable=unused-argument 82 | scrapy_request_data, # pylint: disable=unused-argument 83 | ) -> dict: 84 | return {"foo": "bar"} 85 | 86 | settings_dict = { 87 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 88 | "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, 89 | "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers, 90 | } 91 | async with make_handler(settings_dict) as handler: 92 | with MockServer() as server: 93 | req = Request( 94 | url=server.urljoin("/headers"), 95 | meta={"playwright": True}, 96 | headers={"User-Agent": "foobar", "Asdf": "qwerty"}, 97 | ) 98 | with warnings.catch_warnings(record=True) as warning_list: 99 | resp = await handler._download_request(req, Spider("foo")) 100 | assert not warning_list 101 | headers = json.loads(resp.css("pre::text").get()) 102 | headers = {key.lower(): value for key, value in headers.items()} 103 | assert headers["foo"] == "bar" 104 | assert headers.get("user-agent") not in (self.browser_type, "foobar") 105 | assert "asdf" not in headers 106 | 107 | @allow_windows 108 | async def test_use_custom_headers_deprecated_arg_handling(self): 109 | """Custom header processing function that receives deprecated args""" 110 | 111 | async def deprecated_args( 112 | browser_name, pw_req, headers # pylint: disable=unused-argument 113 | ) -> dict: 114 | return {"foo": "bar"} 115 | 116 | settings_dict = { 117 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 118 | "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, 119 | "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": deprecated_args, 120 | } 121 | async with make_handler(settings_dict) as handler: 122 | with MockServer() as server: 123 | req = Request( 124 | url=server.urljoin("/headers"), 125 | meta={"playwright": True}, 126 | headers={"User-Agent": "foobar", "Asdf": "qwerty"}, 127 | ) 128 | with warnings.catch_warnings(record=True) as warning_list: 129 | resp = await handler._download_request(req, Spider("foo")) 130 | headers = json.loads(resp.css("pre::text").get()) 131 | headers = {key.lower(): value for key, value in headers.items()} 132 | assert headers["foo"] == "bar" 133 | assert headers.get("user-agent") not in (self.browser_type, "foobar") 134 | assert "asdf" not in headers 135 | assert str(warning_list[0].message) == ( 136 | "Accepting positional arguments in the function passed to the" 137 | " PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function" 138 | " should accept three (3) keyword arguments instead:" 139 | " browser_type_name: str," 140 | " playwright_request: playwright.async_api.Request," 141 | " scrapy_request_data: dict" 142 | ) 143 | 144 | 145 | class TestProcessHeadersChromium(IsolatedAsyncioTestCase, MixinProcessHeadersTestCase): 146 | browser_type = "chromium" 147 | 148 | 149 | class TestProcessHeadersFirefox(IsolatedAsyncioTestCase, MixinProcessHeadersTestCase): 150 | browser_type = "firefox" 151 | 152 | 153 | @pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin") 154 | class TestProcessHeadersWebkit(IsolatedAsyncioTestCase, MixinProcessHeadersTestCase): 155 | browser_type = "webkit" 156 | -------------------------------------------------------------------------------- /tests/tests_asyncio/test_page_methods.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import platform 3 | import subprocess 4 | from tempfile import NamedTemporaryFile 5 | from unittest import IsolatedAsyncioTestCase 6 | 7 | import pytest 8 | from scrapy import Spider, Request 9 | from scrapy.http.response.html import HtmlResponse 10 | 11 | from playwright.async_api import Page 12 | from scrapy_playwright.page import PageMethod 13 | 14 | from tests import allow_windows, make_handler, assert_correct_response 15 | from tests.mockserver import StaticMockServer 16 | 17 | 18 | def get_mimetype(file): 19 | return subprocess.run( 20 | ["file", "--mime-type", "--brief", file.name], 21 | stdout=subprocess.PIPE, 22 | universal_newlines=True, 23 | check=False, 24 | ).stdout.strip() 25 | 26 | 27 | class TestPageMethods(IsolatedAsyncioTestCase): 28 | @allow_windows 29 | async def test_page_methods(self): 30 | screenshot = PageMethod("screenshot", "foo", 123, path="/tmp/file", type="png") 31 | assert screenshot.method == "screenshot" 32 | assert screenshot.args == ("foo", 123) 33 | assert screenshot.kwargs == {"path": "/tmp/file", "type": "png"} 34 | assert screenshot.result is None 35 | assert str(screenshot) == "" 36 | 37 | 38 | class MixinPageMethodTestCase: 39 | @pytest.fixture(autouse=True) 40 | def inject_fixtures(self, caplog): 41 | caplog.set_level(logging.DEBUG) 42 | self._caplog = caplog 43 | 44 | @allow_windows 45 | async def test_page_non_page_method(self): 46 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 47 | with StaticMockServer() as server: 48 | req = Request( 49 | url=server.urljoin("/index.html"), 50 | meta={ 51 | "playwright": True, 52 | "playwright_page_methods": [ 53 | "not-a-page-method", 54 | 5, 55 | None, 56 | ], 57 | }, 58 | ) 59 | resp = await handler._download_request(req, Spider("foo")) 60 | 61 | assert_correct_response(resp, req) 62 | for obj in req.meta["playwright_page_methods"]: 63 | assert ( 64 | "scrapy-playwright", 65 | logging.WARNING, 66 | f"Ignoring {repr(obj)}: expected PageMethod, got {repr(type(obj))}", 67 | ) in self._caplog.record_tuples 68 | 69 | @allow_windows 70 | async def test_page_mixed_page_methods(self): 71 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 72 | with StaticMockServer() as server: 73 | req = Request( 74 | url=server.urljoin("/index.html"), 75 | meta={ 76 | "playwright": True, 77 | "playwright_page_methods": { 78 | "does_not_exist": PageMethod("does_not_exist"), 79 | "is_closed": PageMethod("is_closed"), # not awaitable 80 | "title": PageMethod("title"), # awaitable 81 | }, 82 | }, 83 | ) 84 | resp = await handler._download_request(req, Spider("foo")) 85 | 86 | assert_correct_response(resp, req) 87 | does_not_exist = req.meta["playwright_page_methods"]["does_not_exist"] 88 | assert ( 89 | "scrapy-playwright", 90 | logging.WARNING, 91 | f"Ignoring {repr(does_not_exist)}: could not find method", 92 | ) in self._caplog.record_tuples 93 | assert not req.meta["playwright_page_methods"]["is_closed"].result 94 | assert req.meta["playwright_page_methods"]["title"].result == "Awesome site" 95 | 96 | @allow_windows 97 | async def test_page_method_navigation(self): 98 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 99 | with StaticMockServer() as server: 100 | req = Request( 101 | url=server.urljoin("/index.html"), 102 | meta={ 103 | "playwright": True, 104 | "playwright_page_methods": [PageMethod("click", "a.lorem_ipsum")], 105 | }, 106 | ) 107 | resp = await handler._download_request(req, Spider("foo")) 108 | 109 | assert isinstance(resp, HtmlResponse) 110 | assert resp.request is req 111 | assert resp.url == server.urljoin("/lorem_ipsum.html") 112 | assert resp.status == 200 113 | assert "playwright" in resp.flags 114 | assert resp.css("title::text").get() == "Lorem Ipsum" 115 | text = resp.css("p::text").get() 116 | assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit." 117 | 118 | @allow_windows 119 | async def test_page_method_infinite_scroll(self): 120 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 121 | with StaticMockServer() as server: 122 | req = Request( 123 | url=server.urljoin("/scroll.html"), 124 | headers={"User-Agent": "scrapy-playwright"}, 125 | meta={ 126 | "playwright": True, 127 | "playwright_page_methods": [ 128 | PageMethod("wait_for_selector", selector="div.quote"), 129 | PageMethod( 130 | "evaluate", "window.scrollBy(0, document.body.scrollHeight)" 131 | ), 132 | PageMethod("wait_for_selector", selector="div.quote:nth-child(11)"), 133 | PageMethod( 134 | "evaluate", "window.scrollBy(0, document.body.scrollHeight)" 135 | ), 136 | PageMethod("wait_for_selector", selector="div.quote:nth-child(21)"), 137 | ], 138 | }, 139 | ) 140 | resp = await handler._download_request(req, Spider("foo")) 141 | 142 | assert_correct_response(resp, req) 143 | assert len(resp.css("div.quote")) == 30 144 | 145 | @allow_windows 146 | async def test_page_method_screenshot(self): 147 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 148 | with NamedTemporaryFile(mode="w+b", delete=False) as png_file: 149 | with StaticMockServer() as server: 150 | req = Request( 151 | url=server.urljoin("/index.html"), 152 | meta={ 153 | "playwright": True, 154 | "playwright_page_methods": { 155 | "png": PageMethod("screenshot", path=png_file.name, type="png"), 156 | }, 157 | }, 158 | ) 159 | await handler._download_request(req, Spider("foo")) 160 | 161 | png_file.file.seek(0) 162 | assert png_file.file.read() == req.meta["playwright_page_methods"]["png"].result 163 | if platform.system() != "Windows": 164 | assert get_mimetype(png_file) == "image/png" 165 | 166 | @allow_windows 167 | async def test_page_method_pdf(self): 168 | if self.browser_type != "chromium": 169 | pytest.skip("PDF generation is supported only in Chromium") 170 | 171 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 172 | with NamedTemporaryFile(mode="w+b", delete=False) as pdf_file: 173 | with StaticMockServer() as server: 174 | req = Request( 175 | url=server.urljoin("/index.html"), 176 | meta={ 177 | "playwright": True, 178 | "playwright_page_methods": { 179 | "pdf": PageMethod("pdf", path=pdf_file.name), 180 | }, 181 | }, 182 | ) 183 | await handler._download_request(req, Spider("foo")) 184 | 185 | pdf_file.file.seek(0) 186 | assert pdf_file.file.read() == req.meta["playwright_page_methods"]["pdf"].result 187 | if platform.system() != "Windows": 188 | assert get_mimetype(pdf_file) == "application/pdf" 189 | 190 | @allow_windows 191 | async def test_page_method_callable(self): 192 | 193 | async def scroll_page(page: Page) -> str: 194 | await page.wait_for_selector(selector="div.quote") 195 | await page.evaluate("window.scrollBy(0, document.body.scrollHeight)") 196 | await page.wait_for_selector(selector="div.quote:nth-child(11)") 197 | await page.evaluate("window.scrollBy(0, document.body.scrollHeight)") 198 | await page.wait_for_selector(selector="div.quote:nth-child(21)") 199 | return page.url 200 | 201 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 202 | with StaticMockServer() as server: 203 | req = Request( 204 | url=server.urljoin("/scroll.html"), 205 | meta={ 206 | "playwright": True, 207 | "playwright_page_methods": { 208 | "callable": PageMethod(scroll_page), 209 | }, 210 | }, 211 | ) 212 | resp = await handler._download_request(req, Spider("foo")) 213 | 214 | assert_correct_response(resp, req) 215 | assert len(resp.css("div.quote")) == 30 216 | assert resp.meta["playwright_page_methods"]["callable"].result == resp.url 217 | 218 | 219 | class TestPageMethodChromium(IsolatedAsyncioTestCase, MixinPageMethodTestCase): 220 | browser_type = "chromium" 221 | 222 | 223 | class TestPageMethodFirefox(IsolatedAsyncioTestCase, MixinPageMethodTestCase): 224 | browser_type = "firefox" 225 | 226 | 227 | @pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin") 228 | class TestPageMethodWebkit(IsolatedAsyncioTestCase, MixinPageMethodTestCase): 229 | browser_type = "webkit" 230 | -------------------------------------------------------------------------------- /tests/tests_asyncio/test_playwright_requests.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import logging 4 | import platform 5 | from ipaddress import ip_address 6 | from unittest import IsolatedAsyncioTestCase 7 | from unittest.mock import AsyncMock, MagicMock, patch 8 | 9 | import pytest 10 | from playwright.async_api import ( 11 | Dialog, 12 | Error as PlaywrightError, 13 | Page as PlaywrightPage, 14 | TimeoutError as PlaywrightTimeoutError, 15 | ) 16 | from scrapy import Spider, Request, FormRequest 17 | 18 | from scrapy_playwright.handler import DEFAULT_CONTEXT_NAME 19 | from scrapy_playwright.page import PageMethod 20 | 21 | from tests import allow_windows, make_handler, assert_correct_response 22 | from tests.mockserver import MockServer, StaticMockServer 23 | 24 | 25 | class DialogSpider(Spider): 26 | """A spider with a method to handle the "dialog" page event""" 27 | 28 | name = "dialog" 29 | 30 | def parse(self, **_kwargs) -> None: 31 | return None 32 | 33 | async def handle_dialog(self, dialog: Dialog) -> None: 34 | self.dialog_message = dialog.message 35 | await dialog.dismiss() 36 | 37 | 38 | class MixinTestCase: 39 | browser_type: str 40 | 41 | @pytest.fixture(autouse=True) 42 | def inject_fixtures(self, caplog): 43 | caplog.set_level(logging.DEBUG) 44 | self._caplog = caplog 45 | 46 | @allow_windows 47 | async def test_basic_response(self): 48 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 49 | with StaticMockServer() as server: 50 | meta = {"playwright": True, "playwright_include_page": True} 51 | req = Request(server.urljoin("/index.html"), meta=meta) 52 | resp = await handler._download_request(req, Spider("foo")) 53 | 54 | assert_correct_response(resp, req) 55 | assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"] 56 | assert isinstance(resp.meta["playwright_page"], PlaywrightPage) 57 | assert resp.meta["playwright_page"].url == resp.url 58 | await resp.meta["playwright_page"].close() 59 | 60 | @allow_windows 61 | async def test_post_request(self): 62 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 63 | with MockServer() as server: 64 | req = FormRequest( 65 | server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"} 66 | ) 67 | resp = await handler._download_request(req, Spider("foo")) 68 | 69 | assert_correct_response(resp, req) 70 | assert "Request body: foo=bar" in resp.text 71 | 72 | @allow_windows 73 | async def test_timeout_error(self): 74 | settings_dict = { 75 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 76 | "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 100, 77 | } 78 | async with make_handler(settings_dict) as handler: 79 | with MockServer() as server: 80 | req = Request(server.urljoin("/headers?delay=1"), meta={"playwright": True}) 81 | with pytest.raises(PlaywrightTimeoutError) as excinfo: 82 | await handler._download_request(req, Spider("foo")) 83 | assert ( 84 | "scrapy-playwright", 85 | logging.WARNING, 86 | f"Closing page due to failed request: {req}" 87 | f" exc_type={type(excinfo.value)} exc_msg={str(excinfo.value)}", 88 | ) in self._caplog.record_tuples 89 | 90 | @allow_windows 91 | async def test_retry_page_content_still_navigating(self): 92 | if self.browser_type != "chromium": 93 | pytest.skip("Only Chromium seems to redirect meta tags within the same goto call") 94 | 95 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 96 | with StaticMockServer() as server: 97 | req = Request(server.urljoin("/redirect.html"), meta={"playwright": True}) 98 | resp = await handler._download_request(req, Spider("foo")) 99 | 100 | assert resp.request is req 101 | assert resp.url == server.urljoin("/index.html") # redirected 102 | assert resp.status == 200 103 | assert "playwright" in resp.flags 104 | assert ( 105 | "scrapy-playwright", 106 | logging.DEBUG, 107 | f"Retrying to get content from page '{req.url}', error: 'Unable to retrieve" 108 | " content because the page is navigating and changing the content.'", 109 | ) in self._caplog.record_tuples 110 | 111 | @patch("scrapy_playwright.handler.logger") 112 | @allow_windows 113 | async def test_route_continue_exception(self, logger): 114 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 115 | scrapy_request = Request(url="https://example.org", method="GET") 116 | spider = Spider("foo") 117 | initial_request_done = asyncio.Event() 118 | req_handler = handler._make_request_handler( 119 | context_name=DEFAULT_CONTEXT_NAME, 120 | method=scrapy_request.method, 121 | url=scrapy_request.url, 122 | headers=scrapy_request.headers, 123 | body=None, 124 | encoding="utf-8", 125 | spider=spider, 126 | initial_request_done=initial_request_done, 127 | ) 128 | route = MagicMock() 129 | playwright_request = AsyncMock() 130 | playwright_request.url = scrapy_request.url 131 | playwright_request.method = scrapy_request.method 132 | playwright_request.is_navigation_request = MagicMock(return_value=True) 133 | playwright_request.all_headers.return_value = {} 134 | 135 | # safe error, only warn 136 | ex = PlaywrightError("Target page, context or browser has been closed") 137 | route.continue_.side_effect = ex 138 | await req_handler(route, playwright_request) 139 | logger.warning.assert_called_with( 140 | "Failed processing Playwright request: <%s %s> exc_type=%s exc_msg=%s", 141 | playwright_request.method, 142 | playwright_request.url, 143 | type(ex), 144 | str(ex), 145 | extra={ 146 | "spider": spider, 147 | "context_name": DEFAULT_CONTEXT_NAME, 148 | "scrapy_request_url": scrapy_request.url, 149 | "scrapy_request_method": scrapy_request.method, 150 | "playwright_request_url": playwright_request.url, 151 | "playwright_request_method": playwright_request.method, 152 | "exception": ex, 153 | }, 154 | exc_info=True, 155 | ) 156 | 157 | # unknown errors, re-raise 158 | route.continue_.side_effect = ZeroDivisionError("asdf") 159 | with pytest.raises(ZeroDivisionError): 160 | await req_handler(route, playwright_request) 161 | route.continue_.side_effect = PlaywrightError("qwerty") 162 | with pytest.raises(PlaywrightError): 163 | await req_handler(route, playwright_request) 164 | 165 | @allow_windows 166 | async def test_event_handler_dialog_callable(self): 167 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 168 | with StaticMockServer() as server: 169 | spider = DialogSpider() 170 | req = Request( 171 | url=server.urljoin("/index.html"), 172 | meta={ 173 | "playwright": True, 174 | "playwright_page_methods": [ 175 | # trigger an alert 176 | PageMethod("evaluate", "alert('foobar');"), 177 | ], 178 | "playwright_page_event_handlers": { 179 | "dialog": spider.handle_dialog, 180 | }, 181 | }, 182 | ) 183 | await handler._download_request(req, spider) 184 | 185 | assert spider.dialog_message == "foobar" 186 | 187 | @allow_windows 188 | async def test_event_handler_dialog_str(self): 189 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 190 | with StaticMockServer() as server: 191 | spider = DialogSpider() 192 | req = Request( 193 | url=server.urljoin("/index.html"), 194 | meta={ 195 | "playwright": True, 196 | "playwright_page_methods": [ 197 | # trigger an alert 198 | PageMethod("evaluate", "alert('foobar');"), 199 | ], 200 | "playwright_page_event_handlers": { 201 | "dialog": "handle_dialog", 202 | }, 203 | }, 204 | ) 205 | await handler._download_request(req, spider) 206 | 207 | assert spider.dialog_message == "foobar" 208 | 209 | @allow_windows 210 | async def test_event_handler_dialog_missing(self): 211 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 212 | with StaticMockServer() as server: 213 | spider = DialogSpider() 214 | req = Request( 215 | url=server.urljoin("/index.html"), 216 | meta={ 217 | "playwright": True, 218 | "playwright_page_event_handlers": { 219 | "dialog": "missing_method", 220 | }, 221 | }, 222 | ) 223 | await handler._download_request(req, spider) 224 | 225 | assert ( 226 | "scrapy-playwright", 227 | logging.WARNING, 228 | "Spider 'dialog' does not have a 'missing_method' attribute," 229 | " ignoring handler for event 'dialog'", 230 | ) in self._caplog.record_tuples 231 | assert getattr(spider, "dialog_message", None) is None 232 | 233 | @allow_windows 234 | async def test_response_attributes(self): 235 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 236 | with MockServer() as server: 237 | req = Request( 238 | url=server.urljoin(), 239 | meta={"playwright": True}, 240 | ) 241 | response = await handler._download_request(req, Spider("spider_name")) 242 | 243 | assert response.ip_address == ip_address(server.address) 244 | 245 | @allow_windows 246 | async def test_page_goto_kwargs_referer(self): 247 | if self.browser_type != "chromium": 248 | pytest.skip("referer as goto kwarg seems to work only with chromium :shrug:") 249 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 250 | with MockServer() as server: 251 | fake_referer = server.urljoin("/fake/referer") 252 | req = Request( 253 | url=server.urljoin("/headers"), 254 | meta={ 255 | "playwright": True, 256 | "playwright_page_goto_kwargs": {"referer": fake_referer}, 257 | }, 258 | ) 259 | response = await handler._download_request(req, Spider("spider_name")) 260 | 261 | headers = json.loads(response.css("pre::text").get()) 262 | assert headers["Referer"] == fake_referer 263 | 264 | @allow_windows 265 | async def test_navigation_returns_none(self): 266 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 267 | with MockServer(): 268 | req = Request(url="about:blank", meta={"playwright": True}) 269 | response = await handler._download_request(req, Spider("spider_name")) 270 | 271 | assert ( 272 | "scrapy-playwright", 273 | logging.WARNING, 274 | f"Navigating to {req!r} returned None, the response" 275 | " will have empty headers and status 200", 276 | ) in self._caplog.record_tuples 277 | assert not response.headers 278 | assert response.status == 200 279 | 280 | @allow_windows 281 | async def test_abort_requests(self): 282 | async def should_abort_request_async(request): 283 | return request.resource_type == "image" 284 | 285 | def should_abort_request_sync(request): 286 | return request.resource_type == "image" 287 | 288 | for predicate in ( 289 | lambda request: request.resource_type == "image", 290 | should_abort_request_async, 291 | should_abort_request_sync, 292 | ): 293 | settings_dict = { 294 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 295 | "PLAYWRIGHT_ABORT_REQUEST": predicate, 296 | } 297 | async with make_handler(settings_dict) as handler: 298 | with StaticMockServer() as server: 299 | req = Request( 300 | url=server.urljoin("/gallery.html"), 301 | meta={"playwright": True}, 302 | ) 303 | await handler._download_request(req, Spider("foo")) 304 | 305 | req_prefix = "playwright/request_count" 306 | resp_prefix = "playwright/response_count" 307 | assert handler.stats.get_value(f"{req_prefix}/resource_type/document") == 1 308 | assert handler.stats.get_value(f"{req_prefix}/resource_type/image") == 3 309 | assert handler.stats.get_value(f"{resp_prefix}/resource_type/document") == 1 310 | assert handler.stats.get_value(f"{resp_prefix}/resource_type/image") is None 311 | assert handler.stats.get_value(f"{req_prefix}/aborted") == 3 312 | 313 | @allow_windows 314 | async def test_page_initialization_ok(self): 315 | async def init_page(page, _request): 316 | await page.set_extra_http_headers({"Extra-Header": "Qwerty"}) 317 | 318 | settings_dict = { 319 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 320 | "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None, 321 | } 322 | async with make_handler(settings_dict) as handler: 323 | with MockServer() as server: 324 | req = Request( 325 | url=server.urljoin("/headers"), 326 | meta={"playwright": True, "playwright_page_init_callback": init_page}, 327 | ) 328 | response = await handler._download_request(req, Spider("spider_name")) 329 | assert response.status == 200 330 | headers = json.loads(response.css("pre::text").get()) 331 | headers = {key.lower(): value for key, value in headers.items()} 332 | assert headers["extra-header"] == "Qwerty" 333 | 334 | @allow_windows 335 | async def test_page_initialization_fail(self): 336 | async def init_page(page, _request, _missing): 337 | await page.set_extra_http_headers({"Extra-Header": "Qwerty"}) 338 | 339 | settings_dict = { 340 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 341 | "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None, 342 | } 343 | async with make_handler(settings_dict) as handler: 344 | with MockServer() as server: 345 | req = Request( 346 | url=server.urljoin("/headers"), 347 | meta={"playwright": True, "playwright_page_init_callback": init_page}, 348 | ) 349 | response = await handler._download_request(req, Spider("spider_name")) 350 | assert response.status == 200 351 | headers = json.loads(response.css("pre::text").get()) 352 | headers = {key.lower(): value for key, value in headers.items()} 353 | assert "extra-header" not in headers 354 | for entry in self._caplog.record_tuples: 355 | if "Page init callback exception for" in entry[2]: 356 | assert entry[0] == "scrapy-playwright" 357 | assert entry[1] == logging.WARNING 358 | assert f"[Context=default] Page init callback exception for {req!r}" in entry[2] 359 | assert "init_page() missing 1 required positional argument: '_missing'" in entry[2] 360 | 361 | @allow_windows 362 | async def test_redirect(self): 363 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 364 | with MockServer() as server: 365 | req = Request( 366 | url=server.urljoin("/redirect2"), 367 | meta={"playwright": True}, 368 | ) 369 | response = await handler._download_request(req, Spider("spider_name")) 370 | 371 | assert response.url == server.urljoin("/headers") 372 | assert response.meta["redirect_times"] == 2 373 | assert response.meta["redirect_reasons"] == [302, 301] 374 | assert response.meta["redirect_urls"] == [ 375 | server.urljoin("/redirect2"), 376 | server.urljoin("/redirect"), 377 | ] 378 | 379 | @allow_windows 380 | async def test_logging_record_spider(self): 381 | """Make sure at least one log record has the spider as an attribute 382 | (records sent before opening the spider will not have it). 383 | """ 384 | spider = Spider("spider_name") 385 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 386 | with MockServer() as server: 387 | req = Request(url=server.urljoin("/index.html"), meta={"playwright": True}) 388 | await handler._download_request(req, spider) 389 | 390 | assert any(getattr(rec, "spider", None) is spider for rec in self._caplog.records) 391 | 392 | @allow_windows 393 | @patch("scrapy_playwright.handler._make_request_logger") 394 | async def test_request_logger_disabled(self, make_request_logger: MagicMock): 395 | self._caplog.set_level(logging.DEBUG + 1, "scrapy-playwright") 396 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 397 | with MockServer() as server: 398 | req = Request(url=server.urljoin("/index.html"), meta={"playwright": True}) 399 | await handler._download_request(req, Spider("foo")) 400 | 401 | debug_message = ( 402 | f"[Context=default] Request: <{req.method} {req.url}> (resource type: document)" 403 | ) 404 | assert not any(rec.message == debug_message for rec in self._caplog.records) 405 | make_request_logger.assert_not_called() 406 | 407 | @allow_windows 408 | async def test_request_logger_enabled(self): 409 | self._caplog.set_level(logging.DEBUG, "scrapy-playwright") 410 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 411 | with MockServer() as server: 412 | req = Request(url=server.urljoin("/index.html"), meta={"playwright": True}) 413 | await handler._download_request(req, Spider("foo")) 414 | 415 | debug_message = ( 416 | f"[Context=default] Request: <{req.method} {req.url}> (resource type: document)" 417 | ) 418 | assert any(rec.message == debug_message for rec in self._caplog.records) 419 | 420 | @allow_windows 421 | @patch("scrapy_playwright.handler._make_response_logger") 422 | async def test_response_logger_disabled(self, make_response_logger: MagicMock): 423 | self._caplog.set_level(logging.DEBUG + 1, "scrapy-playwright") 424 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 425 | with MockServer() as server: 426 | req = Request(url=server.urljoin("/index.html"), meta={"playwright": True}) 427 | response = await handler._download_request(req, Spider("foo")) 428 | 429 | debug_message = f"[Context=default] Response: <{response.status} {response.url}>" 430 | assert not any(rec.message == debug_message for rec in self._caplog.records) 431 | make_response_logger.assert_not_called() 432 | 433 | @allow_windows 434 | async def test_response_logger_enabled(self): 435 | self._caplog.set_level(logging.DEBUG, "scrapy-playwright") 436 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 437 | with MockServer() as server: 438 | request = Request(url=server.urljoin("/index.html"), meta={"playwright": True}) 439 | response = await handler._download_request(request, Spider("foo")) 440 | 441 | debug_message = f"[Context=default] Response: <{response.status} {response.url}>" 442 | assert any(rec.message == debug_message for rec in self._caplog.records) 443 | 444 | @allow_windows 445 | async def test_download_file_ok(self): 446 | settings_dict = { 447 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 448 | } 449 | async with make_handler(settings_dict) as handler: 450 | with MockServer() as server: 451 | request = Request( 452 | url=server.urljoin("mancha.pdf"), 453 | meta={"playwright": True}, 454 | ) 455 | response = await handler._download_request(request, Spider("foo")) 456 | assert response.meta["playwright_suggested_filename"] == "mancha.pdf" 457 | assert response.body.startswith(b"%PDF-1.5") 458 | assert response.headers.get("Content-Type") == b"application/pdf" 459 | assert handler.stats.get_value("playwright/download_count") == 1 460 | 461 | @allow_windows 462 | async def test_download_file_delay_ok(self): 463 | settings_dict = { 464 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 465 | "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0, 466 | } 467 | async with make_handler(settings_dict) as handler: 468 | with MockServer() as server: 469 | request = Request( 470 | url=server.urljoin("/mancha.pdf?delay=1"), 471 | meta={"playwright": True}, 472 | ) 473 | response = await handler._download_request(request, Spider("foo")) 474 | assert response.meta["playwright_suggested_filename"] == "mancha.pdf" 475 | assert response.body.startswith(b"%PDF-1.5") 476 | assert handler.stats.get_value("playwright/download_count") == 1 477 | 478 | @allow_windows 479 | async def test_download_file_delay_error(self): 480 | settings_dict = { 481 | "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, 482 | "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 10, 483 | } 484 | async with make_handler(settings_dict) as handler: 485 | with MockServer() as server: 486 | request = Request( 487 | url=server.urljoin("/mancha.pdf?delay=1"), 488 | meta={"playwright": True}, 489 | ) 490 | with pytest.raises(PlaywrightError) as excinfo: 491 | await handler._download_request(request, Spider("foo")) 492 | assert ( 493 | "scrapy-playwright", 494 | logging.WARNING, 495 | f"Closing page due to failed request: {request}" 496 | f" exc_type={type(excinfo.value)} exc_msg={str(excinfo.value)}", 497 | ) in self._caplog.record_tuples 498 | 499 | @allow_windows 500 | async def test_download_file_failure(self): 501 | if self.browser_type != "chromium": 502 | pytest.skip() 503 | 504 | async def cancel_download(download): 505 | await download.cancel() 506 | 507 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 508 | with MockServer() as server: 509 | request = Request( 510 | url=server.urljoin("/mancha.pdf?content_length_multiplier=1000"), 511 | meta={ 512 | "playwright": True, 513 | "playwright_event_handlers": {"download": cancel_download}, 514 | }, 515 | ) 516 | with pytest.raises(RuntimeError) as excinfo: 517 | await handler._download_request(request, Spider("foo")) 518 | assert ( 519 | "scrapy-playwright", 520 | logging.WARNING, 521 | f"Closing page due to failed request: {request}" 522 | f" exc_type={type(excinfo.value)} exc_msg={str(excinfo.value)}", 523 | ) in self._caplog.record_tuples 524 | 525 | @allow_windows 526 | async def test_fail_status_204(self): 527 | async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler: 528 | with MockServer() as server: 529 | request = Request( 530 | url=server.urljoin("/status/204"), 531 | meta={"playwright": True}, 532 | ) 533 | with pytest.raises(PlaywrightError) as excinfo: 534 | await handler._download_request(request, Spider("foo")) 535 | assert ( 536 | "scrapy-playwright", 537 | logging.WARNING, 538 | f"Closing page due to failed request: {request}" 539 | f" exc_type={type(excinfo.value)} exc_msg={str(excinfo.value)}", 540 | ) in self._caplog.record_tuples 541 | 542 | 543 | class TestCaseChromium(IsolatedAsyncioTestCase, MixinTestCase): 544 | browser_type = "chromium" 545 | 546 | 547 | class TestCaseFirefox(IsolatedAsyncioTestCase, MixinTestCase): 548 | browser_type = "firefox" 549 | 550 | 551 | @pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin") 552 | class TestCaseWebkit(IsolatedAsyncioTestCase, MixinTestCase): 553 | browser_type = "webkit" 554 | -------------------------------------------------------------------------------- /tests/tests_asyncio/test_settings.py: -------------------------------------------------------------------------------- 1 | from unittest import IsolatedAsyncioTestCase 2 | 3 | import pytest 4 | from scrapy.exceptions import NotSupported 5 | from scrapy.settings import Settings 6 | 7 | from scrapy_playwright.handler import Config 8 | 9 | from tests import allow_windows, make_handler 10 | 11 | 12 | class TestSettings(IsolatedAsyncioTestCase): 13 | async def test_settings_timeout_value(self): 14 | config = Config.from_settings(Settings({})) 15 | assert config.navigation_timeout is None 16 | 17 | config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": None})) 18 | assert config.navigation_timeout is None 19 | 20 | config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0})) 21 | assert config.navigation_timeout == 0 22 | 23 | config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 123})) 24 | assert config.navigation_timeout == 123 25 | 26 | config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0.5})) 27 | assert config.navigation_timeout == 0.5 28 | 29 | async def test_max_pages_per_context(self): 30 | config = Config.from_settings(Settings({"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1234})) 31 | assert config.max_pages_per_context == 1234 32 | 33 | config = Config.from_settings(Settings({"CONCURRENT_REQUESTS": 9876})) 34 | assert config.max_pages_per_context == 9876 35 | 36 | async def test_connect_remote_urls(self): 37 | with pytest.raises(NotSupported) as exc_info: 38 | Config.from_settings( 39 | Settings({"PLAYWRIGHT_CONNECT_URL": "asdf", "PLAYWRIGHT_CDP_URL": "qwerty"}) 40 | ) 41 | assert ( 42 | str(exc_info.value) 43 | == "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported" 44 | ) 45 | 46 | @allow_windows 47 | async def test_max_contexts(self): 48 | async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": None}) as handler: 49 | assert not hasattr(handler, "context_semaphore") 50 | 51 | async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": 1234}) as handler: 52 | assert handler.context_semaphore._value == 1234 53 | -------------------------------------------------------------------------------- /tests/tests_asyncio/test_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from decimal import Decimal 3 | from unittest import IsolatedAsyncioTestCase 4 | from unittest.mock import AsyncMock 5 | 6 | import pytest 7 | from playwright.async_api import Error as PlaywrightError 8 | from scrapy import Spider 9 | from scrapy.http.headers import Headers 10 | from scrapy.settings import Settings 11 | from scrapy_playwright._utils import ( 12 | _NAVIGATION_ERROR_MSG, 13 | _encode_body, 14 | _get_float_setting, 15 | _get_header_value, 16 | _get_page_content, 17 | _maybe_await, 18 | ) 19 | 20 | 21 | class TestPageContent(IsolatedAsyncioTestCase): 22 | @pytest.fixture(autouse=True) 23 | def inject_fixtures(self, caplog): 24 | caplog.set_level(logging.DEBUG) 25 | self._caplog = caplog 26 | 27 | async def test_get_page_content_ok(self): 28 | expected_content = "lorem ipsum" 29 | page = AsyncMock() 30 | page.content.return_value = expected_content 31 | content = await _get_page_content( 32 | page=page, 33 | spider=Spider("foo"), 34 | context_name="context", 35 | scrapy_request_url="https://example.org", 36 | scrapy_request_method="GET", 37 | ) 38 | assert content == expected_content 39 | 40 | async def test_get_page_content_retry_known_exception(self): 41 | expected_content = "lorem ipsum" 42 | page = AsyncMock() 43 | page.url = "FAKE URL" 44 | page.content.side_effect = [PlaywrightError(_NAVIGATION_ERROR_MSG), expected_content] 45 | content = await _get_page_content( 46 | page=page, 47 | spider=Spider("foo"), 48 | context_name="context", 49 | scrapy_request_url="https://example.org", 50 | scrapy_request_method="GET", 51 | ) 52 | assert content == expected_content 53 | assert ( 54 | "scrapy-playwright", 55 | logging.DEBUG, 56 | f"Retrying to get content from page '{page.url}', error: 'Unable to retrieve" 57 | " content because the page is navigating and changing the content.'", 58 | ) in self._caplog.record_tuples 59 | 60 | async def test_get_page_content_reraise_unknown_exception(self): 61 | expected_exception_message = "nope" 62 | page = AsyncMock() 63 | page.content.side_effect = PlaywrightError(expected_exception_message) 64 | with pytest.raises(PlaywrightError, match=expected_exception_message): 65 | await _get_page_content( 66 | page=page, 67 | spider=Spider("foo"), 68 | context_name="context", 69 | scrapy_request_url="https://example.org", 70 | scrapy_request_method="GET", 71 | ) 72 | 73 | 74 | class TestBodyEncoding(IsolatedAsyncioTestCase): 75 | @staticmethod 76 | def body_str(charset: str, content: str = "áéíóú") -> str: 77 | return f""" 78 | 79 | 80 | 81 | 82 | 83 | 84 |

{content}

85 | 86 | 87 | """.strip() 88 | 89 | async def test_encode_from_headers(self): 90 | """Charset declared in headers takes precedence""" 91 | text = self.body_str(charset="gb2312") 92 | body, encoding = _encode_body( 93 | headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}), 94 | text=text, 95 | ) 96 | assert encoding == "cp1252" 97 | assert body == text.encode(encoding) 98 | 99 | async def test_encode_from_body(self): 100 | """No charset declared in headers, use the one declared in the body""" 101 | text = self.body_str(charset="gb2312") 102 | body, encoding = _encode_body(headers=Headers({}), text=text) 103 | assert encoding == "gb18030" 104 | assert body == text.encode(encoding) 105 | 106 | async def test_encode_fallback_utf8(self): 107 | """No charset declared, use utf-8 as fallback""" 108 | text = "áéíóú" 109 | body, encoding = _encode_body(headers=Headers(), text=text) 110 | assert encoding == "utf-8" 111 | assert body == text.encode(encoding) 112 | 113 | async def test_encode_mismatch(self): 114 | """Charset declared in headers and body do not match, and the headers 115 | one fails to encode: use the one in the body (first one that works) 116 | """ 117 | text = self.body_str(charset="gb2312", content="空手道") 118 | body, encoding = _encode_body( 119 | headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}), 120 | text=text, 121 | ) 122 | assert encoding == "gb18030" 123 | assert body == text.encode(encoding) 124 | 125 | 126 | class TestHeaderValue(IsolatedAsyncioTestCase): 127 | async def test_get_header_value(self): 128 | async def _identity(x): 129 | return x 130 | 131 | res1 = AsyncMock() 132 | res1.header_value = _identity 133 | assert "asdf" == await _get_header_value(res1, "asdf") 134 | assert "qwerty" == await _get_header_value(res1, "qwerty") 135 | 136 | res2 = AsyncMock() 137 | res2.header_value.side_effect = Exception("nope") 138 | assert await _get_header_value(res2, "asdf") is None 139 | assert await _get_header_value(res2, "qwerty") is None 140 | 141 | 142 | class TestMaybeAwait(IsolatedAsyncioTestCase): 143 | async def test_maybe_await(self): 144 | async def _awaitable_identity(x): 145 | return x 146 | 147 | assert await _maybe_await(_awaitable_identity("asdf")) == "asdf" 148 | assert await _maybe_await(_awaitable_identity("qwerty")) == "qwerty" 149 | assert await _maybe_await(_awaitable_identity(1234)) == 1234 150 | assert await _maybe_await("foo") == "foo" 151 | assert await _maybe_await("bar") == "bar" 152 | assert await _maybe_await(1234) == 1234 153 | 154 | 155 | class TestGetFloatSetting(IsolatedAsyncioTestCase): 156 | async def test_get_float_setting(self): 157 | settings = Settings( 158 | { 159 | "ZERO": 0, 160 | "FLOAT": 1.5, 161 | "DECIMAL": Decimal("2.5"), 162 | "INT": 3, 163 | "NUMERIC_STRING": "123", 164 | "NON_NUMERIC_STRING": "asdf", 165 | "NONE": None, 166 | "LIST": [1, 2, 3], 167 | } 168 | ) 169 | assert _get_float_setting(settings, "ZERO") == 0.0 170 | assert _get_float_setting(settings, "FLOAT") == 1.5 171 | assert _get_float_setting(settings, "DECIMAL") == 2.5 172 | assert _get_float_setting(settings, "INT") == 3.0 173 | assert _get_float_setting(settings, "NUMERIC_STRING") == 123 174 | assert _get_float_setting(settings, "NON_NUMERIC_STRING") is None 175 | assert _get_float_setting(settings, "NONE") is None 176 | assert _get_float_setting(settings, "LIST") is None 177 | assert _get_float_setting(settings, "MISSING_KEY") is None 178 | -------------------------------------------------------------------------------- /tests/tests_twisted/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/tests_twisted/__init__.py -------------------------------------------------------------------------------- /tests/tests_twisted/test_mixed_requests.py: -------------------------------------------------------------------------------- 1 | from scrapy import Spider 2 | from scrapy.http import Request, Response 3 | from scrapy.utils.test import get_crawler 4 | from twisted.internet import defer 5 | from twisted.trial.unittest import TestCase 6 | 7 | from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler 8 | from tests.mockserver import StaticMockServer 9 | 10 | 11 | class MixedRequestsTestCase(TestCase): 12 | """ 13 | This test case ensures the handler's 'download_request' method works as expected, and 14 | non-playwright requests are processed correctly. The rest of the tests directly call 15 | '_download_request', which is a coroutine ('download_request' returns a Deferred). 16 | """ 17 | 18 | timeout_ms = 500 19 | 20 | @defer.inlineCallbacks 21 | def setUp(self): 22 | self.server = StaticMockServer() 23 | self.server.__enter__() 24 | self.handler = ScrapyPlaywrightDownloadHandler.from_crawler( 25 | get_crawler(settings_dict={"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": self.timeout_ms}) 26 | ) 27 | yield self.handler._engine_started() 28 | 29 | @defer.inlineCallbacks 30 | def tearDown(self): 31 | self.server.__exit__(None, None, None) 32 | yield self.handler.close() 33 | 34 | @defer.inlineCallbacks 35 | def test_download_request(self): 36 | def _check_regular(response, request): 37 | self.assertIsInstance(response, Response) 38 | self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"]) 39 | self.assertEqual(response.url, request.url) 40 | self.assertEqual(response.status, 200) 41 | self.assertNotIn("playwright", response.flags) 42 | 43 | def _check_playwright_ok(response, request): 44 | self.assertIsInstance(response, Response) 45 | self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"]) 46 | self.assertEqual(response.url, request.url) 47 | self.assertEqual(response.status, 200) 48 | self.assertIn("playwright", response.flags) 49 | 50 | def _check_playwright_error(failure, url): 51 | # different errors depending on the platform 52 | self.assertTrue( 53 | f"Page.goto: net::ERR_CONNECTION_REFUSED at {url}" in str(failure.value) 54 | or f"Page.goto: Timeout {self.timeout_ms}ms exceeded" in str(failure.value) 55 | ) 56 | 57 | req1 = Request(self.server.urljoin("/index.html")) 58 | yield self.handler.download_request(req1, Spider("foo")).addCallback( 59 | _check_regular, request=req1 60 | ) 61 | 62 | req2 = Request(self.server.urljoin("/index.html"), meta={"playwright": True}) 63 | yield self.handler.download_request(req2, Spider("foo")).addCallback( 64 | _check_playwright_ok, request=req2 65 | ) 66 | 67 | req3 = Request("http://localhost:12345/asdf", meta={"playwright": True}) 68 | yield self.handler.download_request(req3, Spider("foo")).addErrback( 69 | _check_playwright_error, url=req3.url 70 | ) 71 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = bandit,black,flake8,typing,pylint,py,py-twisted 3 | 4 | [testenv] 5 | deps = 6 | pytest==7.4.0 7 | pytest_cov==4.1.0 8 | pytest_twisted==1.14 9 | psutil==5.9.7 10 | playwright==1.44 # version must match the one installed with npm below 11 | allowlist_externals = 12 | npm 13 | npx 14 | commands = 15 | playwright install --with-deps 16 | npm install playwright@1.44 17 | npx playwright install chromium 18 | py.test -vv --reactor=asyncio \ 19 | --cov-report=term-missing \ 20 | --cov-report=xml:coverage-asyncio.xml \ 21 | --cov-report=html:coverage-asyncio \ 22 | --cov=scrapy_playwright {posargs: scrapy_playwright tests/tests_asyncio} 23 | setenv = 24 | DEBUG=pw:api 25 | 26 | [testenv:py] 27 | basepython = python3 28 | 29 | [testenv:py-twisted] 30 | basepython = python3 31 | commands = 32 | playwright install --with-deps 33 | py.test -vv --reactor=asyncio \ 34 | --cov-report=term-missing \ 35 | --cov-report=xml:coverage-twisted.xml \ 36 | --cov-report=html:coverage-twisted \ 37 | --cov=scrapy_playwright {posargs: scrapy_playwright tests/tests_twisted} 38 | 39 | [testenv:bandit] 40 | deps = 41 | bandit 42 | commands = 43 | bandit -r {posargs: scrapy_playwright setup.py examples} 44 | 45 | [testenv:black] 46 | deps = 47 | black==24.4.2 48 | commands = 49 | black --check {posargs: scrapy_playwright setup.py tests examples} 50 | 51 | [testenv:flake8] 52 | deps = 53 | flake8==7.0.0 54 | commands = 55 | flake8 --exclude=.git,.tox,venv* {posargs: scrapy_playwright setup.py tests examples} 56 | 57 | [testenv:typing] 58 | deps = 59 | mypy==1.10.0 60 | commands = 61 | mypy --show-error-codes --ignore-missing-imports \ 62 | --follow-imports=skip {posargs: scrapy_playwright setup.py tests examples} 63 | 64 | [testenv:pylint] 65 | deps = 66 | psutil==5.9.7 67 | pylint==3.2.2 68 | pytest==7.4.0 69 | commands = 70 | pip install -e . 71 | pylint {posargs: scrapy_playwright setup.py tests} 72 | --------------------------------------------------------------------------------