├── .bumpversion.cfg
├── .github
    └── workflows
    │   ├── checks.yml
    │   ├── publish.yml
    │   └── tests.yml
├── .gitignore
├── LICENSE
├── README.md
├── docs
    ├── changelog.md
    └── faq.md
├── examples
    ├── .gitignore
    ├── books.py
    ├── books
    │   └── .gitignore
    ├── contexts.py
    ├── download.py
    ├── events.py
    ├── exception_errback.py
    ├── exception_middleware.py
    ├── headers.py
    ├── init_page.py
    ├── max_pages.py
    ├── post.py
    ├── scroll.py
    └── storage.py
├── pylintrc
├── pyproject.toml
├── scrapy_playwright
    ├── __init__.py
    ├── _utils.py
    ├── handler.py
    ├── headers.py
    ├── memusage.py
    └── page.py
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── conftest.py
    ├── launch_chromium_server.js
    ├── mockserver.py
    ├── site
    │   ├── data
    │   │   ├── quotes1.json
    │   │   ├── quotes2.json
    │   │   └── quotes3.json
    │   ├── files
    │   │   └── mancha.pdf
    │   ├── gallery.html
    │   ├── index.html
    │   ├── lorem_ipsum.html
    │   ├── redirect.html
    │   ├── scroll.html
    │   └── static
    │   │   ├── bootstrap.min.css
    │   │   ├── img
    │   │       ├── ales-krivec-ZMZHcvIVgbg-unsplash.jpg
    │   │       ├── elyssa-fahndrich-MF16lGb95WY-unsplash.jpg
    │   │       └── nathan-dumlao-RCfalHrnFAs-unsplash.jpg
    │   │   ├── jquery.js
    │   │   └── main.css
    ├── tests_asyncio
    │   ├── __init__.py
    │   ├── test_browser.py
    │   ├── test_browser_contexts.py
    │   ├── test_extensions.py
    │   ├── test_headers.py
    │   ├── test_page_methods.py
    │   ├── test_playwright_requests.py
    │   ├── test_settings.py
    │   └── test_utils.py
    └── tests_twisted
    │   ├── __init__.py
    │   └── test_mixed_requests.py
└── tox.ini


/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 0.0.43
3 | commit = True
4 | tag = True
5 | 
6 | [bumpversion:file:scrapy_playwright/__init__.py]
7 | 


--------------------------------------------------------------------------------
/.github/workflows/checks.yml:
--------------------------------------------------------------------------------
 1 | name: Checks
 2 | on: [push, pull_request, workflow_dispatch]
 3 | 
 4 | jobs:
 5 |   checks:
 6 |     if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
 7 |     runs-on: ubuntu-latest
 8 |     timeout-minutes: 5
 9 |     strategy:
10 |       fail-fast: false
11 |       matrix:
12 |         include:
13 |         - env:
14 |             TOXENV: bandit
15 |         - env:
16 |             TOXENV: black
17 |         - env:
18 |             TOXENV: flake8
19 |         - env:
20 |             TOXENV: typing
21 |         - env:
22 |             TOXENV: pylint
23 | 
24 |     steps:
25 |     - uses: actions/checkout@v4
26 | 
27 |     - name: Set up Python
28 |       uses: actions/setup-python@v5
29 |       with:
30 |         python-version: 3.11
31 | 
32 |     - name: Run check
33 |       env: ${{ matrix.env }}
34 |       run: |
35 |         pip install -U pip
36 |         pip install -U tox
37 |         tox
38 | 


--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish
 2 | on:
 3 |   release:
 4 |     types: [published]
 5 | 
 6 | jobs:
 7 |   publish:
 8 |     runs-on: ubuntu-latest
 9 |     timeout-minutes: 5
10 | 
11 |     steps:
12 |     - uses: actions/checkout@v4
13 | 
14 |     - name: Set up Python
15 |       uses: actions/setup-python@v5
16 |       with:
17 |         python-version: 3.11
18 | 
19 |     - name: Publish to PyPI
20 |       run: |
21 |         pip install --upgrade pip
22 |         pip install --upgrade setuptools wheel twine
23 |         python setup.py sdist bdist_wheel
24 |         export TWINE_USERNAME=__token__
25 |         export TWINE_PASSWORD=${{ secrets.PYPI_TOKEN }}
26 |         twine upload dist/*
27 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | on: [push, pull_request, workflow_dispatch]
 3 | 
 4 | jobs:
 5 |   tests:
 6 |     if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
 7 |     runs-on: ${{ matrix.os }}
 8 |     timeout-minutes: 20
 9 |     strategy:
10 |       fail-fast: false
11 |       matrix:
12 |         os: [ubuntu-22.04]
13 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
14 |         include:
15 |           - os: macos-14
16 |             python-version: "3.12"
17 |           - os: windows-2022
18 |             python-version: "3.12"
19 | 
20 |     steps:
21 |     - uses: actions/checkout@v4
22 | 
23 |     - name: Set up Python ${{ matrix.python-version }}
24 |       uses: actions/setup-python@v5
25 |       with:
26 |         python-version: ${{ matrix.python-version }}
27 | 
28 |     - name: Set up node
29 |       uses: actions/setup-node@v4
30 |       with:
31 |         node-version: 18
32 | 
33 |     - name: Install tox
34 |       run: pip install tox
35 | 
36 |     - name: Run asyncio tests
37 |       run: tox -e py
38 | 
39 |     - name: Run twisted tests
40 |       run: tox -e py-twisted
41 | 
42 |     - name: Upload coverage report (Linux)
43 |       if: runner.os == 'Linux'
44 |       env:
45 |         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
46 |       run: |
47 |         curl -Os https://uploader.codecov.io/latest/linux/codecov
48 |         chmod +x codecov
49 |         ./codecov
50 | 
51 |     - name: Upload coverage report (macOS)
52 |       if: runner.os == 'macOS'
53 |       env:
54 |         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
55 |       run: |
56 |         curl -Os https://uploader.codecov.io/latest/macos/codecov
57 |         chmod +x codecov
58 |         ./codecov
59 | 
60 |     - name: Upload coverage report (Windows)
61 |       if: runner.os == 'Windows'
62 |       env:
63 |         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
64 |       run: |
65 |         $ProgressPreference = 'SilentlyContinue'
66 |         Invoke-WebRequest -Uri https://uploader.codecov.io/latest/windows/codecov.exe -Outfile codecov.exe
67 |         .\codecov.exe
68 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | .~lock*
 3 | .DS_Store
 4 | .mypy_cache/
 5 | *.egg-info/
 6 | .tox/
 7 | build/
 8 | dist/
 9 | examples/*.png
10 | pip-wheel-metadata/
11 | 
12 | # coverage
13 | .coverage
14 | .coverage.*
15 | htmlcov/
16 | coverage.xml
17 | coverage-*.xml
18 | coverage-asyncio/
19 | coverage-twisted/
20 | 
21 | # nodejs stuff
22 | node_modules/
23 | package-lock.json
24 | package.json
25 | 
26 | .idea
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2020 Eugenio Lacuesta
 2 | 
 3 | Redistribution and use in source and binary forms, with or without modification,
 4 | are permitted provided that the following conditions are met:
 5 | 
 6 | 1. Redistributions of source code must retain the above copyright notice,
 7 | this list of conditions and the following disclaimer.
 8 | 
 9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation and/or
11 | other materials provided with the distribution.
12 | 
13 | 3. Neither the name of the copyright holder nor the names of its contributors
14 | may be used to endorse or promote products derived from this software without
15 | specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
21 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
24 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 


--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
  1 | # scrapy-playwright changelog
  2 | 
  3 | 
  4 | ### [v0.0.43](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.43) (2025-02-22)
  5 | 
  6 | * Only register request and response loggers when needed (#336)
  7 | 
  8 | 
  9 | ### [v0.0.42](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.42) (2024-11-06)
 10 | 
 11 | * Allow custom PageMethod callbacks (#318)
 12 | * Fix download errors caused by Content-Encoding header (#322)
 13 | 
 14 | 
 15 | ### [v0.0.41](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.41) (2024-08-13)
 16 | 
 17 | * Keyword arguments for PLAYWRIGHT_PROCESS_REQUEST_HEADERS, pass additional Request data (#303).
 18 |   Deprecated positional argument handling for the function passed to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS
 19 |   setting, arguments should now be handled by keyword.
 20 | * Retry to create page on browser crash (#305)
 21 | * Fix typo in log message (#312)
 22 | 
 23 | 
 24 | ### [v0.0.40](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.40) (2024-07-16)
 25 | 
 26 | * Enforce asyncio reactor in all platforms (#298)
 27 | * Allow multiple handlers in separate thread (#299)
 28 | 
 29 | 
 30 | ### [v0.0.39](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.39) (2024-07-11)
 31 | 
 32 | * Return proper status and headers for downloads (#293)
 33 | * Restart on browser crash (#295)
 34 | * Override method and/or body only for the first matching request (#297)
 35 | 
 36 | 
 37 | ### [v0.0.38](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.38) (2024-07-06)
 38 | 
 39 | * Fix freezing on responses with status 204 (#292)
 40 | * Connect to remote browser using BrowserType.connect (#283)
 41 | 
 42 | 
 43 | ### [v0.0.37](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.37) (2024-07-03)
 44 | 
 45 | * Improve Windows concurrency (#286)
 46 | 
 47 | 
 48 | ### [v0.0.36](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.36) (2024-06-24)
 49 | 
 50 | * Windows support (#276)
 51 | 
 52 | 
 53 | ### [v0.0.35](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.35) (2024-06-01)
 54 | 
 55 | * Update exception message check
 56 | 
 57 | 
 58 | ### [v0.0.34](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.34) (2024-01-01)
 59 | 
 60 | * Update dev status classifier to 4 - beta
 61 | * Official Python 3.12 support (#254)
 62 | * Custom memusage extension (#257)
 63 | 
 64 | 
 65 | ### [v0.0.33](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.33) (2023-10-19)
 66 | 
 67 | * Handle downloads as binary responses (#228)
 68 | 
 69 | 
 70 | ### [v0.0.32](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.32) (2023-09-04)
 71 | 
 72 | * Connect to browser using CDP (#227)
 73 | 
 74 | 
 75 | ### [v0.0.31](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.31) (2023-08-28)
 76 | 
 77 | * Do not fail when getting referer header for debug log messages (#225)
 78 | * Do not override headers with values from asset requests (#226)
 79 | 
 80 | 
 81 | ### [v0.0.30](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.30) (2023-08-17)
 82 | 
 83 | * Fix page_init_callback duplication (#222)
 84 | * Bump minimum Python version from 3.7 to 3.8 (#223)
 85 | 
 86 | 
 87 | ### [v0.0.29](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.29) (2023-08-11)
 88 | 
 89 | * Set exc_info=True for warning log records (#219)
 90 | * Invoke page_init_callback after setting route (#205)
 91 | 
 92 | 
 93 | ### [v0.0.28](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.28) (2023-08-05)
 94 | 
 95 | * Retry page.content if necessary (#218)
 96 | 
 97 | 
 98 | ### [v0.0.27](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.27) (2023-07-24)
 99 | 
100 | * Override method only for navigation requests (#177)
101 | * Pass spider argument to _create_browser_context (#212)
102 | * await AsyncPlaywright.stop on close (#214)
103 | 
104 | 
105 | ### [v0.0.26](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.26) (2023-02-01)
106 | 
107 | * Fix logging (pass extra args instead of updating log record factory)
108 | * Miscellaneous adjustments (naming, typing, etc)
109 | 
110 | 
111 | ### [v0.0.25](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.25) (2023-01-24)
112 | 
113 | * Set spider attribute on log records
114 | 
115 | 
116 | ### [v0.0.24](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.24) (2022-12-04)
117 | 
118 | * Fix request method override
119 | 
120 | 
121 | ### [v0.0.23](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.23) (2022-11-27)
122 | 
123 | * Set redirect request metadata
124 | 
125 | 
126 | ### [v0.0.22](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.22) (2022-10-09)
127 | 
128 | * Remove deprecated code (`PageCoroutine` class, `playwright_page_coroutines` request meta key,
129 |   `use_playwright_headers` function).
130 | * `playwright_page_init_callback` meta key (page initialization callback)
131 | 
132 | 
133 | ### [v0.0.21](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.21) (2022-08-08)
134 | 
135 | * Fixed TypeError exception when getting server IP address
136 | 
137 | 
138 | ### [v0.0.20](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.20) (2022-08-03)
139 | 
140 | * Don't raise exceptions if `Page.goto` returns `None`
141 | 
142 | 
143 | ### [v0.0.19](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.19) (2022-07-17)
144 | 
145 | * Add support for `Page.goto` keyword arguments (`playwright_page_goto_kwargs` request meta key)
146 | 
147 | 
148 | ### [v0.0.18](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.18) (2022-06-18)
149 | 
150 | * Always override request headers
151 | 
152 | 
153 | ### [v0.0.17](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.17) (2022-05-22)
154 | 
155 | * Support for persistent contexts
156 | * Limit concurrent context count (`PLAYWRIGHT_MAX_CONTEXTS` setting)
157 | 
158 | 
159 | ### [v0.0.16](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.16) (2022-05-14)
160 | 
161 | * Use new headers API introduced in Playwright 1.15 (bump required Playwright version)
162 | * Deprecate `scrapy_playwright.headers.use_playwright_headers`, set `PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None` instead
163 | 
164 | 
165 | ### [v0.0.15](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.15) (2022-05-08)
166 | 
167 | * Remove deprecated `PLAYWRIGHT_CONTEXT_ARGS` setting
168 | * Warn on failed requests
169 | * `PLAYWRIGHT_ABORT_REQUEST` setting: accept coroutine functions
170 | * `PLAYWRIGHT_PROCESS_REQUEST_HEADERS` setting: accept sync functions to process headers
171 | * Set `playwright_page` request meta key early
172 | 
173 | 
174 | ### [v0.0.14](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.14) (2022-03-26)
175 | 
176 | * Renamed `scrapy_playwright.page.PageCoroutine` to `scrapy_playwright.page.PageMethod`
177 |   (`PageCoroutine` is now deprecated). Also deprecated the `playwright_page_coroutines`
178 |   Request meta key in favor of `playwright_page_methods`.
179 | 
180 | 
181 | ### [v0.0.13](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.13) (2022-03-24)
182 | 
183 | * PageCoroutine checks
184 | * Fix encoding detection
185 | * Ability to abort requests via setting
186 | 
187 | 
188 | ### [v0.0.12](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.12) (2022-03-15)
189 | 
190 | * Avoid exceptions during cleanup when the browser could not start
191 | * Warn when non PageCoroutine objects are passed to Request.meta.playwright_page_coroutines
192 | 
193 | 
194 | ### [v0.0.11](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.11) (2022-03-12)
195 | 
196 | * Set the maximum amount of pages per context
197 | * Response.ip_address attribute
198 | * Response security details
199 | 
200 | 
201 | ### [v0.0.10](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.10) (2022-03-02)
202 | 
203 | * Fix response encoding detection
204 | 
205 | 
206 | ### [v0.0.9](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.9) (2022-01-27)
207 | 
208 | * Ability to process request headers
209 | 
210 | 
211 | ### [v0.0.8](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.8) (2022-01-13)
212 | 
213 | * Fix PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT setting (allow zero value)
214 | 
215 | 
216 | ### [v0.0.7](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.7) (2021-10-20)
217 | 
218 | * Log all requests/responses (debug level)
219 | 
220 | 
221 | ### [v0.0.6](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.6) (2021-10-19)
222 | 
223 | * Page event handlers
224 | * Python 3.10 support
225 | * Doc fixes
226 | * Override User-Agent header
227 | 
228 | 
229 | ### [v0.0.5](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.5) (2021-08-20)
230 | 
231 | * Improve garbage collection by removing unnecessary reference
232 | 
233 | ### [v0.0.4](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.4) (2021-07-16)
234 | 
235 | * Add support for multiple browser contexts ([#13](https://github.com/scrapy-plugins/scrapy-playwright/pull/13))
236 | * Deprecate `PLAYWRIGHT_CONTEXT_ARGS` setting in favor of `PLAYWRIGHT_CONTEXTS`
237 | 
238 | 
239 | ### [v0.0.3](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.3) (2021-02-22)
240 | 
241 | * Snake case (requires playwright-python >= [v1.8.0a1](https://github.com/microsoft/playwright-python/releases/tag/v1.8.0a1))
242 | 
243 | 
244 | ### [v0.0.2](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.2) (2021-01-13)
245 | 
246 | * `PLAYWRIGHT_CONTEXT_ARGS` setting (ability to pass keyword arguments to the browser context)
247 | 
248 | ### [v0.0.1](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.1) (2020-12-18)
249 | 
250 | Initial public release.
251 | 


--------------------------------------------------------------------------------
/docs/faq.md:
--------------------------------------------------------------------------------
 1 | # Frequently Asked Questions
 2 | 
 3 | 
 4 | ## How to use scrapy-playwright with the [CrawlSpider](https://docs.scrapy.org/en/latest/topics/spiders.html#crawlspider)?
 5 | 
 6 | By specifying a `process_request` method that modifies requests in-place in your
 7 | [crawling rules](https://docs.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Rule).
 8 | For instance:
 9 | 
10 | ```python
11 | def set_playwright_true(request, response):
12 |     request.meta["playwright"] = True
13 |     return request
14 | 
15 | class MyCrawlSpider(CrawlSpider):
16 |     ...
17 |     rules = (
18 |         Rule(
19 |             link_extractor=LinkExtractor(...),
20 |             callback="parse_item",
21 |             follow=False,
22 |             process_request=set_playwright_true,
23 |         ),
24 |     )
25 | ```
26 | 
27 | 
28 | ## How to download all requests using scrapy-playwright?
29 | 
30 | If you want all requests to be processed by Playwright and don't want to repeat
31 | yourself, or you're using a generic spider that doesn't support request
32 | customization (e.g. `scrapy.spiders.SitemapSpider`), you can use a middleware
33 | to edit the `meta` attribute for all requests.
34 | 
35 | Depending on your project and the interactions with other components, you might
36 | decide to use a
37 | [spider middleware](https://docs.scrapy.org/en/latest/topics/spider-middleware.html)
38 | or a
39 | [downloader middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html).
40 | 
41 | Spider middleware example:
42 | 
43 | ```python
44 | class PlaywrightSpiderMiddleware:
45 |     def process_spider_output(self, response, result, spider):
46 |         for obj in result:
47 |             if isinstance(obj, scrapy.Request):
48 |                 obj.meta.setdefault("playwright", True)
49 |             yield obj
50 | ```
51 | 
52 | Downloader middleware example:
53 | 
54 | ```python
55 | class PlaywrightDownloaderMiddleware:
56 |     def process_request(self, request, spider):
57 |         request.meta.setdefault("playwright", True)
58 |         return None
59 | ```
60 | 
61 | 
62 | ## How to increase the allowed memory size for the browser?
63 | 
64 | If you're seeing messages such as `JavaScript heap out of memory`, there's a
65 | chance you're falling into the scope of
66 | https://github.com/microsoft/playwright/issues/6319. As a workaround, it's
67 | possible to increase the amount of memory allowed for the Node.js process by
68 | specifying a value for the the `--max-old-space-size` V8 option in the
69 | `NODE_OPTIONS` environment variable, e.g.:
70 | 
71 | ```
72 | $ export NODE_OPTIONS=--max-old-space-size=SIZE  # in megabytes
73 | ```
74 | 
75 | Sources & further reading:
76 | * https://github.com/scrapy-plugins/scrapy-playwright/issues/19#issuecomment-886211045
77 | * https://github.com/npm/npm/issues/12238#issuecomment-367147962
78 | * https://medium.com/the-node-js-collection/node-options-has-landed-in-8-x-5fba57af703d
79 | * https://nodejs.org/dist/latest-v8.x/docs/api/cli.html#cli_node_options_options
80 | * https://nodejs.org/api/cli.html#cli_max_old_space_size_size_in_megabytes
81 | 


--------------------------------------------------------------------------------
/examples/.gitignore:
--------------------------------------------------------------------------------
1 | *.png
2 | *.pdf
3 | 


--------------------------------------------------------------------------------
/examples/books.py:
--------------------------------------------------------------------------------
 1 | import hashlib
 2 | import logging
 3 | from pathlib import Path
 4 | from typing import Generator, Optional
 5 | 
 6 | from playwright.async_api import Page
 7 | from scrapy import Spider
 8 | from scrapy.http.response import Response
 9 | 
10 | 
11 | class BooksSpider(Spider):
12 |     """Extract all books, save screenshots."""
13 | 
14 |     name = "books"
15 |     custom_settings = {
16 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
17 |         "DOWNLOAD_HANDLERS": {
18 |             # "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
19 |             "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
20 |         },
21 |         "CONCURRENT_REQUESTS": 32,
22 |         "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 4,
23 |         "CLOSESPIDER_ITEMCOUNT": 100,
24 |         "FEEDS": {
25 |             "books.json": {"format": "json", "encoding": "utf-8", "indent": 4},
26 |         },
27 |     }
28 |     start_urls = ["http://books.toscrape.com"]
29 | 
30 |     def __init__(self, name=None, **kwargs):
31 |         super().__init__(name, **kwargs)
32 |         logging.getLogger("scrapy.core.engine").setLevel(logging.WARNING)
33 |         logging.getLogger("scrapy.core.scraper").setLevel(logging.WARNING)
34 | 
35 |     def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
36 |         page_count = response.css(".pager .current::text").re_first(r"Page \d+ of (\d+)")
37 |         page_count = int(page_count)
38 |         for page in range(2, page_count + 1):
39 |             yield response.follow(f"/catalogue/page-{page}.html", cb_kwargs={"current_page": page})
40 | 
41 |         current_page = current_page or 1
42 |         for book in response.css("article.product_pod a"):
43 |             yield response.follow(
44 |                 book,
45 |                 callback=self.parse_book,
46 |                 meta={
47 |                     "playwright": True,
48 |                     "playwright_include_page": True,
49 |                     "playwright_context": f"page-{current_page}",
50 |                 },
51 |             )
52 | 
53 |     async def parse_book(self, response: Response) -> dict:
54 |         url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest()
55 |         page: Page = response.meta["playwright_page"]
56 |         await page.screenshot(
57 |             path=Path(__file__).parent / "books" / f"{url_sha256}.png", full_page=True
58 |         )
59 |         await page.close()
60 |         return {
61 |             "url": response.url,
62 |             "title": response.css("h1::text").get(),
63 |             "price": response.css("p.price_color::text").get(),
64 |             "breadcrumbs": response.css(".breadcrumb a::text").getall(),
65 |             "image": f"books/{url_sha256}.png",
66 |         }
67 | 


--------------------------------------------------------------------------------
/examples/books/.gitignore:
--------------------------------------------------------------------------------
1 | *.png
2 | 


--------------------------------------------------------------------------------
/examples/contexts.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | 
  3 | from playwright.async_api import Page
  4 | from scrapy import Spider, Request
  5 | 
  6 | 
  7 | class MultipleContextsSpider(Spider):
  8 |     """Handle multiple browser contexts"""
  9 | 
 10 |     name = "contexts"
 11 |     custom_settings = {
 12 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
 13 |         "DOWNLOAD_HANDLERS": {
 14 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
 15 |             # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
 16 |         },
 17 |         "PLAYWRIGHT_MAX_CONTEXTS": 6,
 18 |         "PLAYWRIGHT_CONTEXTS": {
 19 |             "first": {
 20 |                 "storage_state": {
 21 |                     "cookies": [
 22 |                         {
 23 |                             "url": "https://example.org",
 24 |                             "name": "context",
 25 |                             "value": "first",
 26 |                         },
 27 |                     ],
 28 |                 },
 29 |             },
 30 |             "second": {
 31 |                 "storage_state": {
 32 |                     "cookies": [
 33 |                         {
 34 |                             "url": "https://example.org",
 35 |                             "name": "context",
 36 |                             "value": "second",
 37 |                         },
 38 |                     ],
 39 |                 },
 40 |             },
 41 |             "persistent": {
 42 |                 "user_data_dir": str(Path.home() / "playwright-persistent-context"),
 43 |                 "java_script_enabled": False,
 44 |             },
 45 |         },
 46 |     }
 47 | 
 48 |     def start_requests(self):
 49 |         # using existing contexts
 50 |         for ctx_name in self.custom_settings["PLAYWRIGHT_CONTEXTS"].keys():
 51 |             yield Request(
 52 |                 url="https://example.org",
 53 |                 meta={
 54 |                     "playwright": True,
 55 |                     "playwright_context": ctx_name,
 56 |                     "playwright_include_page": True,
 57 |                 },
 58 |                 dont_filter=True,
 59 |             )
 60 |         # create a new context
 61 |         yield Request(
 62 |             url="https://example.org",
 63 |             meta={
 64 |                 "playwright": True,
 65 |                 "playwright_context": "third",
 66 |                 "playwright_context_kwargs": {
 67 |                     "storage_state": {
 68 |                         "cookies": [
 69 |                             {
 70 |                                 "url": "https://example.org",
 71 |                                 "name": "context",
 72 |                                 "value": "third",
 73 |                             },
 74 |                         ],
 75 |                     },
 76 |                 },
 77 |                 "playwright_include_page": True,
 78 |             },
 79 |             dont_filter=True,
 80 |         )
 81 |         # default context
 82 |         yield Request(
 83 |             url="https://example.org",
 84 |             meta={"playwright": True, "playwright_include_page": True},
 85 |             dont_filter=True,
 86 |         )
 87 |         # each request on a different context
 88 |         for i in range(20):
 89 |             yield Request(
 90 |                 url=f"https://example.org?foo={i}",
 91 |                 meta={
 92 |                     "playwright": True,
 93 |                     "playwright_context": f"context-{i}",
 94 |                     "playwright_include_page": True,
 95 |                 },
 96 |                 dont_filter=True,
 97 |             )
 98 | 
 99 |     async def parse(self, response, **kwargs):
100 |         page: Page = response.meta["playwright_page"]
101 |         context_name = response.meta["playwright_context"]
102 |         storage_state = await page.context.storage_state()
103 |         await page.close()
104 |         await page.context.close()
105 |         return {
106 |             "url": response.url,
107 |             "context": context_name,
108 |             "cookies": storage_state["cookies"],
109 |         }
110 | 


--------------------------------------------------------------------------------
/examples/download.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | from scrapy import Spider, Request
 4 | 
 5 | 
 6 | class DownloadSpider(Spider):
 7 |     name = "download"
 8 |     custom_settings = {
 9 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
10 |         "DOWNLOAD_HANDLERS": {
11 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
12 |             # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
13 |         },
14 |     }
15 | 
16 |     def start_requests(self):
17 |         yield Request(url="https://example.org", meta={"playwright": True})
18 |         yield Request(
19 |             url="https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
20 |             meta={"playwright": True},
21 |         )
22 | 
23 |     def parse(self, response, **kwargs):
24 |         if filename := response.meta.get("playwright_suggested_filename"):
25 |             (Path(__file__).parent / filename).write_bytes(response.body)
26 |         yield {
27 |             "url": response.url,
28 |             "response_cls": response.__class__.__name__,
29 |             "first_bytes": response.body[:60],
30 |             "filename": filename,
31 |         }
32 | 


--------------------------------------------------------------------------------
/examples/events.py:
--------------------------------------------------------------------------------
 1 | from playwright.async_api import Dialog, Response as PlaywrightResponse
 2 | from scrapy import Spider, Request
 3 | from scrapy_playwright.page import PageMethod
 4 | 
 5 | 
 6 | class EventsSpider(Spider):
 7 |     """Handle page events."""
 8 | 
 9 |     name = "events"
10 |     custom_settings = {
11 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
12 |         "DOWNLOAD_HANDLERS": {
13 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
14 |             # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
15 |         },
16 |     }
17 | 
18 |     def start_requests(self):
19 |         yield Request(
20 |             url="https://example.org",
21 |             meta={
22 |                 "playwright": True,
23 |                 "playwright_page_methods": [
24 |                     PageMethod("evaluate", "alert('foobar');"),
25 |                 ],
26 |                 "playwright_page_event_handlers": {
27 |                     "dialog": self.handle_dialog,
28 |                     "response": "handle_response",
29 |                 },
30 |             },
31 |         )
32 | 
33 |     async def handle_dialog(self, dialog: Dialog) -> None:
34 |         self.logger.info(f"Handled dialog with message: {dialog.message}")
35 |         await dialog.dismiss()
36 | 
37 |     async def handle_response(self, response: PlaywrightResponse) -> None:
38 |         self.logger.info(f"Received response with URL {response.url}")
39 | 
40 |     def parse(self, response, **kwargs):
41 |         return {"url": response.url}
42 | 


--------------------------------------------------------------------------------
/examples/exception_errback.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from scrapy import Spider, Request
 4 | 
 5 | 
 6 | class HandleExceptionInErrbackSpider(Spider):
 7 |     """Handle exceptions in the Playwright downloader, such as TimeoutError"""
 8 | 
 9 |     name = "awesome"
10 |     custom_settings = {
11 |         "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000,  # milliseconds
12 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
13 |         "DOWNLOAD_HANDLERS": {
14 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
15 |             # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
16 |         },
17 |         "RETRY_TIMES": 0,
18 |     }
19 | 
20 |     def start_requests(self):
21 |         yield Request(
22 |             url="https://httpbin.org/delay/10",
23 |             meta={"playwright": True},
24 |             errback=self.errback,
25 |         )
26 | 
27 |     def errback(self, failure):
28 |         logging.info(
29 |             "Handling failure in errback, request=%r, exception=%r", failure.request, failure.value
30 |         )
31 | 


--------------------------------------------------------------------------------
/examples/exception_middleware.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from pathlib import Path
 3 | 
 4 | from scrapy import Spider, Request
 5 | from scrapy_playwright.page import PageMethod
 6 | 
 7 | 
 8 | class HandleTimeoutMiddleware:
 9 |     def process_exception(self, request, exception, spider):
10 |         new_url = "https://httpbin.org/get"
11 |         logging.info(
12 |             "Caught exception: %s for request %s, recovering to %s",
13 |             exception.__class__,
14 |             request,
15 |             new_url,
16 |         )
17 |         return Request(
18 |             url=new_url,
19 |             meta={
20 |                 "playwright": True,
21 |                 "playwright_page_methods": [
22 |                     PageMethod(
23 |                         "screenshot", path=Path(__file__).parent / "recovered.png", full_page=True
24 |                     ),
25 |                 ],
26 |             },
27 |         )
28 | 
29 | 
30 | class HandleExceptionInMiddlewareSpider(Spider):
31 |     """Handle exceptions in the Playwright downloader, such as TimeoutError"""
32 | 
33 |     name = "awesome"
34 |     custom_settings = {
35 |         "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000,  # milliseconds
36 |         "DOWNLOADER_MIDDLEWARES": {
37 |             HandleTimeoutMiddleware: 100,
38 |         },
39 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
40 |         "DOWNLOAD_HANDLERS": {
41 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
42 |             # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
43 |         },
44 |         "RETRY_TIMES": 0,
45 |     }
46 | 
47 |     def start_requests(self):
48 |         yield Request(
49 |             url="https://httpbin.org/delay/10",
50 |             meta={"playwright": True},
51 |         )
52 | 
53 |     def parse(self, response, **kwargs):
54 |         logging.info("Received response for %s", response.url)
55 |         yield {"url": response.url}
56 | 


--------------------------------------------------------------------------------
/examples/headers.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from pathlib import Path
 3 | 
 4 | from scrapy import Spider, Request
 5 | from scrapy_playwright.page import PageMethod
 6 | 
 7 | 
 8 | class HeadersSpider(Spider):
 9 |     """Control how requests headers are handled via the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting.
10 | 
11 |     If PLAYWRIGHT_PROCESS_REQUEST_HEADERS=None, neither USER_AGENT nor cookies will be sent to the
12 |     website, comment out PLAYWRIGHT_PROCESS_REQUEST_HEADERS to sent them.
13 |     """
14 | 
15 |     name = "headers"
16 |     custom_settings = {
17 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
18 |         "DOWNLOAD_HANDLERS": {
19 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
20 |             # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
21 |         },
22 |         "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
23 |         "USER_AGENT": "Overridden user agent",
24 |     }
25 | 
26 |     def start_requests(self):
27 |         yield Request(
28 |             url="https://httpbin.org/headers",
29 |             meta={
30 |                 "playwright": True,
31 |                 "playwright_page_methods": [
32 |                     PageMethod(
33 |                         "screenshot", path=Path(__file__).parent / "headers.png", full_page=True
34 |                     ),
35 |                 ],
36 |             },
37 |             cookies={"foo": "bar"},
38 |         )
39 | 
40 |     def parse(self, response, **kwargs):
41 |         headers = json.loads(response.css("pre::text").get())["headers"]
42 |         yield {"url": response.url, "headers": headers}
43 | 


--------------------------------------------------------------------------------
/examples/init_page.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | import scrapy
 4 | 
 5 | 
 6 | async def init_page(page, request):
 7 |     await page.set_extra_http_headers({"Asdf": "Qwerty"})
 8 | 
 9 | 
10 | class InitPageSpider(scrapy.Spider):
11 |     """A spider that initializes pages upon creation."""
12 | 
13 |     name = "init_page"
14 |     custom_settings = {
15 |         "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,  # needed to keep playwright headers
16 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
17 |         "DOWNLOAD_HANDLERS": {
18 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
19 |         },
20 |     }
21 | 
22 |     def start_requests(self):
23 |         yield scrapy.Request(
24 |             url="https://httpbin.org/headers",
25 |             meta={
26 |                 "playwright": True,
27 |                 "playwright_page_init_callback": init_page,
28 |             },
29 |         )
30 | 
31 |     def parse(self, response, **kwargs):
32 |         json_str = response.css("pre::text").get()
33 |         print(json_str)
34 |         return {"data": json.loads(json_str)}
35 | 


--------------------------------------------------------------------------------
/examples/max_pages.py:
--------------------------------------------------------------------------------
 1 | from playwright.async_api import Page
 2 | from scrapy import Spider, Request
 3 | 
 4 | 
 5 | class MaxPagesPerContextContextsSpider(Spider):
 6 |     """Limit pages by context"""
 7 | 
 8 |     name = "contexts"
 9 |     custom_settings = {
10 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
11 |         "DOWNLOAD_HANDLERS": {
12 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
13 |             # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
14 |         },
15 |         "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 2,
16 |         "PLAYWRIGHT_CONTEXTS": {
17 |             "a": {"java_script_enabled": True},
18 |             "b": {"java_script_enabled": True},
19 |         },
20 |     }
21 | 
22 |     def start_requests(self):
23 |         for _ in range(20):
24 |             yield Request(
25 |                 url="https://httpbin.org/status?n=404",
26 |                 meta={
27 |                     "playwright": True,
28 |                     "playwright_context": "a",
29 |                     "playwright_include_page": True,
30 |                 },
31 |                 dont_filter=True,
32 |                 errback=self.errback,
33 |             )
34 |         for i in range(20):
35 |             yield Request(
36 |                 url=f"https://httpbin.org/get?a={i}",
37 |                 meta={"playwright": True, "playwright_context": "a"},
38 |             )
39 |         for i in range(20):
40 |             yield Request(
41 |                 url=f"https://httpbin.org/get?b={i}",
42 |                 meta={"playwright": True, "playwright_context": "b"},
43 |             )
44 | 
45 |     def parse(self, response, **kwargs):
46 |         return {"url": response.url}
47 | 
48 |     async def errback(self, failure):
49 |         page: Page = failure.request.meta["playwright_page"]
50 |         await page.close()
51 | 


--------------------------------------------------------------------------------
/examples/post.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | from scrapy import Spider, FormRequest
 4 | from scrapy_playwright.page import PageMethod
 5 | 
 6 | 
 7 | class PostSpider(Spider):
 8 |     """Send data using the POST verb."""
 9 | 
10 |     name = "post"
11 |     custom_settings = {
12 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
13 |         "DOWNLOAD_HANDLERS": {
14 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
15 |             # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
16 |         },
17 |     }
18 | 
19 |     def start_requests(self):
20 |         yield FormRequest(
21 |             url="https://httpbin.org/post",
22 |             formdata={"foo": "bar"},
23 |             meta={
24 |                 "playwright": True,
25 |                 "playwright_page_methods": [
26 |                     PageMethod(
27 |                         "screenshot", path=Path(__file__).parent / "post.png", full_page=True
28 |                     ),
29 |                 ],
30 |             },
31 |         )
32 | 
33 |     def parse(self, response, **kwargs):
34 |         yield {"url": response.url}
35 | 


--------------------------------------------------------------------------------
/examples/scroll.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | from scrapy import Spider, Request
 4 | from scrapy_playwright.page import PageMethod
 5 | 
 6 | 
 7 | class ScrollSpider(Spider):
 8 |     """Scroll down on an infinite-scroll page."""
 9 | 
10 |     name = "scroll"
11 |     custom_settings = {
12 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
13 |         "DOWNLOAD_HANDLERS": {
14 |             # "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
15 |             "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
16 |         },
17 |         "LOG_LEVEL": "INFO",
18 |     }
19 | 
20 |     def start_requests(self):
21 |         yield Request(
22 |             url="http://quotes.toscrape.com/scroll",
23 |             cookies={"foo": "bar", "asdf": "qwerty"},
24 |             meta={
25 |                 "playwright": True,
26 |                 "playwright_page_methods": [
27 |                     PageMethod("wait_for_selector", "div.quote"),
28 |                     PageMethod("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
29 |                     PageMethod("wait_for_selector", "div.quote:nth-child(11)"),  # 10 per page
30 |                     PageMethod(
31 |                         "screenshot", path=Path(__file__).parent / "scroll.png", full_page=True
32 |                     ),
33 |                 ],
34 |             },
35 |         )
36 | 
37 |     def parse(self, response, **kwargs):
38 |         return {"url": response.url, "count": len(response.css("div.quote"))}
39 | 


--------------------------------------------------------------------------------
/examples/storage.py:
--------------------------------------------------------------------------------
 1 | from playwright.async_api import Page
 2 | from scrapy import Spider, Request
 3 | from scrapy_playwright.page import PageMethod
 4 | 
 5 | 
 6 | class StorageSpider(Spider):
 7 |     """Set and get storage state, get the server's IP address."""
 8 | 
 9 |     name = "storage"
10 |     custom_settings = {
11 |         "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
12 |         "DOWNLOAD_HANDLERS": {
13 |             "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
14 |             # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
15 |         },
16 |     }
17 | 
18 |     def start_requests(self):
19 |         yield Request(
20 |             url="https://example.org",
21 |             meta={
22 |                 "playwright": True,
23 |                 "playwright_include_page": True,
24 |                 "playwright_page_methods": [
25 |                     PageMethod("evaluate_handle", "window.localStorage.setItem('foo', 'bar');"),
26 |                 ],
27 |             },
28 |         )
29 | 
30 |     async def parse(self, response, **kwargs):
31 |         page: Page = response.meta["playwright_page"]
32 |         storage_state = await page.context.storage_state()
33 |         await page.close()
34 |         return {
35 |             "url": response.url,
36 |             "storage_state": storage_state,
37 |             "ip_address": response.ip_address,
38 |         }
39 | 


--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
 1 | [MESSAGES CONTROL]
 2 | disable=
 3 |     attribute-defined-outside-init,
 4 |     broad-except,
 5 |     invalid-name,
 6 |     missing-class-docstring,
 7 |     missing-function-docstring,
 8 |     missing-module-docstring,
 9 |     too-few-public-methods,
10 |     too-many-arguments,
11 |     too-many-instance-attributes,
12 |     # tests
13 |     duplicate-code,
14 |     import-outside-toplevel,
15 |     protected-access,
16 |     too-many-public-methods,
17 |     unnecessary-dunder-call,
18 | 
19 | 
20 | [FORMAT]
21 | expected-line-ending-format=LF
22 | max-line-length=99
23 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 99
3 | 


--------------------------------------------------------------------------------
/scrapy_playwright/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.43"
2 | 


--------------------------------------------------------------------------------
/scrapy_playwright/_utils.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import logging
  3 | import platform
  4 | import threading
  5 | from typing import Awaitable, Dict, Iterator, Optional, Tuple, Union
  6 | 
  7 | import scrapy
  8 | from playwright.async_api import Error, Page, Request, Response
  9 | from scrapy.http.headers import Headers
 10 | from scrapy.settings import Settings
 11 | from scrapy.utils.python import to_unicode
 12 | from twisted.internet.defer import Deferred
 13 | from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
 14 | 
 15 | 
 16 | logger = logging.getLogger("scrapy-playwright")
 17 | 
 18 | 
 19 | async def _maybe_await(obj):
 20 |     if isinstance(obj, Awaitable):
 21 |         return await obj
 22 |     return obj
 23 | 
 24 | 
 25 | def _possible_encodings(headers: Headers, text: str) -> Iterator[str]:
 26 |     if headers.get("content-type"):
 27 |         content_type = to_unicode(headers["content-type"])
 28 |         yield http_content_type_encoding(content_type)
 29 |     yield html_body_declared_encoding(text)
 30 | 
 31 | 
 32 | def _encode_body(headers: Headers, text: str) -> Tuple[bytes, str]:
 33 |     for encoding in filter(None, _possible_encodings(headers, text)):
 34 |         try:
 35 |             body = text.encode(encoding)
 36 |         except UnicodeEncodeError:
 37 |             pass
 38 |         else:
 39 |             return body, encoding
 40 |     return text.encode("utf-8"), "utf-8"  # fallback
 41 | 
 42 | 
 43 | def _is_safe_close_error(error: Error) -> bool:
 44 |     """
 45 |     Taken almost verbatim from
 46 |     https://github.com/microsoft/playwright-python/blob/v1.20.0/playwright/_impl/_helper.py#L234-L238
 47 |     """
 48 |     message = str(error)
 49 |     return message.endswith("Browser has been closed") or message.endswith(
 50 |         "Target page, context or browser has been closed"
 51 |     )
 52 | 
 53 | 
 54 | _NAVIGATION_ERROR_MSG = (
 55 |     "Unable to retrieve content because the page is navigating and changing the content."
 56 | )
 57 | 
 58 | 
 59 | async def _get_page_content(
 60 |     page: Page,
 61 |     spider: scrapy.Spider,
 62 |     context_name: str,
 63 |     scrapy_request_url: str,
 64 |     scrapy_request_method: str,
 65 | ) -> str:
 66 |     """Wrapper around Page.content to retry if necessary.
 67 |     Arguments other than the page are only for logging.
 68 |     """
 69 |     try:
 70 |         return await page.content()
 71 |     except Error as err:
 72 |         if _NAVIGATION_ERROR_MSG in err.message:
 73 |             logger.debug(
 74 |                 "Retrying to get content from page '%s', error: '%s'",
 75 |                 page.url,
 76 |                 _NAVIGATION_ERROR_MSG,
 77 |                 extra={
 78 |                     "spider": spider,
 79 |                     "context_name": context_name,
 80 |                     "scrapy_request_url": scrapy_request_url,
 81 |                     "scrapy_request_method": scrapy_request_method,
 82 |                     "playwright_page_url": page.url,
 83 |                 },
 84 |             )
 85 |             return await page.content()
 86 |         raise
 87 | 
 88 | 
 89 | def _get_float_setting(settings: Settings, key: str) -> Optional[float]:
 90 |     try:
 91 |         return float(settings[key])
 92 |     except Exception:
 93 |         return None
 94 | 
 95 | 
 96 | async def _get_header_value(
 97 |     resource: Union[Request, Response],
 98 |     header_name: str,
 99 | ) -> Optional[str]:
100 |     try:
101 |         return await resource.header_value(header_name)
102 |     except Exception:
103 |         return None
104 | 
105 | 
106 | class _ThreadedLoopAdapter:
107 |     """Utility class to start an asyncio event loop in a new thread and redirect coroutines.
108 |     This allows to run Playwright in a different loop than the Scrapy crawler, allowing to
109 |     use ProactorEventLoop which is supported by Playwright on Windows.
110 |     """
111 | 
112 |     _loop: asyncio.AbstractEventLoop
113 |     _thread: threading.Thread
114 |     _coro_queue: asyncio.Queue = asyncio.Queue()
115 |     _stop_events: Dict[int, asyncio.Event] = {}
116 | 
117 |     @classmethod
118 |     async def _handle_coro(cls, coro, future) -> None:
119 |         try:
120 |             future.set_result(await coro)
121 |         except Exception as exc:
122 |             future.set_exception(exc)
123 | 
124 |     @classmethod
125 |     async def _process_queue(cls) -> None:
126 |         while any(not ev.is_set() for ev in cls._stop_events.values()):
127 |             coro, future = await cls._coro_queue.get()
128 |             asyncio.create_task(cls._handle_coro(coro, future))
129 |             cls._coro_queue.task_done()
130 | 
131 |     @classmethod
132 |     def _deferred_from_coro(cls, coro) -> Deferred:
133 |         future: asyncio.Future = asyncio.Future()
134 |         asyncio.run_coroutine_threadsafe(cls._coro_queue.put((coro, future)), cls._loop)
135 |         return scrapy.utils.defer.deferred_from_coro(future)
136 | 
137 |     @classmethod
138 |     def start(cls, caller_id: int) -> None:
139 |         cls._stop_events[caller_id] = asyncio.Event()
140 |         if not getattr(cls, "_loop", None):
141 |             policy = asyncio.DefaultEventLoopPolicy()
142 |             if platform.system() == "Windows":
143 |                 policy = asyncio.WindowsProactorEventLoopPolicy()  # type: ignore[attr-defined]
144 |             cls._loop = policy.new_event_loop()
145 |             asyncio.set_event_loop(cls._loop)
146 | 
147 |         if not getattr(cls, "_thread", None):
148 |             cls._thread = threading.Thread(target=cls._loop.run_forever, daemon=True)
149 |             cls._thread.start()
150 |             logger.info("Started loop on separate thread: %s", cls._loop)
151 |             asyncio.run_coroutine_threadsafe(cls._process_queue(), cls._loop)
152 | 
153 |     @classmethod
154 |     def stop(cls, caller_id: int) -> None:
155 |         """Wait until all handlers are closed to stop the event loop and join the thread."""
156 |         cls._stop_events[caller_id].set()
157 |         if all(ev.is_set() for ev in cls._stop_events.values()):
158 |             asyncio.run_coroutine_threadsafe(cls._coro_queue.join(), cls._loop)
159 |             cls._loop.call_soon_threadsafe(cls._loop.stop)
160 |             cls._thread.join()
161 | 


--------------------------------------------------------------------------------
/scrapy_playwright/handler.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | import logging
  4 | import platform
  5 | import warnings
  6 | from contextlib import suppress
  7 | from dataclasses import dataclass, field as dataclass_field
  8 | from functools import partial
  9 | from ipaddress import ip_address
 10 | from time import time
 11 | from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union
 12 | 
 13 | from playwright._impl._errors import TargetClosedError
 14 | from playwright.async_api import (
 15 |     BrowserContext,
 16 |     BrowserType,
 17 |     Download as PlaywrightDownload,
 18 |     Error as PlaywrightError,
 19 |     Page,
 20 |     Playwright as AsyncPlaywright,
 21 |     PlaywrightContextManager,
 22 |     Request as PlaywrightRequest,
 23 |     Response as PlaywrightResponse,
 24 |     Route,
 25 | )
 26 | from scrapy import Spider, signals
 27 | from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
 28 | from scrapy.crawler import Crawler
 29 | from scrapy.exceptions import NotSupported, ScrapyDeprecationWarning
 30 | from scrapy.http import Request, Response
 31 | from scrapy.http.headers import Headers
 32 | from scrapy.responsetypes import responsetypes
 33 | from scrapy.settings import Settings
 34 | from scrapy.utils.defer import deferred_from_coro
 35 | from scrapy.utils.misc import load_object
 36 | from scrapy.utils.reactor import verify_installed_reactor
 37 | from twisted.internet.defer import Deferred, inlineCallbacks
 38 | 
 39 | from scrapy_playwright.headers import use_scrapy_headers
 40 | from scrapy_playwright.page import PageMethod
 41 | from scrapy_playwright._utils import (
 42 |     _ThreadedLoopAdapter,
 43 |     _encode_body,
 44 |     _get_float_setting,
 45 |     _get_header_value,
 46 |     _get_page_content,
 47 |     _is_safe_close_error,
 48 |     _maybe_await,
 49 | )
 50 | 
 51 | 
 52 | __all__ = ["ScrapyPlaywrightDownloadHandler"]
 53 | 
 54 | 
 55 | PlaywrightHandler = TypeVar("PlaywrightHandler", bound="ScrapyPlaywrightDownloadHandler")
 56 | 
 57 | 
 58 | logger = logging.getLogger("scrapy-playwright")
 59 | 
 60 | 
 61 | DEFAULT_BROWSER_TYPE = "chromium"
 62 | DEFAULT_CONTEXT_NAME = "default"
 63 | PERSISTENT_CONTEXT_PATH_KEY = "user_data_dir"
 64 | 
 65 | 
 66 | @dataclass
 67 | class BrowserContextWrapper:
 68 |     context: BrowserContext
 69 |     semaphore: asyncio.Semaphore
 70 |     persistent: bool
 71 | 
 72 | 
 73 | @dataclass
 74 | class Download:
 75 |     body: bytes = b""
 76 |     url: str = ""
 77 |     suggested_filename: str = ""
 78 |     exception: Optional[Exception] = None
 79 |     response_status: int = 200
 80 |     headers: dict = dataclass_field(default_factory=dict)
 81 | 
 82 |     def __bool__(self) -> bool:
 83 |         return bool(self.body) or bool(self.exception)
 84 | 
 85 | 
 86 | @dataclass
 87 | class Config:
 88 |     cdp_url: Optional[str]
 89 |     cdp_kwargs: dict
 90 |     connect_url: Optional[str]
 91 |     connect_kwargs: dict
 92 |     browser_type_name: str
 93 |     launch_options: dict
 94 |     max_pages_per_context: int
 95 |     max_contexts: Optional[int]
 96 |     startup_context_kwargs: dict
 97 |     navigation_timeout: Optional[float]
 98 |     restart_disconnected_browser: bool
 99 |     target_closed_max_retries: int = 3
100 |     use_threaded_loop: bool = False
101 | 
102 |     @classmethod
103 |     def from_settings(cls, settings: Settings) -> "Config":
104 |         if settings.get("PLAYWRIGHT_CDP_URL") and settings.get("PLAYWRIGHT_CONNECT_URL"):
105 |             msg = "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported"
106 |             logger.error(msg)
107 |             raise NotSupported(msg)
108 |         cfg = cls(
109 |             cdp_url=settings.get("PLAYWRIGHT_CDP_URL"),
110 |             cdp_kwargs=settings.getdict("PLAYWRIGHT_CDP_KWARGS") or {},
111 |             connect_url=settings.get("PLAYWRIGHT_CONNECT_URL"),
112 |             connect_kwargs=settings.getdict("PLAYWRIGHT_CONNECT_KWARGS") or {},
113 |             browser_type_name=settings.get("PLAYWRIGHT_BROWSER_TYPE") or DEFAULT_BROWSER_TYPE,
114 |             launch_options=settings.getdict("PLAYWRIGHT_LAUNCH_OPTIONS") or {},
115 |             max_pages_per_context=settings.getint("PLAYWRIGHT_MAX_PAGES_PER_CONTEXT"),
116 |             max_contexts=settings.getint("PLAYWRIGHT_MAX_CONTEXTS") or None,
117 |             startup_context_kwargs=settings.getdict("PLAYWRIGHT_CONTEXTS"),
118 |             navigation_timeout=_get_float_setting(
119 |                 settings, "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"
120 |             ),
121 |             restart_disconnected_browser=settings.getbool(
122 |                 "PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER", default=True
123 |             ),
124 |             use_threaded_loop=platform.system() == "Windows"
125 |             or settings.getbool("_PLAYWRIGHT_THREADED_LOOP", False),
126 |         )
127 |         cfg.cdp_kwargs.pop("endpoint_url", None)
128 |         cfg.connect_kwargs.pop("ws_endpoint", None)
129 |         if not cfg.max_pages_per_context:
130 |             cfg.max_pages_per_context = settings.getint("CONCURRENT_REQUESTS")
131 |         if (cfg.cdp_url or cfg.connect_url) and cfg.launch_options:
132 |             logger.warning("Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS")
133 |         return cfg
134 | 
135 | 
136 | class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
137 |     playwright_context_manager: Optional[PlaywrightContextManager] = None
138 |     playwright: Optional[AsyncPlaywright] = None
139 | 
140 |     def __init__(self, crawler: Crawler) -> None:
141 |         super().__init__(settings=crawler.settings, crawler=crawler)
142 |         verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
143 |         crawler.signals.connect(self._engine_started, signals.engine_started)
144 |         self.stats = crawler.stats
145 |         self.config = Config.from_settings(crawler.settings)
146 | 
147 |         if self.config.use_threaded_loop:
148 |             _ThreadedLoopAdapter.start(id(self))
149 | 
150 |         self.browser_launch_lock = asyncio.Lock()
151 |         self.context_launch_lock = asyncio.Lock()
152 |         self.context_wrappers: Dict[str, BrowserContextWrapper] = {}
153 |         if self.config.max_contexts:
154 |             self.context_semaphore = asyncio.Semaphore(value=self.config.max_contexts)
155 | 
156 |         # headers
157 |         if "PLAYWRIGHT_PROCESS_REQUEST_HEADERS" in crawler.settings:
158 |             if crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] is None:
159 |                 self.process_request_headers = None
160 |             else:
161 |                 self.process_request_headers = load_object(
162 |                     crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"]
163 |                 )
164 |         else:
165 |             self.process_request_headers = use_scrapy_headers
166 | 
167 |         self.abort_request: Optional[Callable[[PlaywrightRequest], Union[Awaitable, bool]]] = None
168 |         if crawler.settings.get("PLAYWRIGHT_ABORT_REQUEST"):
169 |             self.abort_request = load_object(crawler.settings["PLAYWRIGHT_ABORT_REQUEST"])
170 | 
171 |     @classmethod
172 |     def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler:
173 |         return cls(crawler)
174 | 
175 |     def _deferred_from_coro(self, coro: Awaitable) -> Deferred:
176 |         if self.config.use_threaded_loop:
177 |             return _ThreadedLoopAdapter._deferred_from_coro(coro)
178 |         return deferred_from_coro(coro)
179 | 
180 |     def _engine_started(self) -> Deferred:
181 |         """Launch the browser. Use the engine_started signal as it supports returning deferreds."""
182 |         return self._deferred_from_coro(self._launch())
183 | 
184 |     async def _launch(self) -> None:
185 |         """Launch Playwright manager and configured startup context(s)."""
186 |         logger.info("Starting download handler")
187 |         self.playwright_context_manager = PlaywrightContextManager()
188 |         self.playwright = await self.playwright_context_manager.start()
189 |         self.browser_type: BrowserType = getattr(self.playwright, self.config.browser_type_name)
190 |         if self.config.startup_context_kwargs:
191 |             logger.info("Launching %i startup context(s)", len(self.config.startup_context_kwargs))
192 |             await asyncio.gather(
193 |                 *[
194 |                     self._create_browser_context(name=name, context_kwargs=kwargs)
195 |                     for name, kwargs in self.config.startup_context_kwargs.items()
196 |                 ]
197 |             )
198 |             self._set_max_concurrent_context_count()
199 |             logger.info("Startup context(s) launched")
200 |             self.stats.set_value("playwright/page_count", self._get_total_page_count())
201 | 
202 |     async def _maybe_launch_browser(self) -> None:
203 |         async with self.browser_launch_lock:
204 |             if not hasattr(self, "browser"):
205 |                 logger.info("Launching browser %s", self.browser_type.name)
206 |                 self.browser = await self.browser_type.launch(**self.config.launch_options)
207 |                 logger.info("Browser %s launched", self.browser_type.name)
208 |                 self.stats.inc_value("playwright/browser_count")
209 |                 self.browser.on("disconnected", self._browser_disconnected_callback)
210 | 
211 |     async def _maybe_connect_remote_devtools(self) -> None:
212 |         async with self.browser_launch_lock:
213 |             if not hasattr(self, "browser"):
214 |                 logger.info("Connecting using CDP: %s", self.config.cdp_url)
215 |                 self.browser = await self.browser_type.connect_over_cdp(
216 |                     self.config.cdp_url, **self.config.cdp_kwargs
217 |                 )
218 |                 logger.info("Connected using CDP: %s", self.config.cdp_url)
219 |                 self.stats.inc_value("playwright/browser_count")
220 |                 self.browser.on("disconnected", self._browser_disconnected_callback)
221 | 
222 |     async def _maybe_connect_remote(self) -> None:
223 |         async with self.browser_launch_lock:
224 |             if not hasattr(self, "browser"):
225 |                 logger.info("Connecting to remote Playwright")
226 |                 self.browser = await self.browser_type.connect(
227 |                     self.config.connect_url, **self.config.connect_kwargs
228 |                 )
229 |                 logger.info("Connected to remote Playwright")
230 |                 self.stats.inc_value("playwright/browser_count")
231 |                 self.browser.on("disconnected", self._browser_disconnected_callback)
232 | 
233 |     async def _create_browser_context(
234 |         self,
235 |         name: str,
236 |         context_kwargs: Optional[dict],
237 |         spider: Optional[Spider] = None,
238 |     ) -> BrowserContextWrapper:
239 |         """Create a new context, also launching a local browser or connecting
240 |         to a remote one if necessary.
241 |         """
242 |         if hasattr(self, "context_semaphore"):
243 |             await self.context_semaphore.acquire()
244 |         context_kwargs = context_kwargs or {}
245 |         persistent = remote = False
246 |         if context_kwargs.get(PERSISTENT_CONTEXT_PATH_KEY):
247 |             context = await self.browser_type.launch_persistent_context(**context_kwargs)
248 |             persistent = True
249 |         elif self.config.cdp_url:
250 |             await self._maybe_connect_remote_devtools()
251 |             context = await self.browser.new_context(**context_kwargs)
252 |             remote = True
253 |         elif self.config.connect_url:
254 |             await self._maybe_connect_remote()
255 |             context = await self.browser.new_context(**context_kwargs)
256 |             remote = True
257 |         else:
258 |             await self._maybe_launch_browser()
259 |             context = await self.browser.new_context(**context_kwargs)
260 | 
261 |         context.on(
262 |             "close", self._make_close_browser_context_callback(name, persistent, remote, spider)
263 |         )
264 |         self.stats.inc_value("playwright/context_count")
265 |         self.stats.inc_value(f"playwright/context_count/persistent/{persistent}")
266 |         self.stats.inc_value(f"playwright/context_count/remote/{remote}")
267 |         logger.debug(
268 |             "Browser context started: '%s' (persistent=%s, remote=%s)",
269 |             name,
270 |             persistent,
271 |             remote,
272 |             extra={
273 |                 "spider": spider,
274 |                 "context_name": name,
275 |                 "persistent": persistent,
276 |                 "remote": remote,
277 |             },
278 |         )
279 |         if self.config.navigation_timeout is not None:
280 |             context.set_default_navigation_timeout(self.config.navigation_timeout)
281 |         self.context_wrappers[name] = BrowserContextWrapper(
282 |             context=context,
283 |             semaphore=asyncio.Semaphore(value=self.config.max_pages_per_context),
284 |             persistent=persistent,
285 |         )
286 |         self._set_max_concurrent_context_count()
287 |         return self.context_wrappers[name]
288 | 
289 |     async def _create_page(self, request: Request, spider: Spider) -> Page:
290 |         """Create a new page in a context, also creating a new context if necessary."""
291 |         context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)
292 |         # this block needs to be locked because several attempts to launch a context
293 |         # with the same name could happen at the same time from different requests
294 |         async with self.context_launch_lock:
295 |             ctx_wrapper = self.context_wrappers.get(context_name)
296 |             if ctx_wrapper is None:
297 |                 ctx_wrapper = await self._create_browser_context(
298 |                     name=context_name,
299 |                     context_kwargs=request.meta.get("playwright_context_kwargs"),
300 |                     spider=spider,
301 |                 )
302 | 
303 |         await ctx_wrapper.semaphore.acquire()
304 |         page = await ctx_wrapper.context.new_page()
305 |         self.stats.inc_value("playwright/page_count")
306 |         total_page_count = self._get_total_page_count()
307 |         logger.debug(
308 |             "[Context=%s] New page created, page count is %i (%i for all contexts)",
309 |             context_name,
310 |             len(ctx_wrapper.context.pages),
311 |             total_page_count,
312 |             extra={
313 |                 "spider": spider,
314 |                 "context_name": context_name,
315 |                 "context_page_count": len(ctx_wrapper.context.pages),
316 |                 "total_page_count": total_page_count,
317 |                 "scrapy_request_url": request.url,
318 |                 "scrapy_request_method": request.method,
319 |             },
320 |         )
321 |         self._set_max_concurrent_page_count()
322 |         if self.config.navigation_timeout is not None:
323 |             page.set_default_navigation_timeout(self.config.navigation_timeout)
324 | 
325 |         page.on("close", self._make_close_page_callback(context_name))
326 |         page.on("crash", self._make_close_page_callback(context_name))
327 |         page.on("request", self._increment_request_stats)
328 |         page.on("response", self._increment_response_stats)
329 |         if logger.getEffectiveLevel() <= logging.DEBUG:
330 |             page.on("request", _make_request_logger(context_name, spider))
331 |             page.on("response", _make_response_logger(context_name, spider))
332 | 
333 |         return page
334 | 
335 |     def _get_total_page_count(self):
336 |         return sum(len(ctx.context.pages) for ctx in self.context_wrappers.values())
337 | 
338 |     def _set_max_concurrent_page_count(self):
339 |         count = self._get_total_page_count()
340 |         current_max_count = self.stats.get_value("playwright/page_count/max_concurrent")
341 |         if current_max_count is None or count > current_max_count:
342 |             self.stats.set_value("playwright/page_count/max_concurrent", count)
343 | 
344 |     def _set_max_concurrent_context_count(self):
345 |         current_max_count = self.stats.get_value("playwright/context_count/max_concurrent")
346 |         if current_max_count is None or len(self.context_wrappers) > current_max_count:
347 |             self.stats.set_value(
348 |                 "playwright/context_count/max_concurrent", len(self.context_wrappers)
349 |             )
350 | 
351 |     @inlineCallbacks
352 |     def close(self) -> Deferred:
353 |         logger.info("Closing download handler")
354 |         yield super().close()
355 |         yield self._deferred_from_coro(self._close())
356 |         if self.config.use_threaded_loop:
357 |             _ThreadedLoopAdapter.stop(id(self))
358 | 
359 |     async def _close(self) -> None:
360 |         with suppress(TargetClosedError):
361 |             await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()])
362 |         self.context_wrappers.clear()
363 |         if hasattr(self, "browser"):
364 |             logger.info("Closing browser")
365 |             await self.browser.close()
366 |         if self.playwright_context_manager:
367 |             await self.playwright_context_manager.__aexit__()
368 |         if self.playwright:
369 |             await self.playwright.stop()
370 | 
371 |     def download_request(self, request: Request, spider: Spider) -> Deferred:
372 |         if request.meta.get("playwright"):
373 |             return self._deferred_from_coro(self._download_request(request, spider))
374 |         return super().download_request(request, spider)
375 | 
376 |     async def _download_request(self, request: Request, spider: Spider) -> Response:
377 |         counter = 0
378 |         while True:
379 |             try:
380 |                 return await self._download_request_with_retry(request=request, spider=spider)
381 |             except TargetClosedError as ex:
382 |                 counter += 1
383 |                 if counter > self.config.target_closed_max_retries:
384 |                     raise ex
385 |                 logger.debug(
386 |                     "Target closed, retrying to create page for %s",
387 |                     request,
388 |                     extra={
389 |                         "spider": spider,
390 |                         "scrapy_request_url": request.url,
391 |                         "scrapy_request_method": request.method,
392 |                         "exception": ex,
393 |                     },
394 |                 )
395 | 
396 |     async def _download_request_with_retry(self, request: Request, spider: Spider) -> Response:
397 |         page = request.meta.get("playwright_page")
398 |         if not isinstance(page, Page) or page.is_closed():
399 |             page = await self._create_page(request=request, spider=spider)
400 |         context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)
401 | 
402 |         _attach_page_event_handlers(
403 |             page=page, request=request, spider=spider, context_name=context_name
404 |         )
405 | 
406 |         # We need to identify the Playwright request that matches the Scrapy request
407 |         # in order to override method and body if necessary.
408 |         # Checking the URL and Request.is_navigation_request() is not enough, e.g.
409 |         # requests produced by submitting forms can produce false positives.
410 |         # Let's track only the first request that matches the above conditions.
411 |         initial_request_done = asyncio.Event()
412 | 
413 |         await page.unroute("**")
414 |         await page.route(
415 |             "**",
416 |             self._make_request_handler(
417 |                 context_name=context_name,
418 |                 method=request.method,
419 |                 url=request.url,
420 |                 headers=request.headers,
421 |                 body=request.body,
422 |                 encoding=request.encoding,
423 |                 spider=spider,
424 |                 initial_request_done=initial_request_done,
425 |             ),
426 |         )
427 | 
428 |         await _maybe_execute_page_init_callback(
429 |             page=page, request=request, context_name=context_name, spider=spider
430 |         )
431 | 
432 |         try:
433 |             return await self._download_request_with_page(request, page, spider)
434 |         except Exception as ex:
435 |             if not request.meta.get("playwright_include_page") and not page.is_closed():
436 |                 logger.warning(
437 |                     "Closing page due to failed request: %s exc_type=%s exc_msg=%s",
438 |                     request,
439 |                     type(ex),
440 |                     str(ex),
441 |                     extra={
442 |                         "spider": spider,
443 |                         "context_name": context_name,
444 |                         "scrapy_request_url": request.url,
445 |                         "scrapy_request_method": request.method,
446 |                         "exception": ex,
447 |                     },
448 |                     exc_info=True,
449 |                 )
450 |                 await page.close()
451 |                 self.stats.inc_value("playwright/page_count/closed")
452 |             raise
453 | 
454 |     async def _download_request_with_page(
455 |         self, request: Request, page: Page, spider: Spider
456 |     ) -> Response:
457 |         # set this early to make it available in errbacks even if something fails
458 |         if request.meta.get("playwright_include_page"):
459 |             request.meta["playwright_page"] = page
460 | 
461 |         start_time = time()
462 |         response, download = await self._get_response_and_download(request, page, spider)
463 |         if isinstance(response, PlaywrightResponse):
464 |             await _set_redirect_meta(request=request, response=response)
465 |             headers = Headers(await response.all_headers())
466 |             headers.pop("Content-Encoding", None)
467 |         elif not download:
468 |             logger.warning(
469 |                 "Navigating to %s returned None, the response"
470 |                 " will have empty headers and status 200",
471 |                 request,
472 |                 extra={
473 |                     "spider": spider,
474 |                     "context_name": request.meta.get("playwright_context"),
475 |                     "scrapy_request_url": request.url,
476 |                     "scrapy_request_method": request.method,
477 |                 },
478 |             )
479 |             headers = Headers()
480 | 
481 |         await self._apply_page_methods(page, request, spider)
482 |         body_str = await _get_page_content(
483 |             page=page,
484 |             spider=spider,
485 |             context_name=request.meta.get("playwright_context"),
486 |             scrapy_request_url=request.url,
487 |             scrapy_request_method=request.method,
488 |         )
489 |         request.meta["download_latency"] = time() - start_time
490 | 
491 |         server_ip_address = None
492 |         if response is not None:
493 |             request.meta["playwright_security_details"] = await response.security_details()
494 |             with suppress(KeyError, TypeError, ValueError):
495 |                 server_addr = await response.server_addr()
496 |                 server_ip_address = ip_address(server_addr["ipAddress"])
497 | 
498 |         if download and download.exception:
499 |             raise download.exception
500 | 
501 |         if not request.meta.get("playwright_include_page"):
502 |             await page.close()
503 |             self.stats.inc_value("playwright/page_count/closed")
504 | 
505 |         if download:
506 |             request.meta["playwright_suggested_filename"] = download.suggested_filename
507 |             respcls = responsetypes.from_args(url=download.url, body=download.body)
508 |             download_headers = Headers(download.headers)
509 |             download_headers.pop("Content-Encoding", None)
510 |             return respcls(
511 |                 url=download.url,
512 |                 status=download.response_status,
513 |                 headers=download_headers,
514 |                 body=download.body,
515 |                 request=request,
516 |                 flags=["playwright"],
517 |             )
518 | 
519 |         body, encoding = _encode_body(headers=headers, text=body_str)
520 |         respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
521 |         return respcls(
522 |             url=page.url,
523 |             status=response.status if response is not None else 200,
524 |             headers=headers,
525 |             body=body,
526 |             request=request,
527 |             flags=["playwright"],
528 |             encoding=encoding,
529 |             ip_address=server_ip_address,
530 |         )
531 | 
532 |     async def _get_response_and_download(
533 |         self, request: Request, page: Page, spider: Spider
534 |     ) -> Tuple[Optional[PlaywrightResponse], Optional[Download]]:
535 |         response: Optional[PlaywrightResponse] = None
536 |         download: Download = Download()  # updated in-place in _handle_download
537 |         download_started = asyncio.Event()
538 |         download_ready = asyncio.Event()
539 | 
540 |         async def _handle_download(dwnld: PlaywrightDownload) -> None:
541 |             download_started.set()
542 |             self.stats.inc_value("playwright/download_count")
543 |             try:
544 |                 if failure := await dwnld.failure():
545 |                     raise RuntimeError(f"Failed to download {dwnld.url}: {failure}")
546 |                 download.body = (await dwnld.path()).read_bytes()
547 |                 download.url = dwnld.url
548 |                 download.suggested_filename = dwnld.suggested_filename
549 |             except Exception as ex:
550 |                 download.exception = ex
551 |             finally:
552 |                 download_ready.set()
553 | 
554 |         async def _handle_response(response: PlaywrightResponse) -> None:
555 |             download.response_status = response.status
556 |             download.headers = await response.all_headers()
557 |             download_started.set()
558 | 
559 |         page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {}
560 |         page_goto_kwargs.pop("url", None)
561 |         page.on("download", _handle_download)
562 |         page.on("response", _handle_response)
563 |         try:
564 |             response = await page.goto(url=request.url, **page_goto_kwargs)
565 |         except PlaywrightError as err:
566 |             if not (
567 |                 self.config.browser_type_name in ("firefox", "webkit")
568 |                 and "Download is starting" in err.message
569 |                 or self.config.browser_type_name == "chromium"
570 |                 and "net::ERR_ABORTED" in err.message
571 |             ):
572 |                 raise
573 | 
574 |             logger.debug(
575 |                 "Navigating to %s failed",
576 |                 request.url,
577 |                 extra={
578 |                     "spider": spider,
579 |                     "context_name": request.meta.get("playwright_context"),
580 |                     "scrapy_request_url": request.url,
581 |                     "scrapy_request_method": request.method,
582 |                 },
583 |             )
584 |             await download_started.wait()
585 | 
586 |             if download.response_status == 204:
587 |                 raise err
588 | 
589 |             logger.debug(
590 |                 "Waiting on download to finish for %s",
591 |                 request.url,
592 |                 extra={
593 |                     "spider": spider,
594 |                     "context_name": request.meta.get("playwright_context"),
595 |                     "scrapy_request_url": request.url,
596 |                     "scrapy_request_method": request.method,
597 |                 },
598 |             )
599 |             await download_ready.wait()
600 |         finally:
601 |             page.remove_listener("download", _handle_download)
602 |             page.remove_listener("response", _handle_response)
603 | 
604 |         return response, download if download else None
605 | 
606 |     async def _apply_page_methods(self, page: Page, request: Request, spider: Spider) -> None:
607 |         context_name = request.meta.get("playwright_context")
608 |         page_methods = request.meta.get("playwright_page_methods") or ()
609 |         if isinstance(page_methods, dict):
610 |             page_methods = page_methods.values()
611 |         for pm in page_methods:
612 |             if isinstance(pm, PageMethod):
613 |                 try:
614 |                     if callable(pm.method):
615 |                         method = partial(pm.method, page)
616 |                     else:
617 |                         method = getattr(page, pm.method)
618 |                 except AttributeError as ex:
619 |                     logger.warning(
620 |                         "Ignoring %r: could not find method",
621 |                         pm,
622 |                         extra={
623 |                             "spider": spider,
624 |                             "context_name": context_name,
625 |                             "scrapy_request_url": request.url,
626 |                             "scrapy_request_method": request.method,
627 |                             "exception": ex,
628 |                         },
629 |                         exc_info=True,
630 |                     )
631 |                 else:
632 |                     pm.result = await _maybe_await(method(*pm.args, **pm.kwargs))
633 |                     await page.wait_for_load_state(timeout=self.config.navigation_timeout)
634 |             else:
635 |                 logger.warning(
636 |                     "Ignoring %r: expected PageMethod, got %r",
637 |                     pm,
638 |                     type(pm),
639 |                     extra={
640 |                         "spider": spider,
641 |                         "context_name": context_name,
642 |                         "scrapy_request_url": request.url,
643 |                         "scrapy_request_method": request.method,
644 |                     },
645 |                 )
646 | 
647 |     def _increment_request_stats(self, request: PlaywrightRequest) -> None:
648 |         stats_prefix = "playwright/request_count"
649 |         self.stats.inc_value(stats_prefix)
650 |         self.stats.inc_value(f"{stats_prefix}/resource_type/{request.resource_type}")
651 |         self.stats.inc_value(f"{stats_prefix}/method/{request.method}")
652 |         if request.is_navigation_request():
653 |             self.stats.inc_value(f"{stats_prefix}/navigation")
654 | 
655 |     def _increment_response_stats(self, response: PlaywrightResponse) -> None:
656 |         stats_prefix = "playwright/response_count"
657 |         self.stats.inc_value(stats_prefix)
658 |         self.stats.inc_value(f"{stats_prefix}/resource_type/{response.request.resource_type}")
659 |         self.stats.inc_value(f"{stats_prefix}/method/{response.request.method}")
660 | 
661 |     async def _browser_disconnected_callback(self) -> None:
662 |         close_context_coros = [
663 |             ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values()
664 |         ]
665 |         self.context_wrappers.clear()
666 |         with suppress(TargetClosedError):
667 |             await asyncio.gather(*close_context_coros)
668 |         logger.debug("Browser disconnected")
669 |         if self.config.restart_disconnected_browser:
670 |             del self.browser
671 | 
672 |     def _make_close_page_callback(self, context_name: str) -> Callable:
673 |         def close_page_callback() -> None:
674 |             if context_name in self.context_wrappers:
675 |                 self.context_wrappers[context_name].semaphore.release()
676 | 
677 |         return close_page_callback
678 | 
679 |     def _make_close_browser_context_callback(
680 |         self, name: str, persistent: bool, remote: bool, spider: Optional[Spider] = None
681 |     ) -> Callable:
682 |         def close_browser_context_callback() -> None:
683 |             self.context_wrappers.pop(name, None)
684 |             if hasattr(self, "context_semaphore"):
685 |                 self.context_semaphore.release()
686 |             logger.debug(
687 |                 "Browser context closed: '%s' (persistent=%s, remote=%s)",
688 |                 name,
689 |                 persistent,
690 |                 remote,
691 |                 extra={
692 |                     "spider": spider,
693 |                     "context_name": name,
694 |                     "persistent": persistent,
695 |                     "remote": remote,
696 |                 },
697 |             )
698 | 
699 |         return close_browser_context_callback
700 | 
701 |     def _make_request_handler(
702 |         self,
703 |         context_name: str,
704 |         method: str,
705 |         url: str,
706 |         headers: Headers,
707 |         body: Optional[bytes],
708 |         encoding: str,
709 |         spider: Spider,
710 |         initial_request_done: asyncio.Event,
711 |     ) -> Callable:
712 |         async def _request_handler(route: Route, playwright_request: PlaywrightRequest) -> None:
713 |             """Override request headers, method and body."""
714 |             if self.abort_request:
715 |                 should_abort = await _maybe_await(self.abort_request(playwright_request))
716 |                 if should_abort:
717 |                     await route.abort()
718 |                     logger.debug(
719 |                         "[Context=%s] Aborted Playwright request <%s %s>",
720 |                         context_name,
721 |                         playwright_request.method.upper(),
722 |                         playwright_request.url,
723 |                         extra={
724 |                             "spider": spider,
725 |                             "context_name": context_name,
726 |                             "scrapy_request_url": url,
727 |                             "scrapy_request_method": method,
728 |                             "playwright_request_url": playwright_request.url,
729 |                             "playwright_request_method": playwright_request.method,
730 |                         },
731 |                     )
732 |                     self.stats.inc_value("playwright/request_count/aborted")
733 |                     return None
734 | 
735 |             overrides: dict = {}
736 | 
737 |             if self.process_request_headers is None:
738 |                 final_headers = await playwright_request.all_headers()
739 |             elif (sig := inspect.signature(self.process_request_headers)) and (
740 |                 "browser_type_name" in sig.parameters
741 |                 and "playwright_request" in sig.parameters
742 |                 and "scrapy_request_data" in sig.parameters
743 |             ):
744 |                 overrides["headers"] = final_headers = await _maybe_await(
745 |                     self.process_request_headers(
746 |                         browser_type_name=self.config.browser_type_name,
747 |                         playwright_request=playwright_request,
748 |                         scrapy_request_data={
749 |                             "method": method,
750 |                             "url": url,
751 |                             "headers": headers,
752 |                             "body": body,
753 |                             "encoding": encoding,
754 |                         },
755 |                     )
756 |                 )
757 |             else:
758 |                 warnings.warn(
759 |                     "Accepting positional arguments in the function passed to the"
760 |                     " PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function"
761 |                     " should accept three (3) keyword arguments instead:"
762 |                     " browser_type_name: str,"
763 |                     " playwright_request: playwright.async_api.Request,"
764 |                     " scrapy_request_data: dict",
765 |                     category=ScrapyDeprecationWarning,
766 |                     stacklevel=1,
767 |                 )
768 |                 overrides["headers"] = final_headers = await _maybe_await(
769 |                     self.process_request_headers(
770 |                         self.config.browser_type_name,
771 |                         playwright_request,
772 |                         headers,
773 |                     )
774 |                 )
775 | 
776 |             # if the current request corresponds to the original scrapy one
777 |             if (
778 |                 playwright_request.url.rstrip("/") == url.rstrip("/")
779 |                 and playwright_request.is_navigation_request()
780 |                 and not initial_request_done.is_set()
781 |             ):
782 |                 initial_request_done.set()
783 |                 if method.upper() != playwright_request.method.upper():
784 |                     overrides["method"] = method
785 |                 if body:
786 |                     overrides["post_data"] = body.decode(encoding)
787 |                 # the request that reaches the callback should contain the final headers
788 |                 headers.clear()
789 |                 headers.update(final_headers)
790 | 
791 |             del final_headers
792 | 
793 |             original_playwright_method: str = playwright_request.method
794 |             try:
795 |                 await route.continue_(**overrides)
796 |                 if overrides.get("method"):
797 |                     logger.debug(
798 |                         "[Context=%s] Overridden method for Playwright request"
799 |                         " to %s: original=%s new=%s",
800 |                         context_name,
801 |                         playwright_request.url,
802 |                         original_playwright_method,
803 |                         overrides["method"],
804 |                         extra={
805 |                             "spider": spider,
806 |                             "context_name": context_name,
807 |                             "scrapy_request_url": url,
808 |                             "scrapy_request_method": method,
809 |                             "playwright_request_url": playwright_request.url,
810 |                             "playwright_request_method_original": original_playwright_method,
811 |                             "playwright_request_method_new": overrides["method"],
812 |                         },
813 |                     )
814 |             except PlaywrightError as ex:
815 |                 if _is_safe_close_error(ex):
816 |                     logger.warning(
817 |                         "Failed processing Playwright request: <%s %s> exc_type=%s exc_msg=%s",
818 |                         playwright_request.method,
819 |                         playwright_request.url,
820 |                         type(ex),
821 |                         str(ex),
822 |                         extra={
823 |                             "spider": spider,
824 |                             "context_name": context_name,
825 |                             "scrapy_request_url": url,
826 |                             "scrapy_request_method": method,
827 |                             "playwright_request_url": playwright_request.url,
828 |                             "playwright_request_method": playwright_request.method,
829 |                             "exception": ex,
830 |                         },
831 |                         exc_info=True,
832 |                     )
833 |                 else:
834 |                     raise
835 | 
836 |         return _request_handler
837 | 
838 | 
839 | def _attach_page_event_handlers(
840 |     page: Page, request: Request, spider: Spider, context_name: str
841 | ) -> None:
842 |     event_handlers = request.meta.get("playwright_page_event_handlers") or {}
843 |     for event, handler in event_handlers.items():
844 |         if callable(handler):
845 |             page.on(event, handler)
846 |         elif isinstance(handler, str):
847 |             try:
848 |                 page.on(event, getattr(spider, handler))
849 |             except AttributeError as ex:
850 |                 logger.warning(
851 |                     "Spider '%s' does not have a '%s' attribute,"
852 |                     " ignoring handler for event '%s'",
853 |                     spider.name,
854 |                     handler,
855 |                     event,
856 |                     extra={
857 |                         "spider": spider,
858 |                         "context_name": context_name,
859 |                         "scrapy_request_url": request.url,
860 |                         "scrapy_request_method": request.method,
861 |                         "exception": ex,
862 |                     },
863 |                     exc_info=True,
864 |                 )
865 | 
866 | 
867 | async def _set_redirect_meta(request: Request, response: PlaywrightResponse) -> None:
868 |     """Update a Scrapy request with metadata about redirects."""
869 |     redirect_times: int = 0
870 |     redirect_urls: list = []
871 |     redirect_reasons: list = []
872 |     redirected = response.request.redirected_from
873 |     while redirected is not None:
874 |         redirect_times += 1
875 |         redirect_urls.append(redirected.url)
876 |         redirected_response = await redirected.response()
877 |         reason = None if redirected_response is None else redirected_response.status
878 |         redirect_reasons.append(reason)
879 |         redirected = redirected.redirected_from
880 |     if redirect_times:
881 |         request.meta["redirect_times"] = redirect_times
882 |         request.meta["redirect_urls"] = list(reversed(redirect_urls))
883 |         request.meta["redirect_reasons"] = list(reversed(redirect_reasons))
884 | 
885 | 
886 | async def _maybe_execute_page_init_callback(
887 |     page: Page,
888 |     request: Request,
889 |     context_name: str,
890 |     spider: Spider,
891 | ) -> None:
892 |     page_init_callback = request.meta.get("playwright_page_init_callback")
893 |     if page_init_callback:
894 |         try:
895 |             page_init_callback = load_object(page_init_callback)
896 |             await page_init_callback(page, request)
897 |         except Exception as ex:
898 |             logger.warning(
899 |                 "[Context=%s] Page init callback exception for %s exc_type=%s exc_msg=%s",
900 |                 context_name,
901 |                 repr(request),
902 |                 type(ex),
903 |                 str(ex),
904 |                 extra={
905 |                     "spider": spider,
906 |                     "context_name": context_name,
907 |                     "scrapy_request_url": request.url,
908 |                     "scrapy_request_method": request.method,
909 |                     "exception": ex,
910 |                 },
911 |                 exc_info=True,
912 |             )
913 | 
914 | 
915 | def _make_request_logger(context_name: str, spider: Spider) -> Callable:
916 |     async def _log_request(request: PlaywrightRequest) -> None:
917 |         log_args = [context_name, request.method.upper(), request.url, request.resource_type]
918 |         referrer = await _get_header_value(request, "referer")
919 |         if referrer:
920 |             log_args.append(referrer)
921 |             log_msg = "[Context=%s] Request: <%s %s> (resource type: %s, referrer: %s)"
922 |         else:
923 |             log_msg = "[Context=%s] Request: <%s %s> (resource type: %s)"
924 |         logger.debug(
925 |             log_msg,
926 |             *log_args,
927 |             extra={
928 |                 "spider": spider,
929 |                 "context_name": context_name,
930 |                 "playwright_request_url": request.url,
931 |                 "playwright_request_method": request.method,
932 |                 "playwright_resource_type": request.resource_type,
933 |             },
934 |         )
935 | 
936 |     return _log_request
937 | 
938 | 
939 | def _make_response_logger(context_name: str, spider: Spider) -> Callable:
940 |     async def _log_response(response: PlaywrightResponse) -> None:
941 |         log_args = [context_name, response.status, response.url]
942 |         location = await _get_header_value(response, "location")
943 |         if location:
944 |             log_args.append(location)
945 |             log_msg = "[Context=%s] Response: <%i %s> (location: %s)"
946 |         else:
947 |             log_msg = "[Context=%s] Response: <%i %s>"
948 |         logger.debug(
949 |             log_msg,
950 |             *log_args,
951 |             extra={
952 |                 "spider": spider,
953 |                 "context_name": context_name,
954 |                 "playwright_response_url": response.url,
955 |                 "playwright_response_status": response.status,
956 |             },
957 |         )
958 | 
959 |     return _log_response
960 | 


--------------------------------------------------------------------------------
/scrapy_playwright/headers.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module includes functions to process request headers.
 3 | Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
 4 | """
 5 | 
 6 | from typing import Dict
 7 | from urllib.parse import urlparse
 8 | 
 9 | from playwright.async_api import Request as PlaywrightRequest
10 | 
11 | 
12 | async def use_scrapy_headers(
13 |     *,
14 |     browser_type_name: str,
15 |     playwright_request: PlaywrightRequest,
16 |     scrapy_request_data: dict,
17 | ) -> Dict[str, str]:
18 |     """Scrapy headers take precedence over Playwright headers for navigation requests.
19 |     For non-navigation requests, only User-Agent is taken from the Scrapy headers."""
20 | 
21 |     scrapy_headers_str = scrapy_request_data["headers"].to_unicode_dict()
22 |     playwright_headers = await playwright_request.all_headers()
23 | 
24 |     # Scrapy's user agent has priority over Playwright's
25 |     scrapy_headers_str.setdefault("user-agent", playwright_headers.get("user-agent"))
26 | 
27 |     if playwright_request.is_navigation_request():
28 |         # if referer header is set via playwright_page_goto_kwargs
29 |         if referer := playwright_headers.get("referer"):
30 |             scrapy_headers_str.setdefault("referer", referer)
31 | 
32 |         # otherwise it fails with playwright.helper.Error: NS_ERROR_NET_RESET
33 |         if browser_type_name == "firefox":
34 |             scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc
35 | 
36 |         return scrapy_headers_str
37 | 
38 |     # override user agent, for consistency with other requests
39 |     if scrapy_headers_str.get("user-agent"):
40 |         playwright_headers["user-agent"] = scrapy_headers_str["user-agent"]
41 |     return playwright_headers
42 | 


--------------------------------------------------------------------------------
/scrapy_playwright/memusage.py:
--------------------------------------------------------------------------------
 1 | from contextlib import suppress
 2 | from importlib import import_module
 3 | from typing import List
 4 | 
 5 | from scrapy.exceptions import NotConfigured
 6 | from scrapy.extensions.memusage import MemoryUsage
 7 | 
 8 | from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler, logger
 9 | 
10 | 
11 | _MIB_FACTOR = 1024**2
12 | 
13 | 
14 | class ScrapyPlaywrightMemoryUsageExtension(MemoryUsage):
15 |     def __init__(self, *args, **kwargs) -> None:
16 |         super().__init__(*args, **kwargs)
17 |         try:
18 |             self.psutil = import_module("psutil")
19 |         except ImportError as exc:
20 |             raise NotConfigured("The psutil module is not available") from exc
21 | 
22 |     def _get_main_process_ids(self) -> List[int]:
23 |         try:
24 |             return [
25 |                 handler.playwright_context_manager._connection._transport._proc.pid
26 |                 for handler in self.crawler.engine.downloader.handlers._handlers.values()
27 |                 if isinstance(handler, ScrapyPlaywrightDownloadHandler)
28 |                 and handler.playwright_context_manager
29 |             ]
30 |         except Exception:
31 |             return []
32 | 
33 |     def _get_descendant_processes(self, process) -> list:
34 |         children = process.children()
35 |         result = children.copy()
36 |         for child in children:
37 |             result.extend(self._get_descendant_processes(child))
38 |         return result
39 | 
40 |     def _get_total_playwright_process_memory(self) -> int:
41 |         process_list = [self.psutil.Process(pid) for pid in self._get_main_process_ids()]
42 |         for proc in process_list.copy():
43 |             process_list.extend(self._get_descendant_processes(proc))
44 |         total_process_size = 0
45 |         for proc in process_list:
46 |             with suppress(Exception):  # might fail if the process exited in the meantime
47 |                 total_process_size += proc.memory_info().rss
48 |         logger.debug(
49 |             "Total Playwright process memory: %i Bytes (%i MiB)",
50 |             total_process_size,
51 |             total_process_size / _MIB_FACTOR,
52 |         )
53 |         return total_process_size
54 | 
55 |     def get_virtual_size(self) -> int:
56 |         return super().get_virtual_size() + self._get_total_playwright_process_memory()
57 | 


--------------------------------------------------------------------------------
/scrapy_playwright/page.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Callable, Union
 2 | 
 3 | 
 4 | __all__ = ["PageMethod"]
 5 | 
 6 | 
 7 | class PageMethod:
 8 |     """
 9 |     Represents a method to be called (and awaited if necessary) on a
10 |     Playwright page, such as "click", "screenshot", "evaluate", etc.
11 | 
12 |     If a callable is received, it will be called with the page as its first argument.
13 |     Any additional arguments are passed to the callable after the page.
14 |     """
15 | 
16 |     def __init__(self, method: Union[str, Callable], *args, **kwargs) -> None:
17 |         self.method: Union[str, Callable] = method
18 |         self.args: tuple = args
19 |         self.kwargs: dict = kwargs
20 |         self.result: Any = None
21 | 
22 |     def __str__(self) -> str:
23 |         return f"<{self.__class__.__name__} for method '{self.method}'>"
24 | 
25 |     __repr__ = __str__
26 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 99
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | from scrapy_playwright import __version__
 4 | 
 5 | 
 6 | with open("README.md", "r", encoding="utf-8") as fh:
 7 |     long_description = fh.read()
 8 | 
 9 | 
10 | setuptools.setup(
11 |     name="scrapy-playwright",
12 |     version=__version__,
13 |     license="BSD",
14 |     description="Playwright integration for Scrapy",
15 |     long_description=long_description,
16 |     long_description_content_type="text/markdown",
17 |     author="Eugenio Lacuesta",
18 |     author_email="eugenio.lacuesta@gmail.com",
19 |     url="https://github.com/scrapy-plugins/scrapy-playwright",
20 |     packages=["scrapy_playwright"],
21 |     classifiers=[
22 |         "Development Status :: 4 - Beta",
23 |         "License :: OSI Approved :: BSD License",
24 |         "Programming Language :: Python",
25 |         "Programming Language :: Python :: 3.8",
26 |         "Programming Language :: Python :: 3.9",
27 |         "Programming Language :: Python :: 3.10",
28 |         "Programming Language :: Python :: 3.11",
29 |         "Programming Language :: Python :: 3.12",
30 |         "Framework :: Scrapy",
31 |         "Intended Audience :: Developers",
32 |         "Topic :: Internet :: WWW/HTTP",
33 |         "Topic :: Software Development :: Libraries :: Application Frameworks",
34 |         "Topic :: Software Development :: Libraries :: Python Modules",
35 |     ],
36 |     python_requires=">=3.8",
37 |     install_requires=[
38 |         "scrapy>=2.0,!=2.4.0",
39 |         "playwright>=1.15",
40 |     ],
41 | )
42 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import inspect
 3 | import logging
 4 | import platform
 5 | from contextlib import asynccontextmanager
 6 | from functools import wraps
 7 | from typing import Optional
 8 | 
 9 | from scrapy import Request
10 | from scrapy.http.response.html import HtmlResponse
11 | from scrapy.utils.test import get_crawler
12 | 
13 | 
14 | logger = logging.getLogger("scrapy-playwright-tests")
15 | 
16 | 
17 | if platform.system() == "Windows":
18 |     from scrapy_playwright._utils import _ThreadedLoopAdapter
19 | 
20 |     def allow_windows(test_method):
21 |         """Wrap tests with the _ThreadedLoopAdapter class on Windows."""
22 |         if not inspect.iscoroutinefunction(test_method):
23 |             raise RuntimeError(f"{test_method} must be an async def method")
24 | 
25 |         @wraps(test_method)
26 |         async def wrapped(self, *args, **kwargs):
27 |             caller_id = 1234
28 |             _ThreadedLoopAdapter.start(caller_id)
29 |             coro = test_method(self, *args, **kwargs)
30 |             asyncio.run_coroutine_threadsafe(coro=coro, loop=_ThreadedLoopAdapter._loop).result()
31 |             _ThreadedLoopAdapter.stop(caller_id)
32 | 
33 |         return wrapped
34 | 
35 | else:
36 | 
37 |     def allow_windows(test_method):
38 |         return test_method
39 | 
40 | 
41 | @asynccontextmanager
42 | async def make_handler(settings_dict: Optional[dict] = None):
43 |     """Convenience function to obtain an initialized handler and close it gracefully"""
44 |     from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler
45 | 
46 |     settings: dict = settings_dict or {}
47 |     settings.setdefault("TELNETCONSOLE_ENABLED", False)
48 |     crawler = get_crawler(settings_dict=settings)
49 |     handler = ScrapyPlaywrightDownloadHandler(crawler=crawler)
50 |     try:
51 |         await handler._launch()
52 |     except:  # noqa (E722), pylint: disable=bare-except
53 |         pass
54 |     else:
55 |         yield handler
56 |     finally:
57 |         await handler._close()
58 | 
59 | 
60 | def assert_correct_response(response: HtmlResponse, request: Request) -> None:
61 |     assert isinstance(response, HtmlResponse)
62 |     assert response.request is request
63 |     assert response.url == request.url
64 |     assert response.status == 200
65 |     assert "playwright" in response.flags
66 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import platform
 2 | 
 3 | import pytest
 4 | 
 5 | 
 6 | @pytest.hookimpl(tryfirst=True)
 7 | def pytest_configure(config):
 8 |     # https://twistedmatrix.com/trac/ticket/9766
 9 |     # https://github.com/pytest-dev/pytest-twisted/issues/80
10 | 
11 |     if config.getoption("reactor", "default") == "asyncio" and platform.system() == "Windows":
12 |         import asyncio
13 | 
14 |         selector_policy = asyncio.WindowsSelectorEventLoopPolicy()
15 |         asyncio.set_event_loop_policy(selector_policy)
16 | 
17 | 
18 | def pytest_sessionstart(session):  # pylint: disable=unused-argument
19 |     """
20 |     Called after the Session object has been created and before performing
21 |     collection and entering the run test loop.
22 |     """
23 |     from twisted.internet.asyncioreactor import install, AsyncioSelectorReactor
24 |     from twisted.internet.error import ReactorAlreadyInstalledError
25 | 
26 |     try:
27 |         install()
28 |     except ReactorAlreadyInstalledError as exc:
29 |         from twisted.internet import reactor
30 | 
31 |         if not isinstance(reactor, AsyncioSelectorReactor):
32 |             raise RuntimeError(f"Wrong reactor installed: {type(reactor)}") from exc
33 | 


--------------------------------------------------------------------------------
/tests/launch_chromium_server.js:
--------------------------------------------------------------------------------
 1 | // used to start a browser server to test the PLAYWRIGHT_CONNECT_URL setting
 2 | // usage:
 3 | //   node launch_browser_server.js PORT WS_PATH
 4 | 
 5 | const { chromium } = require('playwright');  // Or 'webkit' or 'firefox'.
 6 | 
 7 | (async () => {
 8 |     const browserServer = await chromium.launchServer({
 9 |         host: 'localhost',
10 |         port: process.argv[2],
11 |         wsPath: process.argv[3]
12 |     });
13 |     console.log(browserServer.wsEndpoint())
14 | })();
15 | 


--------------------------------------------------------------------------------
/tests/mockserver.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import re
  3 | import sys
  4 | import time
  5 | from http.server import HTTPServer, BaseHTTPRequestHandler
  6 | from pathlib import Path
  7 | from subprocess import Popen, PIPE
  8 | from threading import Thread
  9 | from typing import Optional
 10 | from urllib.parse import urljoin, urlparse, parse_qs
 11 | 
 12 | 
 13 | class StaticMockServer:
 14 |     """A web server that serves the contents of the sibling "site" directory.
 15 |     To be used as a context manager:
 16 | 
 17 |         with StaticMockServer() as server:
 18 |             url = server.urljoin("/index.html")
 19 |             ...
 20 |     """
 21 | 
 22 |     def __enter__(self):
 23 |         self.proc = Popen(
 24 |             [sys.executable, "-u", "-m", "http.server", "0", "--bind", "127.0.0.1"],
 25 |             stdout=PIPE,
 26 |             cwd=str(Path(__file__).absolute().parent / "site"),
 27 |         )
 28 |         self.address, self.port = re.search(
 29 |             r"^Serving HTTP on (\d+\.\d+\.\d+\.\d+) port (\d+)",
 30 |             self.proc.stdout.readline().strip().decode("ascii"),
 31 |         ).groups()
 32 |         return self
 33 | 
 34 |     def __exit__(self, exc_type, exc_value, traceback):
 35 |         self.proc.kill()
 36 |         self.proc.communicate()
 37 | 
 38 |     def urljoin(self, url):
 39 |         return urljoin(f"http://{self.address}:{self.port}", url)
 40 | 
 41 | 
 42 | class _RequestHandler(BaseHTTPRequestHandler):
 43 |     def do_POST(self) -> None:
 44 |         """Echo back the request body"""
 45 |         content_length = int(self.headers.get("Content-Length") or 0)
 46 |         body_bytes = b"Request body: " + self.rfile.read(content_length)
 47 |         self.send_response(200)
 48 |         self.send_header("Content-Length", str(len(body_bytes)))
 49 |         self.end_headers()
 50 |         self.wfile.write(body_bytes)
 51 | 
 52 |     def do_GET(self) -> None:
 53 |         parsed_path = urlparse(self.path)
 54 |         query_string = {key: values[0] for key, values in parse_qs(parsed_path.query).items()}
 55 | 
 56 |         if delay := int(query_string.get("delay") or 0):
 57 |             print(f"Sleeping {delay} seconds on path {parsed_path.path}...")
 58 |             time.sleep(delay)
 59 | 
 60 |         if parsed_path.path == "/headers":
 61 |             self._send_json(dict(self.headers))
 62 |         elif parsed_path.path == "/status/204":
 63 |             self.send_response(204)
 64 |             self.end_headers()
 65 |         elif parsed_path.path == "/redirect2":
 66 |             self.send_response(302)
 67 |             self.send_header("Content-Length", "0")
 68 |             self.send_header("Location", "/redirect")
 69 |             self.end_headers()
 70 |         elif parsed_path.path == "/redirect":
 71 |             self.send_response(301)
 72 |             self.send_header("Content-Length", "0")
 73 |             self.send_header("Location", "/headers")
 74 |             self.end_headers()
 75 |         elif parsed_path.path == "/mancha.pdf":
 76 |             body_bytes = (Path(__file__).absolute().parent / "site/files/mancha.pdf").read_bytes()
 77 |             content_length_multiplier = int(query_string.get("content_length_multiplier") or 1)
 78 |             self.send_response(200)
 79 |             self.send_header("Content-Type", "application/pdf")
 80 |             self.send_header("Content-Disposition", 'attachment; filename="mancha.pdf"')
 81 |             self.send_header("Content-Length", str(len(body_bytes) * content_length_multiplier))
 82 |             self.end_headers()
 83 |             self.wfile.write(body_bytes)
 84 |         else:
 85 |             self._send_json({"error": "unknown path"}, status=404)
 86 | 
 87 |     def _send_json(self, body: dict, status: int = 200) -> None:
 88 |         body_bytes = json.dumps(body, indent=2).encode("utf8")
 89 |         self.send_response(status)
 90 |         self.send_header("Content-Length", str(len(body_bytes)))
 91 |         self.send_header("Content-Type", "application/json")
 92 |         self.end_headers()
 93 |         self.wfile.write(body_bytes)
 94 | 
 95 | 
 96 | class MockServer:
 97 |     """A context manager web server using the _RequestHandler class to handle requests."""
 98 | 
 99 |     def __enter__(self):
100 |         self.httpd = HTTPServer(("127.0.0.1", 0), _RequestHandler)
101 |         self.address, self.port = self.httpd.server_address
102 |         self.thread = Thread(target=self.httpd.serve_forever)
103 |         self.thread.start()
104 |         return self
105 | 
106 |     def __exit__(self, exc_type, exc_value, traceback):
107 |         self.httpd.shutdown()
108 |         self.thread.join()
109 | 
110 |     def urljoin(self, url: Optional[str] = None) -> str:
111 |         return urljoin(f"http://{self.address}:{self.port}", url)
112 | 
113 | 
114 | if __name__ == "__main__":
115 |     with MockServer() as server:
116 |         print(f"Listening at http://{server.address}:{server.port}")
117 |         while True:
118 |             pass
119 | 


--------------------------------------------------------------------------------
/tests/site/data/quotes1.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "has_next": true,
  3 |   "page": 1,
  4 |   "quotes": [
  5 |     {
  6 |       "author": {
  7 |         "goodreads_link": "/author/show/9810.Albert_Einstein",
  8 |         "name": "Albert Einstein",
  9 |         "slug": "Albert-Einstein"
 10 |       },
 11 |       "tags": [
 12 |         "change",
 13 |         "deep-thoughts",
 14 |         "thinking",
 15 |         "world"
 16 |       ],
 17 |       "text": "\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d"
 18 |     },
 19 |     {
 20 |       "author": {
 21 |         "goodreads_link": "/author/show/1077326.J_K_Rowling",
 22 |         "name": "J.K. Rowling",
 23 |         "slug": "J-K-Rowling"
 24 |       },
 25 |       "tags": [
 26 |         "abilities",
 27 |         "choices"
 28 |       ],
 29 |       "text": "\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d"
 30 |     },
 31 |     {
 32 |       "author": {
 33 |         "goodreads_link": "/author/show/9810.Albert_Einstein",
 34 |         "name": "Albert Einstein",
 35 |         "slug": "Albert-Einstein"
 36 |       },
 37 |       "tags": [
 38 |         "inspirational",
 39 |         "life",
 40 |         "live",
 41 |         "miracle",
 42 |         "miracles"
 43 |       ],
 44 |       "text": "\u201cThere are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.\u201d"
 45 |     },
 46 |     {
 47 |       "author": {
 48 |         "goodreads_link": "/author/show/1265.Jane_Austen",
 49 |         "name": "Jane Austen",
 50 |         "slug": "Jane-Austen"
 51 |       },
 52 |       "tags": [
 53 |         "aliteracy",
 54 |         "books",
 55 |         "classic",
 56 |         "humor"
 57 |       ],
 58 |       "text": "\u201cThe person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.\u201d"
 59 |     },
 60 |     {
 61 |       "author": {
 62 |         "goodreads_link": "/author/show/82952.Marilyn_Monroe",
 63 |         "name": "Marilyn Monroe",
 64 |         "slug": "Marilyn-Monroe"
 65 |       },
 66 |       "tags": [
 67 |         "be-yourself",
 68 |         "inspirational"
 69 |       ],
 70 |       "text": "\u201cImperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.\u201d"
 71 |     },
 72 |     {
 73 |       "author": {
 74 |         "goodreads_link": "/author/show/9810.Albert_Einstein",
 75 |         "name": "Albert Einstein",
 76 |         "slug": "Albert-Einstein"
 77 |       },
 78 |       "tags": [
 79 |         "adulthood",
 80 |         "success",
 81 |         "value"
 82 |       ],
 83 |       "text": "\u201cTry not to become a man of success. Rather become a man of value.\u201d"
 84 |     },
 85 |     {
 86 |       "author": {
 87 |         "goodreads_link": "/author/show/7617.Andr_Gide",
 88 |         "name": "Andr\u00e9 Gide",
 89 |         "slug": "Andre-Gide"
 90 |       },
 91 |       "tags": [
 92 |         "life",
 93 |         "love"
 94 |       ],
 95 |       "text": "\u201cIt is better to be hated for what you are than to be loved for what you are not.\u201d"
 96 |     },
 97 |     {
 98 |       "author": {
 99 |         "goodreads_link": "/author/show/3091287.Thomas_A_Edison",
100 |         "name": "Thomas A. Edison",
101 |         "slug": "Thomas-A-Edison"
102 |       },
103 |       "tags": [
104 |         "edison",
105 |         "failure",
106 |         "inspirational",
107 |         "paraphrased"
108 |       ],
109 |       "text": "\u201cI have not failed. I've just found 10,000 ways that won't work.\u201d"
110 |     },
111 |     {
112 |       "author": {
113 |         "goodreads_link": "/author/show/44566.Eleanor_Roosevelt",
114 |         "name": "Eleanor Roosevelt",
115 |         "slug": "Eleanor-Roosevelt"
116 |       },
117 |       "tags": [
118 |         "misattributed-eleanor-roosevelt"
119 |       ],
120 |       "text": "\u201cA woman is like a tea bag; you never know how strong it is until it's in hot water.\u201d"
121 |     },
122 |     {
123 |       "author": {
124 |         "goodreads_link": "/author/show/7103.Steve_Martin",
125 |         "name": "Steve Martin",
126 |         "slug": "Steve-Martin"
127 |       },
128 |       "tags": [
129 |         "humor",
130 |         "obvious",
131 |         "simile"
132 |       ],
133 |       "text": "\u201cA day without sunshine is like, you know, night.\u201d"
134 |     }
135 |   ],
136 |   "tag": null,
137 |   "top_ten_tags": [
138 |     [
139 |       "love",
140 |       14
141 |     ],
142 |     [
143 |       "inspirational",
144 |       13
145 |     ],
146 |     [
147 |       "life",
148 |       13
149 |     ],
150 |     [
151 |       "humor",
152 |       12
153 |     ],
154 |     [
155 |       "books",
156 |       11
157 |     ],
158 |     [
159 |       "reading",
160 |       7
161 |     ],
162 |     [
163 |       "friendship",
164 |       5
165 |     ],
166 |     [
167 |       "friends",
168 |       4
169 |     ],
170 |     [
171 |       "truth",
172 |       4
173 |     ],
174 |     [
175 |       "simile",
176 |       3
177 |     ]
178 |   ]
179 | }


--------------------------------------------------------------------------------
/tests/site/data/quotes2.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "has_next": true,
  3 |   "page": 2,
  4 |   "quotes": [
  5 |     {
  6 |       "author": {
  7 |         "goodreads_link": "/author/show/82952.Marilyn_Monroe",
  8 |         "name": "Marilyn Monroe",
  9 |         "slug": "Marilyn-Monroe"
 10 |       },
 11 |       "tags": [
 12 |         "friends",
 13 |         "heartbreak",
 14 |         "inspirational",
 15 |         "life",
 16 |         "love",
 17 |         "sisters"
 18 |       ],
 19 |       "text": "\u201cThis life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go of them. Also remember, sisters make the best friends in the world. As for lovers, well, they'll come and go too. And baby, I hate to say it, most of them - actually pretty much all of them are going to break your heart, but you can't give up because if you give up, you'll never find your soulmate. You'll never find that half who makes you whole and that goes for everything. Just because you fail once, doesn't mean you're gonna fail at everything. Keep trying, hold on, and always, always, always believe in yourself, because if you don't, then who will, sweetie? So keep your head high, keep your chin up, and most importantly, keep smiling, because life's a beautiful thing and there's so much to smile about.\u201d"
 20 |     },
 21 |     {
 22 |       "author": {
 23 |         "goodreads_link": "/author/show/1077326.J_K_Rowling",
 24 |         "name": "J.K. Rowling",
 25 |         "slug": "J-K-Rowling"
 26 |       },
 27 |       "tags": [
 28 |         "courage",
 29 |         "friends"
 30 |       ],
 31 |       "text": "\u201cIt takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.\u201d"
 32 |     },
 33 |     {
 34 |       "author": {
 35 |         "goodreads_link": "/author/show/9810.Albert_Einstein",
 36 |         "name": "Albert Einstein",
 37 |         "slug": "Albert-Einstein"
 38 |       },
 39 |       "tags": [
 40 |         "simplicity",
 41 |         "understand"
 42 |       ],
 43 |       "text": "\u201cIf you can't explain it to a six year old, you don't understand it yourself.\u201d"
 44 |     },
 45 |     {
 46 |       "author": {
 47 |         "goodreads_link": "/author/show/25241.Bob_Marley",
 48 |         "name": "Bob Marley",
 49 |         "slug": "Bob-Marley"
 50 |       },
 51 |       "tags": [
 52 |         "love"
 53 |       ],
 54 |       "text": "\u201cYou may not be her first, her last, or her only. She loved before she may love again. But if she loves you now, what else matters? She's not perfect\u2014you aren't either, and the two of you may never be perfect together but if she can make you laugh, cause you to think twice, and admit to being human and making mistakes, hold onto her and give her the most you can. She may not be thinking about you every second of the day, but she will give you a part of her that she knows you can break\u2014her heart. So don't hurt her, don't change her, don't analyze and don't expect more than she can give. Smile when she makes you happy, let her know when she makes you mad, and miss her when she's not there.\u201d"
 55 |     },
 56 |     {
 57 |       "author": {
 58 |         "goodreads_link": "/author/show/61105.Dr_Seuss",
 59 |         "name": "Dr. Seuss",
 60 |         "slug": "Dr-Seuss"
 61 |       },
 62 |       "tags": [
 63 |         "fantasy"
 64 |       ],
 65 |       "text": "\u201cI like nonsense, it wakes up the brain cells. Fantasy is a necessary ingredient in living.\u201d"
 66 |     },
 67 |     {
 68 |       "author": {
 69 |         "goodreads_link": "/author/show/4.Douglas_Adams",
 70 |         "name": "Douglas Adams",
 71 |         "slug": "Douglas-Adams"
 72 |       },
 73 |       "tags": [
 74 |         "life",
 75 |         "navigation"
 76 |       ],
 77 |       "text": "\u201cI may not have gone where I intended to go, but I think I have ended up where I needed to be.\u201d"
 78 |     },
 79 |     {
 80 |       "author": {
 81 |         "goodreads_link": "/author/show/1049.Elie_Wiesel",
 82 |         "name": "Elie Wiesel",
 83 |         "slug": "Elie-Wiesel"
 84 |       },
 85 |       "tags": [
 86 |         "activism",
 87 |         "apathy",
 88 |         "hate",
 89 |         "indifference",
 90 |         "inspirational",
 91 |         "love",
 92 |         "opposite",
 93 |         "philosophy"
 94 |       ],
 95 |       "text": "\u201cThe opposite of love is not hate, it's indifference. The opposite of art is not ugliness, it's indifference. The opposite of faith is not heresy, it's indifference. And the opposite of life is not death, it's indifference.\u201d"
 96 |     },
 97 |     {
 98 |       "author": {
 99 |         "goodreads_link": "/author/show/1938.Friedrich_Nietzsche",
100 |         "name": "Friedrich Nietzsche",
101 |         "slug": "Friedrich-Nietzsche"
102 |       },
103 |       "tags": [
104 |         "friendship",
105 |         "lack-of-friendship",
106 |         "lack-of-love",
107 |         "love",
108 |         "marriage",
109 |         "unhappy-marriage"
110 |       ],
111 |       "text": "\u201cIt is not a lack of love, but a lack of friendship that makes unhappy marriages.\u201d"
112 |     },
113 |     {
114 |       "author": {
115 |         "goodreads_link": "/author/show/1244.Mark_Twain",
116 |         "name": "Mark Twain",
117 |         "slug": "Mark-Twain"
118 |       },
119 |       "tags": [
120 |         "books",
121 |         "contentment",
122 |         "friends",
123 |         "friendship",
124 |         "life"
125 |       ],
126 |       "text": "\u201cGood friends, good books, and a sleepy conscience: this is the ideal life.\u201d"
127 |     },
128 |     {
129 |       "author": {
130 |         "goodreads_link": "/author/show/276029.Allen_Saunders",
131 |         "name": "Allen Saunders",
132 |         "slug": "Allen-Saunders"
133 |       },
134 |       "tags": [
135 |         "fate",
136 |         "life",
137 |         "misattributed-john-lennon",
138 |         "planning",
139 |         "plans"
140 |       ],
141 |       "text": "\u201cLife is what happens to us while we are making other plans.\u201d"
142 |     }
143 |   ],
144 |   "tag": null,
145 |   "top_ten_tags": [
146 |     [
147 |       "love",
148 |       14
149 |     ],
150 |     [
151 |       "inspirational",
152 |       13
153 |     ],
154 |     [
155 |       "life",
156 |       13
157 |     ],
158 |     [
159 |       "humor",
160 |       12
161 |     ],
162 |     [
163 |       "books",
164 |       11
165 |     ],
166 |     [
167 |       "reading",
168 |       7
169 |     ],
170 |     [
171 |       "friendship",
172 |       5
173 |     ],
174 |     [
175 |       "friends",
176 |       4
177 |     ],
178 |     [
179 |       "truth",
180 |       4
181 |     ],
182 |     [
183 |       "simile",
184 |       3
185 |     ]
186 |   ]
187 | }


--------------------------------------------------------------------------------
/tests/site/data/quotes3.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "has_next": false,
  3 |   "page": 3,
  4 |   "quotes": [
  5 |     {
  6 |       "author": {
  7 |         "goodreads_link": "/author/show/4026.Pablo_Neruda",
  8 |         "name": "Pablo Neruda",
  9 |         "slug": "Pablo-Neruda"
 10 |       },
 11 |       "tags": [
 12 |         "love",
 13 |         "poetry"
 14 |       ],
 15 |       "text": "\u201cI love you without knowing how, or when, or from where. I love you simply, without problems or pride: I love you in this way because I do not know any other way of loving but this, in which there is no I or you, so intimate that your hand upon my chest is my hand, so intimate that when I fall asleep your eyes close.\u201d"
 16 |     },
 17 |     {
 18 |       "author": {
 19 |         "goodreads_link": "/author/show/12080.Ralph_Waldo_Emerson",
 20 |         "name": "Ralph Waldo Emerson",
 21 |         "slug": "Ralph-Waldo-Emerson"
 22 |       },
 23 |       "tags": [
 24 |         "happiness"
 25 |       ],
 26 |       "text": "\u201cFor every minute you are angry you lose sixty seconds of happiness.\u201d"
 27 |     },
 28 |     {
 29 |       "author": {
 30 |         "goodreads_link": "/author/show/838305.Mother_Teresa",
 31 |         "name": "Mother Teresa",
 32 |         "slug": "Mother-Teresa"
 33 |       },
 34 |       "tags": [
 35 |         "attributed-no-source"
 36 |       ],
 37 |       "text": "\u201cIf you judge people, you have no time to love them.\u201d"
 38 |     },
 39 |     {
 40 |       "author": {
 41 |         "goodreads_link": "/author/show/2014.Garrison_Keillor",
 42 |         "name": "Garrison Keillor",
 43 |         "slug": "Garrison-Keillor"
 44 |       },
 45 |       "tags": [
 46 |         "humor",
 47 |         "religion"
 48 |       ],
 49 |       "text": "\u201cAnyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.\u201d"
 50 |     },
 51 |     {
 52 |       "author": {
 53 |         "goodreads_link": "/author/show/4427.Jim_Henson",
 54 |         "name": "Jim Henson",
 55 |         "slug": "Jim-Henson"
 56 |       },
 57 |       "tags": [
 58 |         "humor"
 59 |       ],
 60 |       "text": "\u201cBeauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.\u201d"
 61 |     },
 62 |     {
 63 |       "author": {
 64 |         "goodreads_link": "/author/show/61105.Dr_Seuss",
 65 |         "name": "Dr. Seuss",
 66 |         "slug": "Dr-Seuss"
 67 |       },
 68 |       "tags": [
 69 |         "comedy",
 70 |         "life",
 71 |         "yourself"
 72 |       ],
 73 |       "text": "\u201cToday you are You, that is truer than true. There is no one alive who is Youer than You.\u201d"
 74 |     },
 75 |     {
 76 |       "author": {
 77 |         "goodreads_link": "/author/show/9810.Albert_Einstein",
 78 |         "name": "Albert Einstein",
 79 |         "slug": "Albert-Einstein"
 80 |       },
 81 |       "tags": [
 82 |         "children",
 83 |         "fairy-tales"
 84 |       ],
 85 |       "text": "\u201cIf you want your children to be intelligent, read them fairy tales. If you want them to be more intelligent, read them more fairy tales.\u201d"
 86 |     },
 87 |     {
 88 |       "author": {
 89 |         "goodreads_link": "/author/show/1077326.J_K_Rowling",
 90 |         "name": "J.K. Rowling",
 91 |         "slug": "J-K-Rowling"
 92 |       },
 93 |       "tags": [],
 94 |       "text": "\u201cIt is impossible to live without failing at something, unless you live so cautiously that you might as well not have lived at all - in which case, you fail by default.\u201d"
 95 |     },
 96 |     {
 97 |       "author": {
 98 |         "goodreads_link": "/author/show/9810.Albert_Einstein",
 99 |         "name": "Albert Einstein",
100 |         "slug": "Albert-Einstein"
101 |       },
102 |       "tags": [
103 |         "imagination"
104 |       ],
105 |       "text": "\u201cLogic will get you from A to Z; imagination will get you everywhere.\u201d"
106 |     },
107 |     {
108 |       "author": {
109 |         "goodreads_link": "/author/show/25241.Bob_Marley",
110 |         "name": "Bob Marley",
111 |         "slug": "Bob-Marley"
112 |       },
113 |       "tags": [
114 |         "music"
115 |       ],
116 |       "text": "\u201cOne good thing about music, when it hits you, you feel no pain.\u201d"
117 |     }
118 |   ],
119 |   "tag": null,
120 |   "top_ten_tags": [
121 |     [
122 |       "love",
123 |       14
124 |     ],
125 |     [
126 |       "inspirational",
127 |       13
128 |     ],
129 |     [
130 |       "life",
131 |       13
132 |     ],
133 |     [
134 |       "humor",
135 |       12
136 |     ],
137 |     [
138 |       "books",
139 |       11
140 |     ],
141 |     [
142 |       "reading",
143 |       7
144 |     ],
145 |     [
146 |       "friendship",
147 |       5
148 |     ],
149 |     [
150 |       "friends",
151 |       4
152 |     ],
153 |     [
154 |       "truth",
155 |       4
156 |     ],
157 |     [
158 |       "simile",
159 |       3
160 |     ]
161 |   ]
162 | }
163 | 


--------------------------------------------------------------------------------
/tests/site/files/mancha.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/site/files/mancha.pdf


--------------------------------------------------------------------------------
/tests/site/gallery.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 |     <title>Gallery</title>
 5 |     <meta charset="utf-8" />
 6 |     <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
 7 | </head>
 8 | 
 9 | <body>
10 | <div>
11 |     <h1>Gallery</h1>
12 |     <ul>
13 |         <li>
14 |             <img src="static/img/ales-krivec-ZMZHcvIVgbg-unsplash.jpg" /></p>
15 |         </li>
16 |         <li>
17 |             <img src="static/img/elyssa-fahndrich-MF16lGb95WY-unsplash.jpg" />
18 |         </li>
19 |         <li>
20 |             <img src="static/img/nathan-dumlao-RCfalHrnFAs-unsplash.jpg" />
21 |         </li>
22 |     </ul>
23 | </div>
24 | </body>
25 | </html>
26 | 


--------------------------------------------------------------------------------
/tests/site/index.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 |     <title>Awesome site</title>
 5 |     <meta charset="utf-8" />
 6 |     <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
 7 | </head>
 8 | 
 9 | <body>
10 | <div>
11 |     <h1>Awesome site</h1>
12 |     <p><a class="lorem_ipsum" href="lorem_ipsum.html">Lorem Ipsum</a></p>
13 |     <p><a class="scroll" href="scroll.html">Infinite Scroll</a></p>
14 | </div>
15 | </body>
16 | </html>
17 | 


--------------------------------------------------------------------------------
/tests/site/lorem_ipsum.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 |     <title>Lorem Ipsum</title>
 5 |     <meta charset="utf-8" />
 6 |     <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
 7 | </head>
 8 | <body>
 9 | <div>
10 |     <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit.</p>
11 | </div>
12 | </body>
13 | </html>
14 | 


--------------------------------------------------------------------------------
/tests/site/redirect.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 |   <head>
 4 |     <title>Page should redirect</title>
 5 |     <link rel="canonical" href="index.html">
 6 |     <meta name="robots" content="noindex">
 7 |     <meta charset="utf-8">
 8 |     <meta http-equiv="refresh" content="0; url=index.html">
 9 |   </head>
10 |   <body>
11 |     <p>You should not see this because you are immediately redirected.</p>
12 |   </body>
13 | </html>
14 | 


--------------------------------------------------------------------------------
/tests/site/scroll.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <title>Quotes to Scrape</title>
 6 |     <link rel="stylesheet" href="static/bootstrap.min.css">
 7 |     <link rel="stylesheet" href="static/main.css">
 8 | </head>
 9 | <body>
10 |     <div class="container">
11 |         <div class="row">
12 |             <div class="col-md-8">
13 |                 <div class="quotes"></div>
14 |             </div>
15 |         </div>
16 |         <div id="loading" style="background-color: #eeeecc"><h5>Loading...</h5></div>
17 |     </div>
18 | 
19 |         <script src="static/jquery.js"></script>
20 |         <script>
21 |             $(function() {
22 |                 var page = 1, tag = null, hasNextPage = true;
23 | 
24 |                 function appendQuotes(quotes) {
25 |                     var html = $.map(quotes, function(d){
26 |                         var tags = $.map(d['tags'], function(t) {
27 |                             return "<a class='tag'>" + t + "</a>";
28 |                         }).join(" ");
29 |                         return "<div class='quote'><span class='text'>" + d['text'] + "</span><span>by <small class='author'>" + d['author']['name'] + "</small></span><div class='tags'>Tags: " + tags + "</div></div>";
30 |                     });
31 |                     $('.quotes').append(html);
32 |                 }
33 | 
34 |                 function updatePage(page) {
35 |                     $('#loading').show('fast');
36 |                     $.get('data/quotes' + page + '.json').done(function(data) {
37 |                         appendQuotes(data.quotes);
38 |                         hasNextPage = data.has_next;
39 |                         $('#loading').hide('fast');
40 |                     });
41 |                 }
42 | 
43 |                 updatePage(page);
44 | 
45 |                 $(window).on('scroll', function(){
46 |                     var scrollTop = $(window).scrollTop();
47 |                     var heightDiff = $(document).height() - $(window).height();
48 |                     if (hasNextPage && Math.abs(scrollTop - heightDiff) <= 1){
49 |                         page += 1;
50 |                         console.log('scrolling to page: ' + page);
51 |                         updatePage(page);
52 |                     }
53 |                 });
54 |             });
55 |         </script>
56 |     </body>
57 | </html>
58 | 


--------------------------------------------------------------------------------
/tests/site/static/img/ales-krivec-ZMZHcvIVgbg-unsplash.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/site/static/img/ales-krivec-ZMZHcvIVgbg-unsplash.jpg


--------------------------------------------------------------------------------
/tests/site/static/img/elyssa-fahndrich-MF16lGb95WY-unsplash.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/site/static/img/elyssa-fahndrich-MF16lGb95WY-unsplash.jpg


--------------------------------------------------------------------------------
/tests/site/static/img/nathan-dumlao-RCfalHrnFAs-unsplash.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/site/static/img/nathan-dumlao-RCfalHrnFAs-unsplash.jpg


--------------------------------------------------------------------------------
/tests/site/static/main.css:
--------------------------------------------------------------------------------
  1 | /* Custom page CSS */
  2 | body {
  3 |   font-family: sans-serif;
  4 | }
  5 | 
  6 | .container .text-muted {
  7 |   margin: 20px 0;
  8 | }
  9 | 
 10 | .tags-box {
 11 |   text-align: right;
 12 | }
 13 | 
 14 | .tags-box h2 {
 15 |   margin-top: 0px;
 16 | }
 17 | 
 18 | .tag-item {
 19 |   display: block;
 20 |   margin: 4px;
 21 | }
 22 | 
 23 | .quote {
 24 |   padding: 10px;
 25 |   margin-bottom: 30px;
 26 |   border: 1px solid #333333;
 27 |   border-radius: 5px;
 28 |   box-shadow: 2px 2px 3px #333333;
 29 | }
 30 | 
 31 | .quote small.author {
 32 |   font-weight: bold;
 33 |   color: #3677E8;
 34 | }
 35 | 
 36 | .quote span.text {
 37 |   display: block;
 38 |   margin-bottom: 5px;
 39 |   font-size: large;
 40 |   font-style: italic;
 41 | }
 42 | 
 43 | .quote .tags {
 44 |   margin-top: 10px;
 45 | }
 46 | 
 47 | .tag {
 48 |   padding: 2px 5px;
 49 |   border-radius: 5px;
 50 |   color: white;
 51 |   font-size: small;
 52 |   background-color: #7CA3E6;
 53 | }
 54 | 
 55 | a.tag:hover {
 56 |   text-decoration: none;
 57 | }
 58 | 
 59 | /* Sticky footer styles */
 60 | html {
 61 |   position: relative;
 62 |   min-height: 100%;
 63 | }
 64 | 
 65 | body {
 66 |   /* Margin bottom by footer height */
 67 |   margin-bottom: 60px;
 68 | }
 69 | 
 70 | .footer {
 71 |   position: absolute;
 72 |   bottom: 0;
 73 |   width: 100%;
 74 |   /* Set the fixed height of the footer here */
 75 |   height: 6em;
 76 |   background-color: #f5f5f5;
 77 | }
 78 | 
 79 | .error {
 80 |   color: red;
 81 | }
 82 | 
 83 | .header-box {
 84 |   padding-bottom: 40px;
 85 | }
 86 | 
 87 | .header-box p {
 88 |   margin-top: 30px;
 89 |   float: right;
 90 | }
 91 | 
 92 | .author-details  {
 93 |   width: 80%;
 94 | }
 95 | 
 96 | .author-description {
 97 |   text-align: justify;
 98 |   margin-bottom: 20px;
 99 | }
100 | 
101 | ul.pager {
102 |   margin-bottom: 100px;
103 | }
104 | 
105 | .copyright {
106 |   text-align: center;
107 | }
108 | 
109 | .sh-red {
110 |   color: #cc0b0f;
111 | }
112 | 


--------------------------------------------------------------------------------
/tests/tests_asyncio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/tests_asyncio/__init__.py


--------------------------------------------------------------------------------
/tests/tests_asyncio/test_browser.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import logging
  3 | import os
  4 | import platform
  5 | import random
  6 | import re
  7 | import signal
  8 | import subprocess
  9 | import time
 10 | import uuid
 11 | from contextlib import asynccontextmanager
 12 | from pathlib import Path
 13 | from threading import Thread
 14 | from typing import Tuple
 15 | from unittest import IsolatedAsyncioTestCase
 16 | 
 17 | import psutil
 18 | import pytest
 19 | from playwright._impl._errors import TargetClosedError
 20 | from playwright.async_api import async_playwright
 21 | from scrapy import Request, Spider
 22 | 
 23 | from tests import allow_windows, make_handler, assert_correct_response
 24 | from tests.mockserver import StaticMockServer
 25 | 
 26 | 
 27 | async def _run_chromium_devtools() -> Tuple[subprocess.Popen, str]:
 28 |     """Run a Chromium instance in a separate process, return the process
 29 |     object and a string with its devtools endpoint.
 30 |     """
 31 |     async with async_playwright() as playwright:
 32 |         proc = subprocess.Popen(  # pylint: disable=consider-using-with
 33 |             [playwright.chromium.executable_path, "--headless", "--remote-debugging-port=0"],
 34 |             text=True,
 35 |             stdout=subprocess.PIPE,
 36 |             stderr=subprocess.PIPE,
 37 |         )
 38 |         devtools_url = None
 39 |         while devtools_url is None:
 40 |             line = proc.stderr.readline().strip()  # type: ignore
 41 |             if not line:
 42 |                 time.sleep(0.2)
 43 |                 continue
 44 |             print("browser output:", line)
 45 |             if match := re.match(r"^DevTools listening on (.+)$", line):
 46 |                 devtools_url = match.group(1)
 47 |                 print("devtools_url:", devtools_url)
 48 |         return proc, devtools_url
 49 | 
 50 | 
 51 | def _run_chromium_browser_server() -> Tuple[subprocess.Popen, str]:
 52 |     """Start a Playwright server in a separate process, return the process
 53 |     object and a string with its websocket endpoint.
 54 |     Pass fixed port and ws path as arguments instead of allowing Playwright
 55 |     to choose, for some reason I was unable to capture stdout/stderr :shrug:
 56 |     """
 57 |     port = str(random.randint(60_000, 63_000))
 58 |     ws_path = str(uuid.uuid4())
 59 |     launch_server_script_path = str(Path(__file__).parent.parent / "launch_chromium_server.js")
 60 |     command = ["node", launch_server_script_path, port, ws_path]
 61 |     proc = subprocess.Popen(command)  # pylint: disable=consider-using-with
 62 |     return proc, f"ws://localhost:{port}/{ws_path}"
 63 | 
 64 | 
 65 | @asynccontextmanager
 66 | async def remote_chromium(with_devtools_protocol: bool = True):
 67 |     """Launch a remote browser that lasts while in the context."""
 68 |     proc = url = None
 69 |     try:
 70 |         if with_devtools_protocol:
 71 |             proc, url = await _run_chromium_devtools()
 72 |         else:
 73 |             proc, url = _run_chromium_browser_server()
 74 |             await asyncio.sleep(1)  # allow some time for the browser to start
 75 |     except Exception:
 76 |         pass
 77 |     else:
 78 |         print(f"Browser URL: {url}")
 79 |         yield url
 80 |     finally:
 81 |         if proc:
 82 |             proc.kill()
 83 |             proc.communicate()
 84 | 
 85 | 
 86 | class TestBrowserRemoteChromium(IsolatedAsyncioTestCase):
 87 |     @pytest.fixture(autouse=True)
 88 |     def inject_fixtures(self, caplog):
 89 |         caplog.set_level(logging.DEBUG)
 90 |         self._caplog = caplog
 91 | 
 92 |     @allow_windows
 93 |     async def test_connect_devtools(self):
 94 |         async with remote_chromium(with_devtools_protocol=True) as devtools_url:
 95 |             settings_dict = {
 96 |                 "PLAYWRIGHT_BROWSER_TYPE": "chromium",
 97 |                 "PLAYWRIGHT_CDP_URL": devtools_url,
 98 |                 "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
 99 |             }
100 |             async with make_handler(settings_dict) as handler:
101 |                 with StaticMockServer() as server:
102 |                     req = Request(server.urljoin("/index.html"), meta={"playwright": True})
103 |                     resp = await handler._download_request(req, Spider("foo"))
104 |                 assert_correct_response(resp, req)
105 |                 assert (
106 |                     "scrapy-playwright",
107 |                     logging.WARNING,
108 |                     "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
109 |                 ) in self._caplog.record_tuples
110 | 
111 |     @allow_windows
112 |     async def test_connect(self):
113 |         async with remote_chromium(with_devtools_protocol=False) as browser_url:
114 |             settings_dict = {
115 |                 "PLAYWRIGHT_BROWSER_TYPE": "chromium",
116 |                 "PLAYWRIGHT_CONNECT_URL": browser_url,
117 |                 "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
118 |             }
119 |             async with make_handler(settings_dict) as handler:
120 |                 with StaticMockServer() as server:
121 |                     req = Request(server.urljoin("/index.html"), meta={"playwright": True})
122 |                     resp = await handler._download_request(req, Spider("foo"))
123 |                 assert_correct_response(resp, req)
124 |                 assert (
125 |                     "scrapy-playwright",
126 |                     logging.INFO,
127 |                     "Connecting to remote Playwright",
128 |                 ) in self._caplog.record_tuples
129 |                 assert (
130 |                     "scrapy-playwright",
131 |                     logging.INFO,
132 |                     "Connected to remote Playwright",
133 |                 ) in self._caplog.record_tuples
134 |                 assert (
135 |                     "scrapy-playwright",
136 |                     logging.WARNING,
137 |                     "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
138 |                 ) in self._caplog.record_tuples
139 | 
140 | 
141 | class TestBrowserReconnectChromium(IsolatedAsyncioTestCase):
142 |     @pytest.fixture(autouse=True)
143 |     def inject_fixtures(self, caplog):
144 |         caplog.set_level(logging.DEBUG)
145 |         self._caplog = caplog
146 | 
147 |     @staticmethod
148 |     def kill_chrome():
149 |         for proc in psutil.process_iter(["pid", "name"]):
150 |             if proc.info["name"].lower() in ("chrome", "chromium"):
151 |                 os.kill(proc.info["pid"], signal.SIGKILL)
152 | 
153 |     @allow_windows
154 |     async def test_browser_closed_restart(self):
155 |         spider = Spider("foo")
156 |         async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler:
157 |             with StaticMockServer() as server:
158 |                 req1 = Request(
159 |                     server.urljoin("/index.html"),
160 |                     meta={"playwright": True, "playwright_include_page": True},
161 |                 )
162 |                 resp1 = await handler._download_request(req1, spider)
163 |                 page = resp1.meta["playwright_page"]
164 |                 await page.context.browser.close()
165 |                 req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True})
166 |                 resp2 = await handler._download_request(req2, spider)
167 |         assert_correct_response(resp1, req1)
168 |         assert_correct_response(resp2, req2)
169 |         assert (
170 |             self._caplog.record_tuples.count(
171 |                 (
172 |                     "scrapy-playwright",
173 |                     logging.DEBUG,
174 |                     "Browser disconnected",
175 |                 )
176 |             )
177 |             == 2  # one mid-crawl after calling Browser.close() manually, one at the end
178 |         )
179 |         assert (
180 |             self._caplog.record_tuples.count(
181 |                 (
182 |                     "scrapy-playwright",
183 |                     logging.INFO,
184 |                     "Launching browser chromium",
185 |                 )
186 |             )
187 |             == 2  # one at the beginning, one after calling Browser.close() manually
188 |         )
189 | 
190 |     @pytest.mark.skipif(
191 |         platform.system() == "Windows",
192 |         reason="os.kill does not work as expected on Windows",
193 |     )
194 |     async def test_browser_crashed_restart(self):
195 |         spider = Spider("foo")
196 |         async with make_handler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": "chromium"}) as handler:
197 |             with StaticMockServer() as server:
198 |                 req1 = Request(
199 |                     server.urljoin("/index.html"),
200 |                     meta={"playwright": True, "playwright_include_page": True},
201 |                 )
202 |                 resp1 = await handler._download_request(req1, spider)
203 |                 thread = Thread(target=self.kill_chrome, daemon=True)
204 |                 thread.start()
205 |                 req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True})
206 |                 req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True})
207 |                 req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True})
208 |                 resp2 = await handler._download_request(req2, spider)
209 |                 resp3 = await handler._download_request(req3, spider)
210 |                 resp4 = await handler._download_request(req4, spider)
211 |                 thread.join()
212 |         assert_correct_response(resp1, req1)
213 |         assert_correct_response(resp2, req2)
214 |         assert_correct_response(resp3, req3)
215 |         assert_correct_response(resp4, req4)
216 |         assert (
217 |             self._caplog.record_tuples.count(
218 |                 (
219 |                     "scrapy-playwright",
220 |                     logging.DEBUG,
221 |                     "Browser disconnected",
222 |                 )
223 |             )
224 |             == 2  # one mid-crawl after killing the browser process, one at the end
225 |         )
226 |         assert (
227 |             self._caplog.record_tuples.count(
228 |                 (
229 |                     "scrapy-playwright",
230 |                     logging.INFO,
231 |                     "Launching browser chromium",
232 |                 )
233 |             )
234 |             == 2  # one at the beginning, one after killing the broser process
235 |         )
236 | 
237 |     @pytest.mark.skipif(
238 |         platform.system() == "Windows",
239 |         reason="os.kill does not work as expected on Windows",
240 |     )
241 |     async def test_browser_crashed_do_not_restart(self):
242 |         spider = Spider("foo")
243 |         settings_dict = {
244 |             "PLAYWRIGHT_BROWSER_TYPE": "chromium",
245 |             "PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER": False,
246 |         }
247 |         async with make_handler(settings_dict=settings_dict) as handler:
248 |             with StaticMockServer() as server:
249 |                 await asyncio.sleep(1)  # allow time for the browser to fully launch
250 |                 req1 = Request(
251 |                     server.urljoin("/index.html"),
252 |                     meta={"playwright": True, "playwright_include_page": True},
253 |                 )
254 |                 resp1 = await handler._download_request(req1, spider)
255 |                 assert_correct_response(resp1, req1)
256 |                 thread = Thread(target=self.kill_chrome, daemon=True)
257 |                 thread.start()
258 |                 req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True})
259 |                 req3 = Request(server.urljoin("/lorem_ipsum.html"), meta={"playwright": True})
260 |                 req4 = Request(server.urljoin("/scroll.html"), meta={"playwright": True})
261 |                 with pytest.raises(TargetClosedError):
262 |                     await handler._download_request(req2, spider)
263 |                     await handler._download_request(req3, spider)
264 |                     await handler._download_request(req4, spider)
265 |                 thread.join()
266 | 


--------------------------------------------------------------------------------
/tests/tests_asyncio/test_browser_contexts.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import platform
  3 | import tempfile
  4 | from pathlib import Path
  5 | from unittest import IsolatedAsyncioTestCase
  6 | from uuid import uuid4
  7 | 
  8 | import pytest
  9 | from playwright.async_api import Browser, TimeoutError as PlaywrightTimeoutError
 10 | from scrapy import Spider, Request
 11 | from scrapy_playwright.page import PageMethod
 12 | 
 13 | from tests import allow_windows, make_handler
 14 | from tests.mockserver import StaticMockServer
 15 | 
 16 | 
 17 | class MixinTestCaseMultipleContexts:
 18 |     @allow_windows
 19 |     async def test_context_kwargs(self):
 20 |         settings_dict = {
 21 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
 22 |             "PLAYWRIGHT_CONTEXTS": {
 23 |                 "default": {"java_script_enabled": False},
 24 |             },
 25 |         }
 26 |         async with make_handler(settings_dict) as handler:
 27 |             with StaticMockServer() as server:
 28 |                 req = Request(
 29 |                     url=server.urljoin("/scroll.html"),
 30 |                     meta={
 31 |                         "playwright": True,
 32 |                         "playwright_page_methods": [
 33 |                             # cause a timeout by waiting on an element that is rendered with js
 34 |                             PageMethod("wait_for_selector", selector="div.quote", timeout=1000),
 35 |                         ],
 36 |                     },
 37 |                 )
 38 |                 with pytest.raises(PlaywrightTimeoutError):
 39 |                     await handler._download_request(req, Spider("foo"))
 40 | 
 41 |     @allow_windows
 42 |     async def test_contexts_max_pages(self):
 43 |         settings = {
 44 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
 45 |             "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1,
 46 |             "PLAYWRIGHT_CONTEXTS": {
 47 |                 "a": {"java_script_enabled": True},
 48 |                 "b": {"java_script_enabled": True},
 49 |             },
 50 |         }
 51 |         async with make_handler(settings) as handler:
 52 |             with StaticMockServer() as server:
 53 |                 requests = [
 54 |                     handler._download_request(
 55 |                         Request(
 56 |                             server.urljoin(f"/index.html?a={i}"),
 57 |                             meta={"playwright": True, "playwright_context": "a"},
 58 |                         ),
 59 |                         Spider("foo"),
 60 |                     )
 61 |                     for i in range(20)
 62 |                 ] + [
 63 |                     handler._download_request(
 64 |                         Request(
 65 |                             server.urljoin(f"/index.html?b={i}"),
 66 |                             meta={"playwright": True, "playwright_context": "b"},
 67 |                         ),
 68 |                         Spider("foo"),
 69 |                     )
 70 |                     for i in range(20)
 71 |                 ]
 72 |                 await asyncio.gather(*requests)
 73 | 
 74 |             assert handler.stats.get_value("playwright/page_count/max_concurrent") == 2
 75 | 
 76 |     @allow_windows
 77 |     async def test_max_contexts(self):
 78 |         def cb_close_context(task):
 79 |             response = task.result()
 80 |             asyncio.create_task(response.meta["playwright_page"].context.close())
 81 | 
 82 |         settings = {
 83 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
 84 |             "PLAYWRIGHT_MAX_CONTEXTS": 4,
 85 |         }
 86 |         async with make_handler(settings) as handler:
 87 |             with StaticMockServer() as server:
 88 |                 tasks = []
 89 |                 for i in range(20):
 90 |                     request = Request(
 91 |                         url=server.urljoin(f"/index.html?a={i}"),
 92 |                         callback=cb_close_context,
 93 |                         meta={
 94 |                             "playwright": True,
 95 |                             "playwright_include_page": True,
 96 |                             "playwright_context": f"ctx-{i}",
 97 |                         },
 98 |                     )
 99 |                     coro = handler._download_request(
100 |                         request=request,
101 |                         spider=Spider("foo"),
102 |                     )
103 |                     # callbacks are not invoked at the download handler, call them explicitly
104 |                     task = asyncio.create_task(coro)
105 |                     task.add_done_callback(request.callback)
106 |                     tasks.append(task)
107 |                 await asyncio.gather(*tasks)
108 | 
109 |             assert handler.stats.get_value("playwright/context_count/max_concurrent") == 4
110 | 
111 |     @allow_windows
112 |     async def test_contexts_startup(self):
113 |         settings = {
114 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
115 |             "PLAYWRIGHT_CONTEXTS": {
116 |                 "first": {
117 |                     "storage_state": {
118 |                         "cookies": [
119 |                             {
120 |                                 "url": "https://example.org",
121 |                                 "name": "foo",
122 |                                 "value": "bar",
123 |                             },
124 |                         ],
125 |                     },
126 |                 },
127 |             },
128 |         }
129 |         async with make_handler(settings) as handler:
130 |             assert len(handler.context_wrappers) == 1
131 | 
132 |             with StaticMockServer() as server:
133 |                 meta = {
134 |                     "playwright": True,
135 |                     "playwright_include_page": True,
136 |                     "playwright_context": "first",
137 |                 }
138 |                 req = Request(server.urljoin("/index.html"), meta=meta)
139 |                 resp = await handler._download_request(req, Spider("foo"))
140 | 
141 |             page = resp.meta["playwright_page"]
142 |             storage_state = await page.context.storage_state()
143 |             await page.close()
144 |             await page.context.close()
145 |             cookie = storage_state["cookies"][0]
146 |             assert cookie["name"] == "foo"
147 |             assert cookie["value"] == "bar"
148 |             assert cookie["domain"] == "example.org"
149 | 
150 |     @allow_windows
151 |     async def test_persistent_context(self):
152 |         temp_dir = f"{tempfile.gettempdir()}/{uuid4()}"
153 |         settings = {
154 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
155 |             "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 3000,
156 |             "PLAYWRIGHT_CONTEXTS": {
157 |                 "persistent": {
158 |                     "user_data_dir": temp_dir,
159 |                 },
160 |             },
161 |         }
162 |         assert not Path(temp_dir).exists()
163 |         async with make_handler(settings) as handler:
164 |             assert Path(temp_dir).is_dir()
165 |             assert len(handler.context_wrappers) == 1
166 |             assert handler.context_wrappers["persistent"].persistent
167 |             assert not hasattr(handler, "browser")
168 | 
169 |     @allow_windows
170 |     async def test_mixed_persistent_contexts(self):
171 |         temp_dir = f"{tempfile.gettempdir()}/{uuid4()}"
172 |         settings = {
173 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
174 |             "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 3000,
175 |             "PLAYWRIGHT_CONTEXTS": {
176 |                 "persistent": {
177 |                     "user_data_dir": temp_dir,
178 |                 },
179 |                 "non-persistent": {
180 |                     "java_script_enabled": False,
181 |                 },
182 |             },
183 |         }
184 |         assert not Path(temp_dir).exists()
185 |         async with make_handler(settings) as handler:
186 |             assert Path(temp_dir).is_dir()
187 |             assert len(handler.context_wrappers) == 2
188 |             assert handler.context_wrappers["persistent"].persistent
189 |             assert not handler.context_wrappers["non-persistent"].persistent
190 |             assert isinstance(handler.browser, Browser)
191 | 
192 |     @allow_windows
193 |     async def test_contexts_dynamic(self):
194 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
195 |             assert len(handler.context_wrappers) == 0
196 | 
197 |             with StaticMockServer() as server:
198 |                 meta = {
199 |                     "playwright": True,
200 |                     "playwright_include_page": True,
201 |                     "playwright_context": "new",
202 |                     "playwright_context_kwargs": {
203 |                         "storage_state": {
204 |                             "cookies": [
205 |                                 {
206 |                                     "url": "https://example.org",
207 |                                     "name": "asdf",
208 |                                     "value": "qwerty",
209 |                                 },
210 |                             ],
211 |                         },
212 |                     },
213 |                 }
214 |                 req = Request(server.urljoin("/index.html"), meta=meta)
215 |                 resp = await handler._download_request(req, Spider("foo"))
216 | 
217 |             assert len(handler.context_wrappers) == 1
218 | 
219 |             page = resp.meta["playwright_page"]
220 |             storage_state = await page.context.storage_state()
221 |             await page.close()
222 |             cookie = storage_state["cookies"][0]
223 |             assert cookie["name"] == "asdf"
224 |             assert cookie["value"] == "qwerty"
225 |             assert cookie["domain"] == "example.org"
226 | 
227 | 
228 | class TestCaseMultipleContextsChromium(IsolatedAsyncioTestCase, MixinTestCaseMultipleContexts):
229 |     browser_type = "chromium"
230 | 
231 | 
232 | class TestCaseMultipleContextsFirefox(IsolatedAsyncioTestCase, MixinTestCaseMultipleContexts):
233 |     browser_type = "firefox"
234 | 
235 | 
236 | @pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin")
237 | class TestCaseMultipleContextsWebkit(IsolatedAsyncioTestCase, MixinTestCaseMultipleContexts):
238 |     browser_type = "webkit"
239 | 


--------------------------------------------------------------------------------
/tests/tests_asyncio/test_extensions.py:
--------------------------------------------------------------------------------
 1 | import platform
 2 | from asyncio.subprocess import Process as AsyncioProcess
 3 | from unittest import IsolatedAsyncioTestCase
 4 | from unittest.mock import MagicMock, patch
 5 | 
 6 | import pytest
 7 | from playwright.async_api import PlaywrightContextManager
 8 | from scrapy.exceptions import NotConfigured
 9 | from scrapy.extensions.memusage import MemoryUsage
10 | 
11 | from scrapy_playwright.memusage import ScrapyPlaywrightMemoryUsageExtension
12 | from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler
13 | 
14 | 
15 | SCHEMA_PID_MAP = {"http": 123, "https": 456}
16 | 
17 | 
18 | def mock_crawler_with_handlers() -> dict:
19 |     handlers = {}
20 |     for schema, pid in SCHEMA_PID_MAP.items():
21 |         process = MagicMock()
22 |         process.pid = pid
23 |         handlers[schema] = MagicMock(spec=ScrapyPlaywrightDownloadHandler)
24 |         handlers[schema].playwright_context_manager._connection._transport._proc = process
25 |     crawler = MagicMock()
26 |     crawler.engine.downloader.handlers._handlers = handlers
27 |     return crawler
28 | 
29 | 
30 | def raise_import_error(*args, **kwargs):
31 |     raise ImportError
32 | 
33 | 
34 | class MockMemoryInfo:
35 |     rss = 999
36 | 
37 | 
38 | @pytest.mark.skipif(
39 |     platform.system() == "Windows",
40 |     reason="resource stdlib module is not available on Windows",
41 | )
42 | @patch("scrapy.extensions.memusage.MailSender")
43 | class TestMemoryUsageExtension(IsolatedAsyncioTestCase):
44 |     async def test_process_availability(self, _MailSender):
45 |         """The main node process should be accessible from the context manager"""
46 |         ctx_manager = PlaywrightContextManager()
47 |         await ctx_manager.start()
48 |         assert isinstance(ctx_manager._connection._transport._proc, AsyncioProcess)
49 |         await ctx_manager.__aexit__()
50 | 
51 |     @patch("scrapy_playwright.memusage.import_module", side_effect=raise_import_error)
52 |     async def test_psutil_not_available_extension_disabled(self, _import_module, _MailSender):
53 |         crawler = MagicMock()
54 |         with pytest.raises(NotConfigured):
55 |             ScrapyPlaywrightMemoryUsageExtension(crawler)
56 | 
57 |     async def test_get_process_ids_ok(self, _MailSender):
58 |         crawler = mock_crawler_with_handlers()
59 |         extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
60 |         assert extension._get_main_process_ids() == list(SCHEMA_PID_MAP.values())
61 | 
62 |     async def test_get_process_ids_error(self, _MailSender):
63 |         crawler = mock_crawler_with_handlers()
64 |         crawler.engine.downloader.handlers._handlers = MagicMock()
65 |         crawler.engine.downloader.handlers._handlers.values.side_effect = raise_import_error
66 |         extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
67 |         assert extension._get_main_process_ids() == []
68 | 
69 |     async def test_get_descendant_processes(self, _MailSender):
70 |         p1 = MagicMock()
71 |         p2 = MagicMock()
72 |         p3 = MagicMock()
73 |         p4 = MagicMock()
74 |         p2.children.return_value = [p3, p4]
75 |         p1.children.return_value = [p2]
76 |         crawler = MagicMock()
77 |         extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
78 |         assert extension._get_descendant_processes(p1) == [p2, p3, p4]
79 | 
80 |     async def test_get_total_process_size(self, _MailSender):
81 |         crawler = MagicMock()
82 |         extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
83 |         extension.psutil = MagicMock()
84 |         extension.psutil.Process.return_value.memory_info.return_value = MockMemoryInfo()
85 |         extension._get_main_process_ids = MagicMock(return_value=[1, 2, 3])
86 |         expected_size = MockMemoryInfo().rss * len(extension._get_main_process_ids())
87 |         assert extension._get_total_playwright_process_memory() == expected_size
88 | 
89 |     async def test_get_virtual_size_sum(self, _MailSender):
90 |         crawler = MagicMock()
91 |         extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
92 |         parent_cls_extension = MemoryUsage(crawler)
93 |         extension._get_total_playwright_process_memory = MagicMock(return_value=123)
94 |         assert extension.get_virtual_size() == parent_cls_extension.get_virtual_size() + 123
95 | 


--------------------------------------------------------------------------------
/tests/tests_asyncio/test_headers.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import platform
  4 | import warnings
  5 | from unittest import IsolatedAsyncioTestCase
  6 | 
  7 | import pytest
  8 | from scrapy import Spider, Request
  9 | 
 10 | from tests import allow_windows, make_handler
 11 | from tests.mockserver import MockServer
 12 | 
 13 | 
 14 | class MixinProcessHeadersTestCase:
 15 |     @pytest.fixture(autouse=True)
 16 |     def inject_fixtures(self, caplog):
 17 |         caplog.set_level(logging.DEBUG)
 18 |         self._caplog = caplog
 19 | 
 20 |     @allow_windows
 21 |     async def test_user_agent(self):
 22 |         settings_dict = {
 23 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
 24 |             "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
 25 |             "USER_AGENT": None,
 26 |         }
 27 |         async with make_handler(settings_dict) as handler:
 28 |             with MockServer() as server:
 29 |                 # if Scrapy's user agent is None, use the one from the Browser
 30 |                 req = Request(
 31 |                     url=server.urljoin("/headers"),
 32 |                     meta={"playwright": True},
 33 |                 )
 34 |                 resp = await handler._download_request(req, Spider("foo"))
 35 |                 headers = json.loads(resp.css("pre::text").get())
 36 |                 headers = {key.lower(): value for key, value in headers.items()}
 37 |                 assert headers["user-agent"] == self.browser_type
 38 | 
 39 |                 # if Scrapy's user agent is set to some value, use it
 40 |                 req = Request(
 41 |                     url=server.urljoin("/headers"),
 42 |                     meta={"playwright": True},
 43 |                     headers={"User-Agent": "foobar"},
 44 |                 )
 45 |                 resp = await handler._download_request(req, Spider("foo"))
 46 |                 headers = json.loads(resp.css("pre::text").get())
 47 |                 headers = {key.lower(): value for key, value in headers.items()}
 48 |                 assert headers["user-agent"] == "foobar"
 49 | 
 50 |     @allow_windows
 51 |     async def test_playwright_headers(self):
 52 |         """Ignore Scrapy headers"""
 53 |         settings_dict = {
 54 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
 55 |             "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
 56 |             "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
 57 |             "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 2000,
 58 |         }
 59 |         async with make_handler(settings_dict) as handler:
 60 |             with MockServer() as server:
 61 |                 req = Request(
 62 |                     url=server.urljoin("/headers"),
 63 |                     meta={"playwright": True},
 64 |                     headers={"User-Agent": "foobar", "Asdf": "qwerty"},
 65 |                 )
 66 |                 resp = await handler._download_request(req, Spider("foo"))
 67 |                 headers = json.loads(resp.css("pre::text").get())
 68 |                 headers = {key.lower(): value for key, value in headers.items()}
 69 |                 assert headers["user-agent"] == self.browser_type
 70 |                 assert req.headers["user-agent"].decode("utf-8") == self.browser_type
 71 |                 assert "asdf" not in headers
 72 |                 assert "asdf" not in req.headers
 73 |                 assert b"asdf" not in req.headers
 74 | 
 75 |     @allow_windows
 76 |     async def test_use_custom_headers_ok(self):
 77 |         """Custom header processing function"""
 78 | 
 79 |         async def important_headers(
 80 |             browser_type_name,  # pylint: disable=unused-argument
 81 |             playwright_request,  # pylint: disable=unused-argument
 82 |             scrapy_request_data,  # pylint: disable=unused-argument
 83 |         ) -> dict:
 84 |             return {"foo": "bar"}
 85 | 
 86 |         settings_dict = {
 87 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
 88 |             "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
 89 |             "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
 90 |         }
 91 |         async with make_handler(settings_dict) as handler:
 92 |             with MockServer() as server:
 93 |                 req = Request(
 94 |                     url=server.urljoin("/headers"),
 95 |                     meta={"playwright": True},
 96 |                     headers={"User-Agent": "foobar", "Asdf": "qwerty"},
 97 |                 )
 98 |                 with warnings.catch_warnings(record=True) as warning_list:
 99 |                     resp = await handler._download_request(req, Spider("foo"))
100 |                 assert not warning_list
101 |                 headers = json.loads(resp.css("pre::text").get())
102 |                 headers = {key.lower(): value for key, value in headers.items()}
103 |                 assert headers["foo"] == "bar"
104 |                 assert headers.get("user-agent") not in (self.browser_type, "foobar")
105 |                 assert "asdf" not in headers
106 | 
107 |     @allow_windows
108 |     async def test_use_custom_headers_deprecated_arg_handling(self):
109 |         """Custom header processing function that receives deprecated args"""
110 | 
111 |         async def deprecated_args(
112 |             browser_name, pw_req, headers  # pylint: disable=unused-argument
113 |         ) -> dict:
114 |             return {"foo": "bar"}
115 | 
116 |         settings_dict = {
117 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
118 |             "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
119 |             "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": deprecated_args,
120 |         }
121 |         async with make_handler(settings_dict) as handler:
122 |             with MockServer() as server:
123 |                 req = Request(
124 |                     url=server.urljoin("/headers"),
125 |                     meta={"playwright": True},
126 |                     headers={"User-Agent": "foobar", "Asdf": "qwerty"},
127 |                 )
128 |                 with warnings.catch_warnings(record=True) as warning_list:
129 |                     resp = await handler._download_request(req, Spider("foo"))
130 |                 headers = json.loads(resp.css("pre::text").get())
131 |                 headers = {key.lower(): value for key, value in headers.items()}
132 |                 assert headers["foo"] == "bar"
133 |                 assert headers.get("user-agent") not in (self.browser_type, "foobar")
134 |                 assert "asdf" not in headers
135 |                 assert str(warning_list[0].message) == (
136 |                     "Accepting positional arguments in the function passed to the"
137 |                     " PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function"
138 |                     " should accept three (3) keyword arguments instead:"
139 |                     " browser_type_name: str,"
140 |                     " playwright_request: playwright.async_api.Request,"
141 |                     " scrapy_request_data: dict"
142 |                 )
143 | 
144 | 
145 | class TestProcessHeadersChromium(IsolatedAsyncioTestCase, MixinProcessHeadersTestCase):
146 |     browser_type = "chromium"
147 | 
148 | 
149 | class TestProcessHeadersFirefox(IsolatedAsyncioTestCase, MixinProcessHeadersTestCase):
150 |     browser_type = "firefox"
151 | 
152 | 
153 | @pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin")
154 | class TestProcessHeadersWebkit(IsolatedAsyncioTestCase, MixinProcessHeadersTestCase):
155 |     browser_type = "webkit"
156 | 


--------------------------------------------------------------------------------
/tests/tests_asyncio/test_page_methods.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import platform
  3 | import subprocess
  4 | from tempfile import NamedTemporaryFile
  5 | from unittest import IsolatedAsyncioTestCase
  6 | 
  7 | import pytest
  8 | from scrapy import Spider, Request
  9 | from scrapy.http.response.html import HtmlResponse
 10 | 
 11 | from playwright.async_api import Page
 12 | from scrapy_playwright.page import PageMethod
 13 | 
 14 | from tests import allow_windows, make_handler, assert_correct_response
 15 | from tests.mockserver import StaticMockServer
 16 | 
 17 | 
 18 | def get_mimetype(file):
 19 |     return subprocess.run(
 20 |         ["file", "--mime-type", "--brief", file.name],
 21 |         stdout=subprocess.PIPE,
 22 |         universal_newlines=True,
 23 |         check=False,
 24 |     ).stdout.strip()
 25 | 
 26 | 
 27 | class TestPageMethods(IsolatedAsyncioTestCase):
 28 |     @allow_windows
 29 |     async def test_page_methods(self):
 30 |         screenshot = PageMethod("screenshot", "foo", 123, path="/tmp/file", type="png")
 31 |         assert screenshot.method == "screenshot"
 32 |         assert screenshot.args == ("foo", 123)
 33 |         assert screenshot.kwargs == {"path": "/tmp/file", "type": "png"}
 34 |         assert screenshot.result is None
 35 |         assert str(screenshot) == "<PageMethod for method 'screenshot'>"
 36 | 
 37 | 
 38 | class MixinPageMethodTestCase:
 39 |     @pytest.fixture(autouse=True)
 40 |     def inject_fixtures(self, caplog):
 41 |         caplog.set_level(logging.DEBUG)
 42 |         self._caplog = caplog
 43 | 
 44 |     @allow_windows
 45 |     async def test_page_non_page_method(self):
 46 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
 47 |             with StaticMockServer() as server:
 48 |                 req = Request(
 49 |                     url=server.urljoin("/index.html"),
 50 |                     meta={
 51 |                         "playwright": True,
 52 |                         "playwright_page_methods": [
 53 |                             "not-a-page-method",
 54 |                             5,
 55 |                             None,
 56 |                         ],
 57 |                     },
 58 |                 )
 59 |                 resp = await handler._download_request(req, Spider("foo"))
 60 | 
 61 |         assert_correct_response(resp, req)
 62 |         for obj in req.meta["playwright_page_methods"]:
 63 |             assert (
 64 |                 "scrapy-playwright",
 65 |                 logging.WARNING,
 66 |                 f"Ignoring {repr(obj)}: expected PageMethod, got {repr(type(obj))}",
 67 |             ) in self._caplog.record_tuples
 68 | 
 69 |     @allow_windows
 70 |     async def test_page_mixed_page_methods(self):
 71 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
 72 |             with StaticMockServer() as server:
 73 |                 req = Request(
 74 |                     url=server.urljoin("/index.html"),
 75 |                     meta={
 76 |                         "playwright": True,
 77 |                         "playwright_page_methods": {
 78 |                             "does_not_exist": PageMethod("does_not_exist"),
 79 |                             "is_closed": PageMethod("is_closed"),  # not awaitable
 80 |                             "title": PageMethod("title"),  # awaitable
 81 |                         },
 82 |                     },
 83 |                 )
 84 |                 resp = await handler._download_request(req, Spider("foo"))
 85 | 
 86 |         assert_correct_response(resp, req)
 87 |         does_not_exist = req.meta["playwright_page_methods"]["does_not_exist"]
 88 |         assert (
 89 |             "scrapy-playwright",
 90 |             logging.WARNING,
 91 |             f"Ignoring {repr(does_not_exist)}: could not find method",
 92 |         ) in self._caplog.record_tuples
 93 |         assert not req.meta["playwright_page_methods"]["is_closed"].result
 94 |         assert req.meta["playwright_page_methods"]["title"].result == "Awesome site"
 95 | 
 96 |     @allow_windows
 97 |     async def test_page_method_navigation(self):
 98 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
 99 |             with StaticMockServer() as server:
100 |                 req = Request(
101 |                     url=server.urljoin("/index.html"),
102 |                     meta={
103 |                         "playwright": True,
104 |                         "playwright_page_methods": [PageMethod("click", "a.lorem_ipsum")],
105 |                     },
106 |                 )
107 |                 resp = await handler._download_request(req, Spider("foo"))
108 | 
109 |             assert isinstance(resp, HtmlResponse)
110 |             assert resp.request is req
111 |             assert resp.url == server.urljoin("/lorem_ipsum.html")
112 |             assert resp.status == 200
113 |             assert "playwright" in resp.flags
114 |             assert resp.css("title::text").get() == "Lorem Ipsum"
115 |             text = resp.css("p::text").get()
116 |             assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
117 | 
118 |     @allow_windows
119 |     async def test_page_method_infinite_scroll(self):
120 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
121 |             with StaticMockServer() as server:
122 |                 req = Request(
123 |                     url=server.urljoin("/scroll.html"),
124 |                     headers={"User-Agent": "scrapy-playwright"},
125 |                     meta={
126 |                         "playwright": True,
127 |                         "playwright_page_methods": [
128 |                             PageMethod("wait_for_selector", selector="div.quote"),
129 |                             PageMethod(
130 |                                 "evaluate", "window.scrollBy(0, document.body.scrollHeight)"
131 |                             ),
132 |                             PageMethod("wait_for_selector", selector="div.quote:nth-child(11)"),
133 |                             PageMethod(
134 |                                 "evaluate", "window.scrollBy(0, document.body.scrollHeight)"
135 |                             ),
136 |                             PageMethod("wait_for_selector", selector="div.quote:nth-child(21)"),
137 |                         ],
138 |                     },
139 |                 )
140 |                 resp = await handler._download_request(req, Spider("foo"))
141 | 
142 |             assert_correct_response(resp, req)
143 |             assert len(resp.css("div.quote")) == 30
144 | 
145 |     @allow_windows
146 |     async def test_page_method_screenshot(self):
147 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
148 |             with NamedTemporaryFile(mode="w+b", delete=False) as png_file:
149 |                 with StaticMockServer() as server:
150 |                     req = Request(
151 |                         url=server.urljoin("/index.html"),
152 |                         meta={
153 |                             "playwright": True,
154 |                             "playwright_page_methods": {
155 |                                 "png": PageMethod("screenshot", path=png_file.name, type="png"),
156 |                             },
157 |                         },
158 |                     )
159 |                     await handler._download_request(req, Spider("foo"))
160 | 
161 |                 png_file.file.seek(0)
162 |                 assert png_file.file.read() == req.meta["playwright_page_methods"]["png"].result
163 |                 if platform.system() != "Windows":
164 |                     assert get_mimetype(png_file) == "image/png"
165 | 
166 |     @allow_windows
167 |     async def test_page_method_pdf(self):
168 |         if self.browser_type != "chromium":
169 |             pytest.skip("PDF generation is supported only in Chromium")
170 | 
171 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
172 |             with NamedTemporaryFile(mode="w+b", delete=False) as pdf_file:
173 |                 with StaticMockServer() as server:
174 |                     req = Request(
175 |                         url=server.urljoin("/index.html"),
176 |                         meta={
177 |                             "playwright": True,
178 |                             "playwright_page_methods": {
179 |                                 "pdf": PageMethod("pdf", path=pdf_file.name),
180 |                             },
181 |                         },
182 |                     )
183 |                     await handler._download_request(req, Spider("foo"))
184 | 
185 |                 pdf_file.file.seek(0)
186 |                 assert pdf_file.file.read() == req.meta["playwright_page_methods"]["pdf"].result
187 |                 if platform.system() != "Windows":
188 |                     assert get_mimetype(pdf_file) == "application/pdf"
189 | 
190 |     @allow_windows
191 |     async def test_page_method_callable(self):
192 | 
193 |         async def scroll_page(page: Page) -> str:
194 |             await page.wait_for_selector(selector="div.quote")
195 |             await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
196 |             await page.wait_for_selector(selector="div.quote:nth-child(11)")
197 |             await page.evaluate("window.scrollBy(0, document.body.scrollHeight)")
198 |             await page.wait_for_selector(selector="div.quote:nth-child(21)")
199 |             return page.url
200 | 
201 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
202 |             with StaticMockServer() as server:
203 |                 req = Request(
204 |                     url=server.urljoin("/scroll.html"),
205 |                     meta={
206 |                         "playwright": True,
207 |                         "playwright_page_methods": {
208 |                             "callable": PageMethod(scroll_page),
209 |                         },
210 |                     },
211 |                 )
212 |                 resp = await handler._download_request(req, Spider("foo"))
213 | 
214 |             assert_correct_response(resp, req)
215 |             assert len(resp.css("div.quote")) == 30
216 |             assert resp.meta["playwright_page_methods"]["callable"].result == resp.url
217 | 
218 | 
219 | class TestPageMethodChromium(IsolatedAsyncioTestCase, MixinPageMethodTestCase):
220 |     browser_type = "chromium"
221 | 
222 | 
223 | class TestPageMethodFirefox(IsolatedAsyncioTestCase, MixinPageMethodTestCase):
224 |     browser_type = "firefox"
225 | 
226 | 
227 | @pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin")
228 | class TestPageMethodWebkit(IsolatedAsyncioTestCase, MixinPageMethodTestCase):
229 |     browser_type = "webkit"
230 | 


--------------------------------------------------------------------------------
/tests/tests_asyncio/test_playwright_requests.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import json
  3 | import logging
  4 | import platform
  5 | from ipaddress import ip_address
  6 | from unittest import IsolatedAsyncioTestCase
  7 | from unittest.mock import AsyncMock, MagicMock, patch
  8 | 
  9 | import pytest
 10 | from playwright.async_api import (
 11 |     Dialog,
 12 |     Error as PlaywrightError,
 13 |     Page as PlaywrightPage,
 14 |     TimeoutError as PlaywrightTimeoutError,
 15 | )
 16 | from scrapy import Spider, Request, FormRequest
 17 | 
 18 | from scrapy_playwright.handler import DEFAULT_CONTEXT_NAME
 19 | from scrapy_playwright.page import PageMethod
 20 | 
 21 | from tests import allow_windows, make_handler, assert_correct_response
 22 | from tests.mockserver import MockServer, StaticMockServer
 23 | 
 24 | 
 25 | class DialogSpider(Spider):
 26 |     """A spider with a method to handle the "dialog" page event"""
 27 | 
 28 |     name = "dialog"
 29 | 
 30 |     def parse(self, **_kwargs) -> None:
 31 |         return None
 32 | 
 33 |     async def handle_dialog(self, dialog: Dialog) -> None:
 34 |         self.dialog_message = dialog.message
 35 |         await dialog.dismiss()
 36 | 
 37 | 
 38 | class MixinTestCase:
 39 |     browser_type: str
 40 | 
 41 |     @pytest.fixture(autouse=True)
 42 |     def inject_fixtures(self, caplog):
 43 |         caplog.set_level(logging.DEBUG)
 44 |         self._caplog = caplog
 45 | 
 46 |     @allow_windows
 47 |     async def test_basic_response(self):
 48 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
 49 |             with StaticMockServer() as server:
 50 |                 meta = {"playwright": True, "playwright_include_page": True}
 51 |                 req = Request(server.urljoin("/index.html"), meta=meta)
 52 |                 resp = await handler._download_request(req, Spider("foo"))
 53 | 
 54 |             assert_correct_response(resp, req)
 55 |             assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]
 56 |             assert isinstance(resp.meta["playwright_page"], PlaywrightPage)
 57 |             assert resp.meta["playwright_page"].url == resp.url
 58 |             await resp.meta["playwright_page"].close()
 59 | 
 60 |     @allow_windows
 61 |     async def test_post_request(self):
 62 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
 63 |             with MockServer() as server:
 64 |                 req = FormRequest(
 65 |                     server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"}
 66 |                 )
 67 |                 resp = await handler._download_request(req, Spider("foo"))
 68 | 
 69 |             assert_correct_response(resp, req)
 70 |             assert "Request body: foo=bar" in resp.text
 71 | 
 72 |     @allow_windows
 73 |     async def test_timeout_error(self):
 74 |         settings_dict = {
 75 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
 76 |             "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 100,
 77 |         }
 78 |         async with make_handler(settings_dict) as handler:
 79 |             with MockServer() as server:
 80 |                 req = Request(server.urljoin("/headers?delay=1"), meta={"playwright": True})
 81 |                 with pytest.raises(PlaywrightTimeoutError) as excinfo:
 82 |                     await handler._download_request(req, Spider("foo"))
 83 |                 assert (
 84 |                     "scrapy-playwright",
 85 |                     logging.WARNING,
 86 |                     f"Closing page due to failed request: {req}"
 87 |                     f" exc_type={type(excinfo.value)} exc_msg={str(excinfo.value)}",
 88 |                 ) in self._caplog.record_tuples
 89 | 
 90 |     @allow_windows
 91 |     async def test_retry_page_content_still_navigating(self):
 92 |         if self.browser_type != "chromium":
 93 |             pytest.skip("Only Chromium seems to redirect meta tags within the same goto call")
 94 | 
 95 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
 96 |             with StaticMockServer() as server:
 97 |                 req = Request(server.urljoin("/redirect.html"), meta={"playwright": True})
 98 |                 resp = await handler._download_request(req, Spider("foo"))
 99 | 
100 |             assert resp.request is req
101 |             assert resp.url == server.urljoin("/index.html")  # redirected
102 |             assert resp.status == 200
103 |             assert "playwright" in resp.flags
104 |             assert (
105 |                 "scrapy-playwright",
106 |                 logging.DEBUG,
107 |                 f"Retrying to get content from page '{req.url}', error: 'Unable to retrieve"
108 |                 " content because the page is navigating and changing the content.'",
109 |             ) in self._caplog.record_tuples
110 | 
111 |     @patch("scrapy_playwright.handler.logger")
112 |     @allow_windows
113 |     async def test_route_continue_exception(self, logger):
114 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
115 |             scrapy_request = Request(url="https://example.org", method="GET")
116 |             spider = Spider("foo")
117 |             initial_request_done = asyncio.Event()
118 |             req_handler = handler._make_request_handler(
119 |                 context_name=DEFAULT_CONTEXT_NAME,
120 |                 method=scrapy_request.method,
121 |                 url=scrapy_request.url,
122 |                 headers=scrapy_request.headers,
123 |                 body=None,
124 |                 encoding="utf-8",
125 |                 spider=spider,
126 |                 initial_request_done=initial_request_done,
127 |             )
128 |             route = MagicMock()
129 |             playwright_request = AsyncMock()
130 |             playwright_request.url = scrapy_request.url
131 |             playwright_request.method = scrapy_request.method
132 |             playwright_request.is_navigation_request = MagicMock(return_value=True)
133 |             playwright_request.all_headers.return_value = {}
134 | 
135 |             # safe error, only warn
136 |             ex = PlaywrightError("Target page, context or browser has been closed")
137 |             route.continue_.side_effect = ex
138 |             await req_handler(route, playwright_request)
139 |             logger.warning.assert_called_with(
140 |                 "Failed processing Playwright request: <%s %s> exc_type=%s exc_msg=%s",
141 |                 playwright_request.method,
142 |                 playwright_request.url,
143 |                 type(ex),
144 |                 str(ex),
145 |                 extra={
146 |                     "spider": spider,
147 |                     "context_name": DEFAULT_CONTEXT_NAME,
148 |                     "scrapy_request_url": scrapy_request.url,
149 |                     "scrapy_request_method": scrapy_request.method,
150 |                     "playwright_request_url": playwright_request.url,
151 |                     "playwright_request_method": playwright_request.method,
152 |                     "exception": ex,
153 |                 },
154 |                 exc_info=True,
155 |             )
156 | 
157 |             # unknown errors, re-raise
158 |             route.continue_.side_effect = ZeroDivisionError("asdf")
159 |             with pytest.raises(ZeroDivisionError):
160 |                 await req_handler(route, playwright_request)
161 |             route.continue_.side_effect = PlaywrightError("qwerty")
162 |             with pytest.raises(PlaywrightError):
163 |                 await req_handler(route, playwright_request)
164 | 
165 |     @allow_windows
166 |     async def test_event_handler_dialog_callable(self):
167 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
168 |             with StaticMockServer() as server:
169 |                 spider = DialogSpider()
170 |                 req = Request(
171 |                     url=server.urljoin("/index.html"),
172 |                     meta={
173 |                         "playwright": True,
174 |                         "playwright_page_methods": [
175 |                             # trigger an alert
176 |                             PageMethod("evaluate", "alert('foobar');"),
177 |                         ],
178 |                         "playwright_page_event_handlers": {
179 |                             "dialog": spider.handle_dialog,
180 |                         },
181 |                     },
182 |                 )
183 |                 await handler._download_request(req, spider)
184 | 
185 |             assert spider.dialog_message == "foobar"
186 | 
187 |     @allow_windows
188 |     async def test_event_handler_dialog_str(self):
189 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
190 |             with StaticMockServer() as server:
191 |                 spider = DialogSpider()
192 |                 req = Request(
193 |                     url=server.urljoin("/index.html"),
194 |                     meta={
195 |                         "playwright": True,
196 |                         "playwright_page_methods": [
197 |                             # trigger an alert
198 |                             PageMethod("evaluate", "alert('foobar');"),
199 |                         ],
200 |                         "playwright_page_event_handlers": {
201 |                             "dialog": "handle_dialog",
202 |                         },
203 |                     },
204 |                 )
205 |                 await handler._download_request(req, spider)
206 | 
207 |             assert spider.dialog_message == "foobar"
208 | 
209 |     @allow_windows
210 |     async def test_event_handler_dialog_missing(self):
211 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
212 |             with StaticMockServer() as server:
213 |                 spider = DialogSpider()
214 |                 req = Request(
215 |                     url=server.urljoin("/index.html"),
216 |                     meta={
217 |                         "playwright": True,
218 |                         "playwright_page_event_handlers": {
219 |                             "dialog": "missing_method",
220 |                         },
221 |                     },
222 |                 )
223 |                 await handler._download_request(req, spider)
224 | 
225 |         assert (
226 |             "scrapy-playwright",
227 |             logging.WARNING,
228 |             "Spider 'dialog' does not have a 'missing_method' attribute,"
229 |             " ignoring handler for event 'dialog'",
230 |         ) in self._caplog.record_tuples
231 |         assert getattr(spider, "dialog_message", None) is None
232 | 
233 |     @allow_windows
234 |     async def test_response_attributes(self):
235 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
236 |             with MockServer() as server:
237 |                 req = Request(
238 |                     url=server.urljoin(),
239 |                     meta={"playwright": True},
240 |                 )
241 |                 response = await handler._download_request(req, Spider("spider_name"))
242 | 
243 |         assert response.ip_address == ip_address(server.address)
244 | 
245 |     @allow_windows
246 |     async def test_page_goto_kwargs_referer(self):
247 |         if self.browser_type != "chromium":
248 |             pytest.skip("referer as goto kwarg seems to work only with chromium :shrug:")
249 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
250 |             with MockServer() as server:
251 |                 fake_referer = server.urljoin("/fake/referer")
252 |                 req = Request(
253 |                     url=server.urljoin("/headers"),
254 |                     meta={
255 |                         "playwright": True,
256 |                         "playwright_page_goto_kwargs": {"referer": fake_referer},
257 |                     },
258 |                 )
259 |                 response = await handler._download_request(req, Spider("spider_name"))
260 | 
261 |         headers = json.loads(response.css("pre::text").get())
262 |         assert headers["Referer"] == fake_referer
263 | 
264 |     @allow_windows
265 |     async def test_navigation_returns_none(self):
266 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
267 |             with MockServer():
268 |                 req = Request(url="about:blank", meta={"playwright": True})
269 |                 response = await handler._download_request(req, Spider("spider_name"))
270 | 
271 |         assert (
272 |             "scrapy-playwright",
273 |             logging.WARNING,
274 |             f"Navigating to {req!r} returned None, the response"
275 |             " will have empty headers and status 200",
276 |         ) in self._caplog.record_tuples
277 |         assert not response.headers
278 |         assert response.status == 200
279 | 
280 |     @allow_windows
281 |     async def test_abort_requests(self):
282 |         async def should_abort_request_async(request):
283 |             return request.resource_type == "image"
284 | 
285 |         def should_abort_request_sync(request):
286 |             return request.resource_type == "image"
287 | 
288 |         for predicate in (
289 |             lambda request: request.resource_type == "image",
290 |             should_abort_request_async,
291 |             should_abort_request_sync,
292 |         ):
293 |             settings_dict = {
294 |                 "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
295 |                 "PLAYWRIGHT_ABORT_REQUEST": predicate,
296 |             }
297 |             async with make_handler(settings_dict) as handler:
298 |                 with StaticMockServer() as server:
299 |                     req = Request(
300 |                         url=server.urljoin("/gallery.html"),
301 |                         meta={"playwright": True},
302 |                     )
303 |                     await handler._download_request(req, Spider("foo"))
304 | 
305 |                     req_prefix = "playwright/request_count"
306 |                     resp_prefix = "playwright/response_count"
307 |                     assert handler.stats.get_value(f"{req_prefix}/resource_type/document") == 1
308 |                     assert handler.stats.get_value(f"{req_prefix}/resource_type/image") == 3
309 |                     assert handler.stats.get_value(f"{resp_prefix}/resource_type/document") == 1
310 |                     assert handler.stats.get_value(f"{resp_prefix}/resource_type/image") is None
311 |                     assert handler.stats.get_value(f"{req_prefix}/aborted") == 3
312 | 
313 |     @allow_windows
314 |     async def test_page_initialization_ok(self):
315 |         async def init_page(page, _request):
316 |             await page.set_extra_http_headers({"Extra-Header": "Qwerty"})
317 | 
318 |         settings_dict = {
319 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
320 |             "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
321 |         }
322 |         async with make_handler(settings_dict) as handler:
323 |             with MockServer() as server:
324 |                 req = Request(
325 |                     url=server.urljoin("/headers"),
326 |                     meta={"playwright": True, "playwright_page_init_callback": init_page},
327 |                 )
328 |                 response = await handler._download_request(req, Spider("spider_name"))
329 |         assert response.status == 200
330 |         headers = json.loads(response.css("pre::text").get())
331 |         headers = {key.lower(): value for key, value in headers.items()}
332 |         assert headers["extra-header"] == "Qwerty"
333 | 
334 |     @allow_windows
335 |     async def test_page_initialization_fail(self):
336 |         async def init_page(page, _request, _missing):
337 |             await page.set_extra_http_headers({"Extra-Header": "Qwerty"})
338 | 
339 |         settings_dict = {
340 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
341 |             "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
342 |         }
343 |         async with make_handler(settings_dict) as handler:
344 |             with MockServer() as server:
345 |                 req = Request(
346 |                     url=server.urljoin("/headers"),
347 |                     meta={"playwright": True, "playwright_page_init_callback": init_page},
348 |                 )
349 |                 response = await handler._download_request(req, Spider("spider_name"))
350 |         assert response.status == 200
351 |         headers = json.loads(response.css("pre::text").get())
352 |         headers = {key.lower(): value for key, value in headers.items()}
353 |         assert "extra-header" not in headers
354 |         for entry in self._caplog.record_tuples:
355 |             if "Page init callback exception for" in entry[2]:
356 |                 assert entry[0] == "scrapy-playwright"
357 |                 assert entry[1] == logging.WARNING
358 |                 assert f"[Context=default] Page init callback exception for {req!r}" in entry[2]
359 |                 assert "init_page() missing 1 required positional argument: '_missing'" in entry[2]
360 | 
361 |     @allow_windows
362 |     async def test_redirect(self):
363 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
364 |             with MockServer() as server:
365 |                 req = Request(
366 |                     url=server.urljoin("/redirect2"),
367 |                     meta={"playwright": True},
368 |                 )
369 |                 response = await handler._download_request(req, Spider("spider_name"))
370 | 
371 |         assert response.url == server.urljoin("/headers")
372 |         assert response.meta["redirect_times"] == 2
373 |         assert response.meta["redirect_reasons"] == [302, 301]
374 |         assert response.meta["redirect_urls"] == [
375 |             server.urljoin("/redirect2"),
376 |             server.urljoin("/redirect"),
377 |         ]
378 | 
379 |     @allow_windows
380 |     async def test_logging_record_spider(self):
381 |         """Make sure at least one log record has the spider as an attribute
382 |         (records sent before opening the spider will not have it).
383 |         """
384 |         spider = Spider("spider_name")
385 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
386 |             with MockServer() as server:
387 |                 req = Request(url=server.urljoin("/index.html"), meta={"playwright": True})
388 |                 await handler._download_request(req, spider)
389 | 
390 |         assert any(getattr(rec, "spider", None) is spider for rec in self._caplog.records)
391 | 
392 |     @allow_windows
393 |     @patch("scrapy_playwright.handler._make_request_logger")
394 |     async def test_request_logger_disabled(self, make_request_logger: MagicMock):
395 |         self._caplog.set_level(logging.DEBUG + 1, "scrapy-playwright")
396 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
397 |             with MockServer() as server:
398 |                 req = Request(url=server.urljoin("/index.html"), meta={"playwright": True})
399 |                 await handler._download_request(req, Spider("foo"))
400 | 
401 |         debug_message = (
402 |             f"[Context=default] Request: <{req.method} {req.url}> (resource type: document)"
403 |         )
404 |         assert not any(rec.message == debug_message for rec in self._caplog.records)
405 |         make_request_logger.assert_not_called()
406 | 
407 |     @allow_windows
408 |     async def test_request_logger_enabled(self):
409 |         self._caplog.set_level(logging.DEBUG, "scrapy-playwright")
410 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
411 |             with MockServer() as server:
412 |                 req = Request(url=server.urljoin("/index.html"), meta={"playwright": True})
413 |                 await handler._download_request(req, Spider("foo"))
414 | 
415 |         debug_message = (
416 |             f"[Context=default] Request: <{req.method} {req.url}> (resource type: document)"
417 |         )
418 |         assert any(rec.message == debug_message for rec in self._caplog.records)
419 | 
420 |     @allow_windows
421 |     @patch("scrapy_playwright.handler._make_response_logger")
422 |     async def test_response_logger_disabled(self, make_response_logger: MagicMock):
423 |         self._caplog.set_level(logging.DEBUG + 1, "scrapy-playwright")
424 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
425 |             with MockServer() as server:
426 |                 req = Request(url=server.urljoin("/index.html"), meta={"playwright": True})
427 |                 response = await handler._download_request(req, Spider("foo"))
428 | 
429 |         debug_message = f"[Context=default] Response: <{response.status} {response.url}>"
430 |         assert not any(rec.message == debug_message for rec in self._caplog.records)
431 |         make_response_logger.assert_not_called()
432 | 
433 |     @allow_windows
434 |     async def test_response_logger_enabled(self):
435 |         self._caplog.set_level(logging.DEBUG, "scrapy-playwright")
436 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
437 |             with MockServer() as server:
438 |                 request = Request(url=server.urljoin("/index.html"), meta={"playwright": True})
439 |                 response = await handler._download_request(request, Spider("foo"))
440 | 
441 |         debug_message = f"[Context=default] Response: <{response.status} {response.url}>"
442 |         assert any(rec.message == debug_message for rec in self._caplog.records)
443 | 
444 |     @allow_windows
445 |     async def test_download_file_ok(self):
446 |         settings_dict = {
447 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
448 |         }
449 |         async with make_handler(settings_dict) as handler:
450 |             with MockServer() as server:
451 |                 request = Request(
452 |                     url=server.urljoin("mancha.pdf"),
453 |                     meta={"playwright": True},
454 |                 )
455 |                 response = await handler._download_request(request, Spider("foo"))
456 |                 assert response.meta["playwright_suggested_filename"] == "mancha.pdf"
457 |                 assert response.body.startswith(b"%PDF-1.5")
458 |                 assert response.headers.get("Content-Type") == b"application/pdf"
459 |                 assert handler.stats.get_value("playwright/download_count") == 1
460 | 
461 |     @allow_windows
462 |     async def test_download_file_delay_ok(self):
463 |         settings_dict = {
464 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
465 |             "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0,
466 |         }
467 |         async with make_handler(settings_dict) as handler:
468 |             with MockServer() as server:
469 |                 request = Request(
470 |                     url=server.urljoin("/mancha.pdf?delay=1"),
471 |                     meta={"playwright": True},
472 |                 )
473 |                 response = await handler._download_request(request, Spider("foo"))
474 |                 assert response.meta["playwright_suggested_filename"] == "mancha.pdf"
475 |                 assert response.body.startswith(b"%PDF-1.5")
476 |                 assert handler.stats.get_value("playwright/download_count") == 1
477 | 
478 |     @allow_windows
479 |     async def test_download_file_delay_error(self):
480 |         settings_dict = {
481 |             "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
482 |             "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 10,
483 |         }
484 |         async with make_handler(settings_dict) as handler:
485 |             with MockServer() as server:
486 |                 request = Request(
487 |                     url=server.urljoin("/mancha.pdf?delay=1"),
488 |                     meta={"playwright": True},
489 |                 )
490 |                 with pytest.raises(PlaywrightError) as excinfo:
491 |                     await handler._download_request(request, Spider("foo"))
492 |                 assert (
493 |                     "scrapy-playwright",
494 |                     logging.WARNING,
495 |                     f"Closing page due to failed request: {request}"
496 |                     f" exc_type={type(excinfo.value)} exc_msg={str(excinfo.value)}",
497 |                 ) in self._caplog.record_tuples
498 | 
499 |     @allow_windows
500 |     async def test_download_file_failure(self):
501 |         if self.browser_type != "chromium":
502 |             pytest.skip()
503 | 
504 |         async def cancel_download(download):
505 |             await download.cancel()
506 | 
507 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
508 |             with MockServer() as server:
509 |                 request = Request(
510 |                     url=server.urljoin("/mancha.pdf?content_length_multiplier=1000"),
511 |                     meta={
512 |                         "playwright": True,
513 |                         "playwright_event_handlers": {"download": cancel_download},
514 |                     },
515 |                 )
516 |                 with pytest.raises(RuntimeError) as excinfo:
517 |                     await handler._download_request(request, Spider("foo"))
518 |                 assert (
519 |                     "scrapy-playwright",
520 |                     logging.WARNING,
521 |                     f"Closing page due to failed request: {request}"
522 |                     f" exc_type={type(excinfo.value)} exc_msg={str(excinfo.value)}",
523 |                 ) in self._caplog.record_tuples
524 | 
525 |     @allow_windows
526 |     async def test_fail_status_204(self):
527 |         async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
528 |             with MockServer() as server:
529 |                 request = Request(
530 |                     url=server.urljoin("/status/204"),
531 |                     meta={"playwright": True},
532 |                 )
533 |                 with pytest.raises(PlaywrightError) as excinfo:
534 |                     await handler._download_request(request, Spider("foo"))
535 |                 assert (
536 |                     "scrapy-playwright",
537 |                     logging.WARNING,
538 |                     f"Closing page due to failed request: {request}"
539 |                     f" exc_type={type(excinfo.value)} exc_msg={str(excinfo.value)}",
540 |                 ) in self._caplog.record_tuples
541 | 
542 | 
543 | class TestCaseChromium(IsolatedAsyncioTestCase, MixinTestCase):
544 |     browser_type = "chromium"
545 | 
546 | 
547 | class TestCaseFirefox(IsolatedAsyncioTestCase, MixinTestCase):
548 |     browser_type = "firefox"
549 | 
550 | 
551 | @pytest.mark.skipif(platform.system() != "Darwin", reason="Test WebKit only on Darwin")
552 | class TestCaseWebkit(IsolatedAsyncioTestCase, MixinTestCase):
553 |     browser_type = "webkit"
554 | 


--------------------------------------------------------------------------------
/tests/tests_asyncio/test_settings.py:
--------------------------------------------------------------------------------
 1 | from unittest import IsolatedAsyncioTestCase
 2 | 
 3 | import pytest
 4 | from scrapy.exceptions import NotSupported
 5 | from scrapy.settings import Settings
 6 | 
 7 | from scrapy_playwright.handler import Config
 8 | 
 9 | from tests import allow_windows, make_handler
10 | 
11 | 
12 | class TestSettings(IsolatedAsyncioTestCase):
13 |     async def test_settings_timeout_value(self):
14 |         config = Config.from_settings(Settings({}))
15 |         assert config.navigation_timeout is None
16 | 
17 |         config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": None}))
18 |         assert config.navigation_timeout is None
19 | 
20 |         config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0}))
21 |         assert config.navigation_timeout == 0
22 | 
23 |         config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 123}))
24 |         assert config.navigation_timeout == 123
25 | 
26 |         config = Config.from_settings(Settings({"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 0.5}))
27 |         assert config.navigation_timeout == 0.5
28 | 
29 |     async def test_max_pages_per_context(self):
30 |         config = Config.from_settings(Settings({"PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1234}))
31 |         assert config.max_pages_per_context == 1234
32 | 
33 |         config = Config.from_settings(Settings({"CONCURRENT_REQUESTS": 9876}))
34 |         assert config.max_pages_per_context == 9876
35 | 
36 |     async def test_connect_remote_urls(self):
37 |         with pytest.raises(NotSupported) as exc_info:
38 |             Config.from_settings(
39 |                 Settings({"PLAYWRIGHT_CONNECT_URL": "asdf", "PLAYWRIGHT_CDP_URL": "qwerty"})
40 |             )
41 |         assert (
42 |             str(exc_info.value)
43 |             == "Setting both PLAYWRIGHT_CDP_URL and PLAYWRIGHT_CONNECT_URL is not supported"
44 |         )
45 | 
46 |     @allow_windows
47 |     async def test_max_contexts(self):
48 |         async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": None}) as handler:
49 |             assert not hasattr(handler, "context_semaphore")
50 | 
51 |         async with make_handler({"PLAYWRIGHT_MAX_CONTEXTS": 1234}) as handler:
52 |             assert handler.context_semaphore._value == 1234
53 | 


--------------------------------------------------------------------------------
/tests/tests_asyncio/test_utils.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from decimal import Decimal
  3 | from unittest import IsolatedAsyncioTestCase
  4 | from unittest.mock import AsyncMock
  5 | 
  6 | import pytest
  7 | from playwright.async_api import Error as PlaywrightError
  8 | from scrapy import Spider
  9 | from scrapy.http.headers import Headers
 10 | from scrapy.settings import Settings
 11 | from scrapy_playwright._utils import (
 12 |     _NAVIGATION_ERROR_MSG,
 13 |     _encode_body,
 14 |     _get_float_setting,
 15 |     _get_header_value,
 16 |     _get_page_content,
 17 |     _maybe_await,
 18 | )
 19 | 
 20 | 
 21 | class TestPageContent(IsolatedAsyncioTestCase):
 22 |     @pytest.fixture(autouse=True)
 23 |     def inject_fixtures(self, caplog):
 24 |         caplog.set_level(logging.DEBUG)
 25 |         self._caplog = caplog
 26 | 
 27 |     async def test_get_page_content_ok(self):
 28 |         expected_content = "lorem ipsum"
 29 |         page = AsyncMock()
 30 |         page.content.return_value = expected_content
 31 |         content = await _get_page_content(
 32 |             page=page,
 33 |             spider=Spider("foo"),
 34 |             context_name="context",
 35 |             scrapy_request_url="https://example.org",
 36 |             scrapy_request_method="GET",
 37 |         )
 38 |         assert content == expected_content
 39 | 
 40 |     async def test_get_page_content_retry_known_exception(self):
 41 |         expected_content = "lorem ipsum"
 42 |         page = AsyncMock()
 43 |         page.url = "FAKE URL"
 44 |         page.content.side_effect = [PlaywrightError(_NAVIGATION_ERROR_MSG), expected_content]
 45 |         content = await _get_page_content(
 46 |             page=page,
 47 |             spider=Spider("foo"),
 48 |             context_name="context",
 49 |             scrapy_request_url="https://example.org",
 50 |             scrapy_request_method="GET",
 51 |         )
 52 |         assert content == expected_content
 53 |         assert (
 54 |             "scrapy-playwright",
 55 |             logging.DEBUG,
 56 |             f"Retrying to get content from page '{page.url}', error: 'Unable to retrieve"
 57 |             " content because the page is navigating and changing the content.'",
 58 |         ) in self._caplog.record_tuples
 59 | 
 60 |     async def test_get_page_content_reraise_unknown_exception(self):
 61 |         expected_exception_message = "nope"
 62 |         page = AsyncMock()
 63 |         page.content.side_effect = PlaywrightError(expected_exception_message)
 64 |         with pytest.raises(PlaywrightError, match=expected_exception_message):
 65 |             await _get_page_content(
 66 |                 page=page,
 67 |                 spider=Spider("foo"),
 68 |                 context_name="context",
 69 |                 scrapy_request_url="https://example.org",
 70 |                 scrapy_request_method="GET",
 71 |             )
 72 | 
 73 | 
 74 | class TestBodyEncoding(IsolatedAsyncioTestCase):
 75 |     @staticmethod
 76 |     def body_str(charset: str, content: str = "áéíóú") -> str:
 77 |         return f"""
 78 |             <!doctype html>
 79 |             <html>
 80 |             <head>
 81 |             <meta charset="{charset}">
 82 |             </head>
 83 |             <body>
 84 |             <p>{content}</p>
 85 |             </body>
 86 |             </html>
 87 |         """.strip()
 88 | 
 89 |     async def test_encode_from_headers(self):
 90 |         """Charset declared in headers takes precedence"""
 91 |         text = self.body_str(charset="gb2312")
 92 |         body, encoding = _encode_body(
 93 |             headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),
 94 |             text=text,
 95 |         )
 96 |         assert encoding == "cp1252"
 97 |         assert body == text.encode(encoding)
 98 | 
 99 |     async def test_encode_from_body(self):
100 |         """No charset declared in headers, use the one declared in the body"""
101 |         text = self.body_str(charset="gb2312")
102 |         body, encoding = _encode_body(headers=Headers({}), text=text)
103 |         assert encoding == "gb18030"
104 |         assert body == text.encode(encoding)
105 | 
106 |     async def test_encode_fallback_utf8(self):
107 |         """No charset declared, use utf-8 as fallback"""
108 |         text = "<html>áéíóú</html>"
109 |         body, encoding = _encode_body(headers=Headers(), text=text)
110 |         assert encoding == "utf-8"
111 |         assert body == text.encode(encoding)
112 | 
113 |     async def test_encode_mismatch(self):
114 |         """Charset declared in headers and body do not match, and the headers
115 |         one fails to encode: use the one in the body (first one that works)
116 |         """
117 |         text = self.body_str(charset="gb2312", content="空手道")
118 |         body, encoding = _encode_body(
119 |             headers=Headers({"content-type": "text/html; charset=ISO-8859-1"}),
120 |             text=text,
121 |         )
122 |         assert encoding == "gb18030"
123 |         assert body == text.encode(encoding)
124 | 
125 | 
126 | class TestHeaderValue(IsolatedAsyncioTestCase):
127 |     async def test_get_header_value(self):
128 |         async def _identity(x):
129 |             return x
130 | 
131 |         res1 = AsyncMock()
132 |         res1.header_value = _identity
133 |         assert "asdf" == await _get_header_value(res1, "asdf")
134 |         assert "qwerty" == await _get_header_value(res1, "qwerty")
135 | 
136 |         res2 = AsyncMock()
137 |         res2.header_value.side_effect = Exception("nope")
138 |         assert await _get_header_value(res2, "asdf") is None
139 |         assert await _get_header_value(res2, "qwerty") is None
140 | 
141 | 
142 | class TestMaybeAwait(IsolatedAsyncioTestCase):
143 |     async def test_maybe_await(self):
144 |         async def _awaitable_identity(x):
145 |             return x
146 | 
147 |         assert await _maybe_await(_awaitable_identity("asdf")) == "asdf"
148 |         assert await _maybe_await(_awaitable_identity("qwerty")) == "qwerty"
149 |         assert await _maybe_await(_awaitable_identity(1234)) == 1234
150 |         assert await _maybe_await("foo") == "foo"
151 |         assert await _maybe_await("bar") == "bar"
152 |         assert await _maybe_await(1234) == 1234
153 | 
154 | 
155 | class TestGetFloatSetting(IsolatedAsyncioTestCase):
156 |     async def test_get_float_setting(self):
157 |         settings = Settings(
158 |             {
159 |                 "ZERO": 0,
160 |                 "FLOAT": 1.5,
161 |                 "DECIMAL": Decimal("2.5"),
162 |                 "INT": 3,
163 |                 "NUMERIC_STRING": "123",
164 |                 "NON_NUMERIC_STRING": "asdf",
165 |                 "NONE": None,
166 |                 "LIST": [1, 2, 3],
167 |             }
168 |         )
169 |         assert _get_float_setting(settings, "ZERO") == 0.0
170 |         assert _get_float_setting(settings, "FLOAT") == 1.5
171 |         assert _get_float_setting(settings, "DECIMAL") == 2.5
172 |         assert _get_float_setting(settings, "INT") == 3.0
173 |         assert _get_float_setting(settings, "NUMERIC_STRING") == 123
174 |         assert _get_float_setting(settings, "NON_NUMERIC_STRING") is None
175 |         assert _get_float_setting(settings, "NONE") is None
176 |         assert _get_float_setting(settings, "LIST") is None
177 |         assert _get_float_setting(settings, "MISSING_KEY") is None
178 | 


--------------------------------------------------------------------------------
/tests/tests_twisted/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapy-plugins/scrapy-playwright/97717e94b9143eaac422295008a0e9e11253a01c/tests/tests_twisted/__init__.py


--------------------------------------------------------------------------------
/tests/tests_twisted/test_mixed_requests.py:
--------------------------------------------------------------------------------
 1 | from scrapy import Spider
 2 | from scrapy.http import Request, Response
 3 | from scrapy.utils.test import get_crawler
 4 | from twisted.internet import defer
 5 | from twisted.trial.unittest import TestCase
 6 | 
 7 | from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler
 8 | from tests.mockserver import StaticMockServer
 9 | 
10 | 
11 | class MixedRequestsTestCase(TestCase):
12 |     """
13 |     This test case ensures the handler's 'download_request' method works as expected, and
14 |     non-playwright requests are processed correctly. The rest of the tests directly call
15 |     '_download_request', which is a coroutine ('download_request' returns a Deferred).
16 |     """
17 | 
18 |     timeout_ms = 500
19 | 
20 |     @defer.inlineCallbacks
21 |     def setUp(self):
22 |         self.server = StaticMockServer()
23 |         self.server.__enter__()
24 |         self.handler = ScrapyPlaywrightDownloadHandler.from_crawler(
25 |             get_crawler(settings_dict={"PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": self.timeout_ms})
26 |         )
27 |         yield self.handler._engine_started()
28 | 
29 |     @defer.inlineCallbacks
30 |     def tearDown(self):
31 |         self.server.__exit__(None, None, None)
32 |         yield self.handler.close()
33 | 
34 |     @defer.inlineCallbacks
35 |     def test_download_request(self):
36 |         def _check_regular(response, request):
37 |             self.assertIsInstance(response, Response)
38 |             self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
39 |             self.assertEqual(response.url, request.url)
40 |             self.assertEqual(response.status, 200)
41 |             self.assertNotIn("playwright", response.flags)
42 | 
43 |         def _check_playwright_ok(response, request):
44 |             self.assertIsInstance(response, Response)
45 |             self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
46 |             self.assertEqual(response.url, request.url)
47 |             self.assertEqual(response.status, 200)
48 |             self.assertIn("playwright", response.flags)
49 | 
50 |         def _check_playwright_error(failure, url):
51 |             # different errors depending on the platform
52 |             self.assertTrue(
53 |                 f"Page.goto: net::ERR_CONNECTION_REFUSED at {url}" in str(failure.value)
54 |                 or f"Page.goto: Timeout {self.timeout_ms}ms exceeded" in str(failure.value)
55 |             )
56 | 
57 |         req1 = Request(self.server.urljoin("/index.html"))
58 |         yield self.handler.download_request(req1, Spider("foo")).addCallback(
59 |             _check_regular, request=req1
60 |         )
61 | 
62 |         req2 = Request(self.server.urljoin("/index.html"), meta={"playwright": True})
63 |         yield self.handler.download_request(req2, Spider("foo")).addCallback(
64 |             _check_playwright_ok, request=req2
65 |         )
66 | 
67 |         req3 = Request("http://localhost:12345/asdf", meta={"playwright": True})
68 |         yield self.handler.download_request(req3, Spider("foo")).addErrback(
69 |             _check_playwright_error, url=req3.url
70 |         )
71 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = bandit,black,flake8,typing,pylint,py,py-twisted
 3 | 
 4 | [testenv]
 5 | deps =
 6 |     pytest==7.4.0
 7 |     pytest_cov==4.1.0
 8 |     pytest_twisted==1.14
 9 |     psutil==5.9.7
10 |     playwright==1.44  # version must match the one installed with npm below
11 | allowlist_externals =
12 |     npm
13 |     npx
14 | commands =
15 |     playwright install --with-deps
16 |     npm install playwright@1.44
17 |     npx playwright install chromium
18 |     py.test -vv --reactor=asyncio \
19 |         --cov-report=term-missing \
20 |         --cov-report=xml:coverage-asyncio.xml \
21 |         --cov-report=html:coverage-asyncio \
22 |         --cov=scrapy_playwright {posargs: scrapy_playwright tests/tests_asyncio}
23 | setenv =
24 |     DEBUG=pw:api
25 | 
26 | [testenv:py]
27 | basepython = python3
28 | 
29 | [testenv:py-twisted]
30 | basepython = python3
31 | commands =
32 |     playwright install --with-deps
33 |     py.test -vv --reactor=asyncio \
34 |         --cov-report=term-missing \
35 |         --cov-report=xml:coverage-twisted.xml \
36 |         --cov-report=html:coverage-twisted \
37 |         --cov=scrapy_playwright {posargs: scrapy_playwright tests/tests_twisted}
38 | 
39 | [testenv:bandit]
40 | deps =
41 |     bandit
42 | commands =
43 |     bandit -r {posargs: scrapy_playwright setup.py examples}
44 | 
45 | [testenv:black]
46 | deps =
47 |     black==24.4.2
48 | commands =
49 |     black --check {posargs: scrapy_playwright setup.py tests examples}
50 | 
51 | [testenv:flake8]
52 | deps =
53 |     flake8==7.0.0
54 | commands =
55 |     flake8 --exclude=.git,.tox,venv* {posargs: scrapy_playwright setup.py tests examples}
56 | 
57 | [testenv:typing]
58 | deps =
59 |     mypy==1.10.0
60 | commands =
61 |     mypy --show-error-codes --ignore-missing-imports \
62 |         --follow-imports=skip {posargs: scrapy_playwright setup.py tests examples}
63 | 
64 | [testenv:pylint]
65 | deps =
66 |     psutil==5.9.7
67 |     pylint==3.2.2
68 |     pytest==7.4.0
69 | commands =
70 |     pip install -e .
71 |     pylint {posargs: scrapy_playwright setup.py tests}
72 | 


--------------------------------------------------------------------------------