├── .coveragerc ├── .github └── workflows │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── CHANGES.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── example ├── scrapy.cfg └── scrashtest │ ├── __init__.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── quotes.py ├── pyproject.toml ├── pytest.ini ├── scrapy_splash ├── __init__.py ├── cache.py ├── cookies.py ├── dupefilter.py ├── middleware.py ├── request.py ├── response.py ├── responsetypes.py └── utils.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── mockserver.py ├── resources.py ├── test_cookies.py ├── test_fingerprints.py ├── test_integration.py ├── test_middleware.py ├── test_request.py ├── test_utils.py └── utils.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = true 3 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | push: 4 | tags: 5 | - '[0-9]+.[0-9]+.[0-9]+' 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | environment: 10 | name: pypi 11 | url: https://pypi.org/p/${{ github.event.repository.name }} 12 | permissions: 13 | id-token: write 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: 3.13 19 | - run: | 20 | python -m pip install --upgrade build 21 | python -m build 22 | - name: Publish to PyPI 23 | uses: pypa/gh-action-pypi-publish@release/v1 24 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 7 | runs-on: ${{ matrix.os || 'ubuntu-latest' }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | include: 12 | - python-version: '3.9' 13 | - python-version: '3.10' 14 | - python-version: '3.11' 15 | - python-version: '3.12' 16 | - python-version: '3.13' 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | 21 | - name: Run Splash 22 | run: | 23 | docker run --rm -d -p 8050:8050 --network host scrapinghub/splash 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Run tests 31 | env: 32 | TOXENV: py 33 | run: | 34 | pip install -U tox 35 | SPLASH_URL=http://127.0.0.1:8050 tox 36 | 37 | - name: Upload coverage report 38 | uses: codecov/codecov-action@v5 39 | with: 40 | token: ${{ secrets.CODECOV_TOKEN }} 41 | 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .tox 3 | build 4 | dist 5 | scrapyjs.egg-info 6 | scrapy_splash.egg-info 7 | .cache 8 | .coverage 9 | .scrapy 10 | htmlcov 11 | .hypothesis 12 | .ipynb_checkpoints 13 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 0.11.1 (2025-02-11) 5 | ------------------- 6 | 7 | * Fixed ``SplashAwareDupeFilter`` failing to initialize. 
8 | 9 | 0.11.0 (2025-02-06) 10 | ------------------- 11 | 12 | * Scrapy 2.4 or higher is now required. 13 | 14 | * The ``url`` parameter of ``SplashRequest`` is once again optional, reverting 15 | a backward-incompatible change from scrapy-splash 0.9.0. 16 | 17 | 0.10.1 (2025-01-27) 18 | ------------------- 19 | 20 | * Fixed ``SplashAwareDupeFilter`` failing to initialize. 21 | 22 | * Improved the README. 23 | 24 | 0.10.0 (2025-01-21) 25 | ------------------- 26 | 27 | * Removed official support for Python 3.7 and 3.8, and added official support 28 | for Python 3.12 and 3.13. 29 | 30 | * Added support for Scrapy 2.12+. 31 | 32 | This includes deprecating ``SplashAwareDupeFilter`` and 33 | ``SplashAwareFSCacheStorage`` in favor of the corresponding built-in, default 34 | Scrapy components, and instead using the new ``SplashRequestFingerprinter`` 35 | component to ensure request fingerprinting for Splash requests stays the 36 | same, now for every Scrapy component doing request fingerprinting and not 37 | only for duplicate filtering and HTTP caching. 38 | 39 | 0.9.0 (2023-02-03) 40 | ------------------ 41 | 42 | * Removed official support for Python 2.7, 3.4, 3.5 and 3.6, and added official 43 | support for Python 3.9, 3.10 and 3.11. 44 | 45 | * Deprecated ``SplashJsonResponse.body_as_unicode()``, to be replaced by 46 | ``SplashJsonResponse.text``. 47 | 48 | * Removed calls to obsolete ``to_native_str``, removed in Scrapy 2.8. 49 | 50 | 0.8.0 (2021-10-05) 51 | ------------------ 52 | 53 | * **Security bug fix:** 54 | 55 | If you use HttpAuthMiddleware_ (i.e. the ``http_user`` and ``http_pass`` 56 | spider attributes) for Splash authentication, any non-Splash request will 57 | expose your credentials to the request target. This includes ``robots.txt`` 58 | requests sent by Scrapy when the ``ROBOTSTXT_OBEY`` setting is set to 59 | ``True``. 60 | 61 | Use the new ``SPLASH_USER`` and ``SPLASH_PASS`` settings instead to set 62 | your Splash authentication credentials safely. 63 | 64 | .. _HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth 65 | 66 | * Responses now expose the HTTP status code and headers from Splash as 67 | ``response.splash_response_status`` and 68 | ``response.splash_response_headers`` (#158) 69 | 70 | * The ``meta`` argument passed to the ``scrapy_splash.request.SplashRequest`` 71 | constructor is no longer modified (#164) 72 | 73 | * Website responses with 400 or 498 as HTTP status code are no longer 74 | handled as the equivalent Splash responses (#158) 75 | 76 | * Cookies are no longer sent to Splash itself (#156) 77 | 78 | * ``scrapy_splash.utils.dict_hash`` now also works with ``obj=None`` 79 | (``225793b``) 80 | 81 | * Our test suite now includes integration tests (#156) and tests can be run 82 | in parallel (``6fb8c41``) 83 | 84 | * There’s a new ‘Getting help’ section in the ``README.rst`` file (#161, 85 | #162), the documentation about ``SPLASH_SLOT_POLICY`` has been improved 86 | (#157) and a typo as been fixed (#121) 87 | 88 | * Made some internal improvements (``ee5000d``, ``25de545``, ``2aaa79d``) 89 | 90 | 91 | 0.7.2 (2017-03-30) 92 | ------------------ 93 | 94 | * fixed issue with response type detection. 95 | 96 | 0.7.1 (2016-12-20) 97 | ------------------ 98 | 99 | * Scrapy 1.0.x support is back; 100 | * README updates. 
101 | 102 | 0.7 (2016-05-16) 103 | ---------------- 104 | 105 | * ``SPLASH_COOKIES_DEBUG`` setting allows to log cookies 106 | sent and received to/from Splash in ``cookies`` request/response fields. 107 | It is similar to Scrapy's builtin ``COOKIES_DEBUG``, but works for 108 | Splash requests; 109 | * README cleanup. 110 | 111 | 0.6.1 (2016-04-29) 112 | ------------------ 113 | 114 | * Warning about HTTP methods is no longer logged for non-Splash requests. 115 | 116 | 0.6 (2016-04-20) 117 | ---------------- 118 | 119 | * ``SplashAwareDupeFilter`` and ``splash_request_fingerprint`` are improved: 120 | they now canonicalize URLs and take URL fragments in account; 121 | * ``cache_args`` value fingerprints are now calculated faster. 122 | 123 | 0.5 (2016-04-18) 124 | ---------------- 125 | 126 | * ``cache_args`` SplashRequest argument and 127 | ``request.meta['splash']['cache_args']`` key allow to save network traffic 128 | and disk storage by not storing duplicate Splash arguments in disk request 129 | queues and not sending them to Splash multiple times. This feature requires 130 | Splash 2.1+. 131 | 132 | To upgrade from v0.4 enable ``SplashDeduplicateArgsMiddleware`` in settings.py:: 133 | 134 | SPIDER_MIDDLEWARES = { 135 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 136 | } 137 | 138 | 0.4 (2016-04-14) 139 | ---------------- 140 | 141 | * SplashFormRequest class is added; it is a variant of FormRequest which uses 142 | Splash; 143 | * Splash parameters are no longer stored in request.meta twice; this change 144 | should decrease disk queues data size; 145 | * SplashMiddleware now increases request priority when rescheduling the request; 146 | this should decrease disk queue data size and help with stale cookie 147 | problems. 148 | 149 | 0.3 (2016-04-11) 150 | ---------------- 151 | 152 | Package is renamed from ``scrapyjs`` to ``scrapy-splash``. 153 | 154 | An easiest way to upgrade is to replace ``scrapyjs`` imports with 155 | ``scrapy_splash`` and update ``settings.py`` with new defaults 156 | (check the README). 157 | 158 | There are many new helpers to handle JavaScript rendering transparently; 159 | the recommended way is now to use ``scrapy_splash.SplashRequest`` instead 160 | of ``request.meta['splash']``. Please make sure to read the README if 161 | you're upgrading from scrapyjs - you may be able to drop some code from your 162 | project, especially if you want to access response html, handle cookies 163 | and headers. 164 | 165 | * new SplashRequest class; it can be used as a replacement for scrapy.Request 166 | to provide a better integration with Splash; 167 | * added support for POST requests; 168 | * SplashResponse, SplashTextResponse and SplashJsonResponse allow to 169 | handle Splash responses transparently, taking care of response.url, 170 | response.body, response.headers and response.status. SplashJsonResponse 171 | allows to access decoded response JSON data as ``response.data``. 
172 | * cookie handling improvements: it is possible to handle Scrapy and Splash 173 | cookies transparently; current cookiejar is exposed as response.cookiejar; 174 | * headers are passed to Splash by default; 175 | * URLs with fragments are handled automatically when using SplashRequest; 176 | * logging is improved: ``SplashRequest.__repr__`` shows both requested URL 177 | and Splash URL; 178 | * in case of Splash HTTP 400 errors the response is logged by default; 179 | * an issue with dupefilters is fixed: previously the order of keys in 180 | JSON request body could vary, making requests appear as non-duplicates; 181 | * it is now possible to pass custom headers to Splash server itself; 182 | * test coverage reports are enabled. 183 | 184 | 0.2 (2016-03-26) 185 | ---------------- 186 | 187 | * Scrapy 1.0 and 1.1 support; 188 | * Python 3 support; 189 | * documentation improvements; 190 | * project is moved to https://github.com/scrapy-plugins/scrapy-splash. 191 | 192 | 0.1.1 (2015-03-16) 193 | ------------------ 194 | 195 | Fixed fingerprint calculation for non-string meta values. 196 | 197 | 0.1 (2015-02-28) 198 | ---------------- 199 | 200 | Initial release 201 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapy developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Scrapy-Splash nor the names of its contributors may 15 | be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.rst 3 | include tox.ini 4 | recursive-include tests *.py 5 | recursive-include example *.py *.cfg 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============================================== 2 | Scrapy & JavaScript integration through Splash 3 | ============================================== 4 | 5 | .. image:: https://img.shields.io/pypi/v/scrapy-splash.svg 6 | :target: https://pypi.python.org/pypi/scrapy-splash 7 | :alt: PyPI Version 8 | 9 | .. image:: https://github.com/scrapy-plugins/scrapy-splash/workflows/Tests/badge.svg 10 | :target: https://github.com/scrapy-plugins/scrapy-splash/actions/workflows/tests.yml 11 | :alt: Test Status 12 | 13 | .. image:: http://codecov.io/github/scrapy-plugins/scrapy-splash/coverage.svg?branch=master 14 | :target: http://codecov.io/github/scrapy-plugins/scrapy-splash?branch=master 15 | :alt: Code Coverage 16 | 17 | This library provides Scrapy_ and JavaScript integration using Splash_. 18 | The license is BSD 3-clause. 19 | 20 | .. _Scrapy: https://github.com/scrapy/scrapy 21 | .. _Splash: https://github.com/scrapinghub/splash 22 | 23 | Installation 24 | ============ 25 | 26 | Install scrapy-splash using pip:: 27 | 28 | $ pip install scrapy-splash 29 | 30 | Scrapy-Splash uses Splash_ HTTP API, so you also need a Splash instance. 31 | Usually to install & run Splash, something like this is enough:: 32 | 33 | $ docker run -p 8050:8050 scrapinghub/splash 34 | 35 | Check Splash `install docs`_ for more info. 36 | 37 | .. _install docs: http://splash.readthedocs.org/en/latest/install.html 38 | 39 | 40 | Configuration 41 | ============= 42 | 43 | 1. Add the Splash server address to ``settings.py`` of your Scrapy project 44 | like this:: 45 | 46 | SPLASH_URL = 'http://192.168.59.103:8050' 47 | 48 | 2. Enable the Splash middleware by adding it to ``DOWNLOADER_MIDDLEWARES`` 49 | in your ``settings.py`` file and changing HttpCompressionMiddleware 50 | priority: 51 | 52 | .. code:: python 53 | 54 | DOWNLOADER_MIDDLEWARES = { 55 | 'scrapy_splash.SplashCookiesMiddleware': 723, 56 | 'scrapy_splash.SplashMiddleware': 725, 57 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 58 | } 59 | 60 | Order `723` is just before `HttpProxyMiddleware` (750) in default 61 | scrapy settings. 62 | 63 | HttpCompressionMiddleware priority should be changed in order to allow 64 | advanced response processing; see https://github.com/scrapy/scrapy/issues/1895 65 | for details. 66 | 67 | 3. Enable ``SplashDeduplicateArgsMiddleware`` by adding it to 68 | ``SPIDER_MIDDLEWARES`` in your ``settings.py``: 69 | 70 | .. code:: python 71 | 72 | SPIDER_MIDDLEWARES = { 73 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 74 | } 75 | 76 | This middleware is needed to support ``cache_args`` feature; it allows 77 | to save disk space by not storing duplicate Splash arguments multiple 78 | times in a disk request queue. If Splash 2.1+ is used the middleware 79 | also allows to save network traffic by not sending these duplicate 80 | arguments to Splash server multiple times. 81 | 82 | 4. Set a custom ``REQUEST_FINGERPRINTER_CLASS``: 83 | 84 | .. 
code:: python 85 | 86 | REQUEST_FINGERPRINTER_CLASS = 'scrapy_splash.SplashRequestFingerprinter' 87 | 88 | 89 | There are also some additional options available. 90 | Put them into your ``settings.py`` if you want to change the defaults: 91 | 92 | * ``SPLASH_COOKIES_DEBUG`` is ``False`` by default. 93 | Set to ``True`` to enable cookie debugging in the ``SplashCookiesMiddleware``. 94 | This option is similar to ``COOKIES_DEBUG`` 95 | for the built-in Scrapy cookies middleware: it logs sent and received cookies 96 | for all requests. 97 | * ``SPLASH_LOG_400`` is ``True`` by default - it instructs scrapy-splash to log all 400 errors 98 | from Splash. They are important because they show errors that occurred 99 | when executing the Splash script. Set it to ``False`` to disable this logging. 100 | * ``SPLASH_SLOT_POLICY`` is ``scrapy_splash.SlotPolicy.PER_DOMAIN`` (as object, not just a string) by default. 101 | It specifies how concurrency & politeness are maintained for Splash requests, 102 | and specifies the default value of the ``slot_policy`` argument for 103 | ``SplashRequest``, which is described below. 104 | * ``SCRAPY_SPLASH_REQUEST_FINGERPRINTER_BASE_CLASS`` is ``scrapy.settings.default_settings.REQUEST_FINGERPRINTER_CLASS`` by default. This changes the base class the Fingerprinter uses to get a fingerprint. 105 | 106 | 107 | Usage 108 | ===== 109 | 110 | Requests 111 | -------- 112 | 113 | The easiest way to render requests with Splash is to 114 | use ``scrapy_splash.SplashRequest``: 115 | 116 | .. code:: python 117 | 118 | yield SplashRequest(url, self.parse_result, 119 | args={ 120 | # optional; parameters passed to Splash HTTP API 121 | 'wait': 0.5, 122 | 123 | # 'url' is prefilled from request url 124 | # 'http_method' is set to 'POST' for POST requests 125 | # 'body' is set to request body for POST requests 126 | }, 127 | endpoint='render.json', # optional; default is render.html 128 | splash_url='', # optional; overrides SPLASH_URL 129 | slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN, # optional 130 | ) 131 | 132 | Alternatively, you can use a regular scrapy.Request and 133 | the ``'splash'`` Request `meta` key: 134 | 135 | .. code:: python 136 | 137 | yield scrapy.Request(url, self.parse_result, meta={ 138 | 'splash': { 139 | 'args': { 140 | # set rendering arguments here 141 | 'html': 1, 142 | 'png': 1, 143 | 144 | # 'url' is prefilled from request url 145 | # 'http_method' is set to 'POST' for POST requests 146 | # 'body' is set to request body for POST requests 147 | }, 148 | 149 | # optional parameters 150 | 'endpoint': 'render.json', # optional; default is render.json 151 | 'splash_url': '', # optional; overrides SPLASH_URL 152 | 'slot_policy': scrapy_splash.SlotPolicy.PER_DOMAIN, 153 | 'splash_headers': {}, # optional; a dict with headers sent to Splash 154 | 'dont_process_response': True, # optional, default is False 155 | 'dont_send_headers': True, # optional, default is False 156 | 'magic_response': False, # optional, default is True 157 | } 158 | }) 159 | 160 | Use the ``request.meta['splash']`` API in middlewares or when scrapy.Request 161 | subclasses are used (there is also ``SplashFormRequest`` described below). 162 | For example, ``meta['splash']`` makes it possible to create a middleware which enables 163 | Splash for all outgoing requests by default. 164 | 165 | ``SplashRequest`` is a convenient utility to fill ``request.meta['splash']``; 166 | it should be easier to use in most cases.
For each ``request.meta['splash']`` 167 | key there is a corresponding ``SplashRequest`` keyword argument: for example, 168 | to set ``meta['splash']['args']`` use ``SplashRequest(..., args=myargs)``. 169 | 170 | * ``meta['splash']['args']`` contains arguments sent to Splash. 171 | scrapy-splash adds some default keys/values to ``args``: 172 | 173 | * 'url' is set to request.url; 174 | * 'http_method' is set to 'POST' for POST requests; 175 | * 'body' is set to request.body for POST requests. 176 | 177 | You can override default values by setting them explicitly. 178 | 179 | Note that by default Scrapy escapes URL fragments using the AJAX escaping scheme. 180 | If you want to pass a URL with a fragment to Splash then set ``url`` 181 | in the ``args`` dict manually. This is handled automatically if you use 182 | ``SplashRequest``, but you need to keep that in mind if you use the raw 183 | ``meta['splash']`` API. 184 | 185 | Splash 1.8+ is required to handle POST requests; in earlier Splash versions 186 | 'http_method' and 'body' arguments are ignored. If you work with the ``/execute`` 187 | endpoint and want to support POST requests, you have to handle the 188 | ``http_method`` and ``body`` arguments in your Lua script manually. 189 | 190 | * ``meta['splash']['cache_args']`` is a list of argument names to cache 191 | on the Splash side. These arguments are sent to Splash only once, then cached 192 | values are used; this saves network traffic and decreases request 193 | queue disk usage. Use ``cache_args`` only for large arguments 194 | which don't change with each request; ``lua_source`` is a good candidate 195 | (if you don't use string formatting to build it). Splash 2.1+ is required 196 | for this feature to work. 197 | 198 | * ``meta['splash']['endpoint']`` is the Splash endpoint to use. 199 | In the case of SplashRequest, 200 | ``render.html`` 201 | is used by default. If you're using a raw scrapy.Request then 202 | ``render.json`` 203 | is the default (for historical reasons). It is better to always pass the endpoint 204 | explicitly. 205 | 206 | See the Splash `HTTP API docs`_ for a full list of available endpoints 207 | and parameters. 208 | 209 | .. _HTTP API docs: http://splash.readthedocs.org/en/latest/api.html 210 | 211 | * ``meta['splash']['splash_url']`` overrides the Splash URL set 212 | in ``settings.py``. 213 | 214 | * ``meta['splash']['splash_headers']`` makes it possible to add or change headers 215 | which are sent to the Splash server. Note that this option **is not** for 216 | setting headers which are sent to the remote website. 217 | 218 | * ``meta['splash']['slot_policy']`` customizes how 219 | concurrency & politeness are maintained for Splash requests. 220 | 221 | Currently there are 3 policies available: 222 | 223 | 1. ``scrapy_splash.SlotPolicy.PER_DOMAIN`` (default) - send Splash requests to 224 | downloader slots based on the URL being rendered. It is useful if you want 225 | to maintain per-domain politeness & concurrency settings. 226 | 227 | 2. ``scrapy_splash.SlotPolicy.SINGLE_SLOT`` - send all Splash requests to 228 | a single downloader slot. It is useful if you want to throttle requests 229 | to Splash. 230 | 231 | 3. ``scrapy_splash.SlotPolicy.SCRAPY_DEFAULT`` - don't do anything with slots. 232 | It is similar to the ``SINGLE_SLOT`` policy, but can be different if you access 233 | other services on the same address as Splash. 234 | 235 | * ``meta['splash']['dont_process_response']`` - when set to True, 236 | SplashMiddleware won't change the response to a custom scrapy.Response 237 | subclass.
By default for Splash requests one of SplashResponse, 238 | SplashTextResponse or SplashJsonResponse is passed to the callback. 239 | 240 | * ``meta['splash']['dont_send_headers']``: by default scrapy-splash passes 241 | request headers to Splash in 'headers' JSON POST field. For all render.xxx 242 | endpoints it means Scrapy header options are respected by default 243 | (http://splash.readthedocs.org/en/stable/api.html#arg-headers). In Lua 244 | scripts you can use ``headers`` argument of ``splash:go`` to apply the 245 | passed headers: ``splash:go{url, headers=splash.args.headers}``. 246 | 247 | Set 'dont_send_headers' to True if you don't want to pass ``headers`` 248 | to Splash. 249 | 250 | * ``meta['splash']['http_status_from_error_code']`` - set response.status 251 | to HTTP error code when ``assert(splash:go(..))`` fails; it requires 252 | ``meta['splash']['magic_response']=True``. ``http_status_from_error_code`` 253 | option is False by default if you use raw meta API; 254 | SplashRequest sets it to True by default. 255 | 256 | * ``meta['splash']['magic_response']`` - when set to True and a JSON 257 | response is received from Splash, several attributes of the response 258 | (headers, body, url, status code) are filled using data returned in JSON: 259 | 260 | * response.headers are filled from 'headers' keys; 261 | * response.url is set to the value of 'url' key; 262 | * response.body is set to the value of 'html' key, 263 | or to base64-decoded value of 'body' key; 264 | * response.status is set to the value of 'http_status' key. 265 | When ``meta['splash']['http_status_from_error_code']`` is True 266 | and ``assert(splash:go(..))`` fails with an HTTP error 267 | response.status is also set to HTTP error code. 268 | 269 | Original URL, status and headers are available as ``response.real_url``, 270 | ``response.splash_response_status`` and ``response.splash_response_headers``. 271 | 272 | This option is set to True by default if you use SplashRequest. 273 | ``render.json`` and ``execute`` endpoints may not have all the necessary 274 | keys/values in the response. 275 | For non-JSON endpoints, only url is filled, regardless of the 276 | ``magic_response`` setting. 277 | 278 | 279 | Use ``scrapy_splash.SplashFormRequest`` if you want to make a ``FormRequest`` 280 | via splash. It accepts the same arguments as ``SplashRequest``, 281 | and also ``formdata``, like ``FormRequest`` from scrapy:: 282 | 283 | >>> from scrapy_splash import SplashFormRequest 284 | >>> SplashFormRequest('http://example.com', formdata={'foo': 'bar'}) 285 | 286 | 287 | ``SplashFormRequest.from_response`` is also supported, and works as described 288 | in `scrapy documentation `_. 289 | 290 | Responses 291 | --------- 292 | 293 | scrapy-splash returns Response subclasses for Splash requests: 294 | 295 | * SplashResponse is returned for binary Splash responses - e.g. for 296 | /render.png responses; 297 | * SplashTextResponse is returned when the result is text - e.g. for 298 | /render.html responses; 299 | * SplashJsonResponse is returned when the result is a JSON object - e.g. 300 | for /render.json responses or /execute responses when script returns 301 | a Lua table. 302 | 303 | To use standard Response classes set ``meta['splash']['dont_process_response']=True`` 304 | or pass ``dont_process_response=True`` argument to SplashRequest. 305 | 306 | All these responses set ``response.url`` to the URL of the original request 307 | (i.e. 
to the URL of a website you want to render), not to the URL of the 308 | requested Splash endpoint. The "true" URL is still available as 309 | ``response.real_url``. 310 | 311 | SplashJsonResponse provides extra features: 312 | 313 | * the ``response.data`` attribute contains response data decoded from JSON; 314 | you can access it like ``response.data['html']``. 315 | 316 | * If Splash session handling is configured, you can access current cookies 317 | as ``response.cookiejar``; it is a CookieJar instance. 318 | 319 | * If Scrapy-Splash response magic is enabled for the request (the default), 320 | several response attributes (headers, body, url, status code) 321 | are set automatically from the original response body: 322 | 323 | * response.headers are filled from 'headers' keys; 324 | * response.url is set to the value of 'url' key; 325 | * response.body is set to the value of 'html' key, 326 | or to base64-decoded value of 'body' key; 327 | * response.status is set from the value of 'http_status' key. 328 | 329 | When ``response.body`` is updated in SplashJsonResponse 330 | (either from 'html' or from 'body' keys) the familiar ``response.css`` 331 | and ``response.xpath`` methods are available. 332 | 333 | To turn off special handling of JSON result keys either set 334 | ``meta['splash']['magic_response']=False`` or pass the ``magic_response=False`` 335 | argument to SplashRequest. 336 | 337 | Session Handling 338 | ================ 339 | 340 | Splash itself is stateless - each request starts from a clean state. 341 | In order to support sessions, the following is required: 342 | 343 | 1. client (Scrapy) must send current cookies to Splash; 344 | 2. Splash script should make requests using these cookies and update 345 | them from HTTP response headers or JavaScript code; 346 | 3. updated cookies should be sent back to the client; 347 | 4. client should merge current cookies with the updated cookies. 348 | 349 | For (2) and (3) Splash provides ``splash:get_cookies()`` and 350 | ``splash:init_cookies()`` methods which can be used in Splash Lua scripts. 351 | 352 | scrapy-splash provides helpers for (1) and (4): to send current cookies 353 | in the 'cookies' field and merge cookies back from the 'cookies' response field, 354 | set ``request.meta['splash']['session_id']`` to the session 355 | identifier. If you only want a single session, use the same ``session_id`` for 356 | all requests; any value like '1' or 'foo' is fine. 357 | 358 | For scrapy-splash session handling to work you must use the ``/execute`` endpoint 359 | and a Lua script which accepts a 'cookies' argument and returns a 'cookies' 360 | field in the result: 361 | 362 | .. code:: lua 363 | 364 | function main(splash) 365 | splash:init_cookies(splash.args.cookies) 366 | 367 | -- ... your script 368 | 369 | return { 370 | cookies = splash:get_cookies(), 371 | -- ... other results, e.g. html 372 | } 373 | end 374 | 375 | SplashRequest sets ``session_id`` automatically for the ``/execute`` endpoint, 376 | i.e. cookie handling is enabled by default if you use SplashRequest, 377 | the ``/execute`` endpoint and a compatible Lua rendering script. 378 | 379 | If you want to start from the same set of cookies, but then 'fork' sessions, 380 | set ``request.meta['splash']['new_session_id']`` in addition to 381 | ``session_id``. Request cookies will be fetched from the ``session_id`` cookiejar, 382 | but response cookies will be merged back into the ``new_session_id`` cookiejar.
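For illustration, here is a minimal sketch of such a session 'fork' using the raw ``meta['splash']`` API described above; it assumes ``script`` holds a cookie-aware Lua script like the one shown earlier, and the callback name and session identifiers ('1', 'fork-1') are arbitrary placeholders:

.. code:: python

    # A sketch: request cookies are read from the '1' cookiejar, while cookies
    # returned by Splash are merged back into the 'fork-1' cookiejar.
    yield scrapy.Request(url, self.parse_result, meta={
        'splash': {
            'endpoint': 'execute',
            'args': {'lua_source': script},  # assumed: accepts/returns 'cookies'
            'session_id': '1',
            'new_session_id': 'fork-1',
        }
    })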
383 | 384 | The standard Scrapy ``cookies`` argument can be used with ``SplashRequest`` 385 | to add cookies to the current Splash cookiejar. 386 | 387 | Examples 388 | ======== 389 | 390 | Get HTML contents: 391 | 392 | .. code:: python 393 | 394 | import scrapy 395 | from scrapy_splash import SplashRequest 396 | 397 | class MySpider(scrapy.Spider): 398 | name = "MySpider" 399 | start_urls = ["http://example.com", "http://example.com/foo"] 400 | 401 | def start_requests(self): 402 | for url in self.start_urls: 403 | yield SplashRequest(url, self.parse, args={'wait': 0.5}) 404 | 405 | def parse(self, response): 406 | # response.body is a result of render.html call; it 407 | # contains HTML processed by a browser. 408 | # ... 409 | 410 | Get HTML contents and a screenshot: 411 | 412 | .. code:: python 413 | 414 | import json 415 | import base64 416 | import scrapy 417 | from scrapy_splash import SplashRequest 418 | 419 | class MySpider(scrapy.Spider): 420 | 421 | # ... 422 | splash_args = { 423 | 'wait': 1, 424 | 'html': 1, 425 | 'png': 1, 426 | 'width': 600, 427 | 'render_all': 1, 428 | } 429 | yield SplashRequest(url, self.parse_result, endpoint='render.json', 430 | args=splash_args) 431 | 432 | # ... 433 | def parse_result(self, response): 434 | # magic responses are turned ON by default, 435 | # so the result under 'html' key is available as response.body 436 | html = response.body 437 | 438 | # you can also query the html result as usual 439 | title = response.css('title').extract_first() 440 | 441 | # full decoded JSON data is available as response.data: 442 | png_bytes = base64.b64decode(response.data['png']) 443 | 444 | # ... 445 | 446 | Run a simple `Splash Lua Script`_: 447 | 448 | .. code:: python 449 | 450 | import json 451 | import base64 452 | from scrapy_splash import SplashRequest 453 | 454 | 455 | class MySpider(scrapy.Spider): 456 | 457 | # ... 458 | script = """ 459 | function main(splash) 460 | assert(splash:go(splash.args.url)) 461 | return splash:evaljs("document.title") 462 | end 463 | """ 464 | yield SplashRequest(url, self.parse_result, endpoint='execute', 465 | args={'lua_source': script}) 466 | 467 | # ... 468 | def parse_result(self, response): 469 | doc_title = response.text 470 | # ... 471 | 472 | 473 | A more complex `Splash Lua Script`_ example - get a screenshot of an HTML 474 | element by its CSS selector (it requires Splash 2.1+). 475 | Note how arguments are passed to the script: 476 | 477 | .. code:: python 478 | 479 | import json 480 | import base64 481 | from scrapy_splash import SplashRequest 482 | 483 | script = """ 484 | -- Arguments: 485 | -- * url - URL to render; 486 | -- * css - CSS selector to render; 487 | -- * pad - screenshot padding size.
488 | 489 | -- this function adds padding around region 490 | function pad(r, pad) 491 | return {r[1]-pad, r[2]-pad, r[3]+pad, r[4]+pad} 492 | end 493 | 494 | -- main script 495 | function main(splash) 496 | 497 | -- this function returns element bounding box 498 | local get_bbox = splash:jsfunc([[ 499 | function(css) { 500 | var el = document.querySelector(css); 501 | var r = el.getBoundingClientRect(); 502 | return [r.left, r.top, r.right, r.bottom]; 503 | } 504 | ]]) 505 | 506 | assert(splash:go(splash.args.url)) 507 | assert(splash:wait(0.5)) 508 | 509 | -- don't crop image by a viewport 510 | splash:set_viewport_full() 511 | 512 | local region = pad(get_bbox(splash.args.css), splash.args.pad) 513 | return splash:png{region=region} 514 | end 515 | """ 516 | 517 | class MySpider(scrapy.Spider): 518 | 519 | 520 | # ... 521 | yield SplashRequest(url, self.parse_element_screenshot, 522 | endpoint='execute', 523 | args={ 524 | 'lua_source': script, 525 | 'pad': 32, 526 | 'css': 'a.title' 527 | } 528 | ) 529 | 530 | # ... 531 | def parse_element_screenshot(self, response): 532 | image_data = response.body # binary image data in PNG format 533 | # ... 534 | 535 | 536 | Use a Lua script to get an HTML response with cookies, headers, body 537 | and method set to correct values; ``lua_source`` argument value is cached 538 | on Splash server and is not sent with each request (it requires Splash 2.1+): 539 | 540 | .. code:: python 541 | 542 | import scrapy 543 | from scrapy_splash import SplashRequest 544 | 545 | script = """ 546 | function main(splash) 547 | splash:init_cookies(splash.args.cookies) 548 | assert(splash:go{ 549 | splash.args.url, 550 | headers=splash.args.headers, 551 | http_method=splash.args.http_method, 552 | body=splash.args.body, 553 | }) 554 | assert(splash:wait(0.5)) 555 | 556 | local entries = splash:history() 557 | local last_response = entries[#entries].response 558 | return { 559 | url = splash:url(), 560 | headers = last_response.headers, 561 | http_status = last_response.status, 562 | cookies = splash:get_cookies(), 563 | html = splash:html(), 564 | } 565 | end 566 | """ 567 | 568 | class MySpider(scrapy.Spider): 569 | 570 | 571 | # ... 572 | yield SplashRequest(url, self.parse_result, 573 | endpoint='execute', 574 | cache_args=['lua_source'], 575 | args={'lua_source': script}, 576 | headers={'X-My-Header': 'value'}, 577 | ) 578 | 579 | def parse_result(self, response): 580 | # here response.body contains result HTML; 581 | # response.headers are filled with headers from last 582 | # web page loaded to Splash; 583 | # cookies from all responses and from JavaScript are collected 584 | # and put into Set-Cookie response header, so that Scrapy 585 | # can remember them. 586 | 587 | 588 | 589 | .. _Splash Lua Script: http://splash.readthedocs.org/en/latest/scripting-tutorial.html 590 | 591 | 592 | HTTP Basic Auth 593 | =============== 594 | 595 | If you need to use HTTP Basic Authentication to access Splash, use the 596 | ``SPLASH_USER`` and ``SPLASH_PASS`` optional settings:: 597 | 598 | SPLASH_USER = 'user' 599 | SPLASH_PASS = 'userpass' 600 | 601 | Another option is ``meta['splash']['splash_headers']``: it allows to set 602 | custom headers which are sent to Splash server; add Authorization header 603 | to ``splash_headers`` if you want to change credentials per-request:: 604 | 605 | import scrapy 606 | from w3lib.http import basic_auth_header 607 | 608 | class MySpider(scrapy.Spider): 609 | # ... 
610 | def start_requests(self): 611 | auth = basic_auth_header('user', 'userpass') 612 | yield SplashRequest(url, self.parse, 613 | splash_headers={'Authorization': auth}) 614 | 615 | **WARNING:** Don't use `HttpAuthMiddleware`_ 616 | (i.e. ``http_user`` / ``http_pass`` spider attributes) for Splash 617 | authentication: if you ever send a non-Splash request from your spider, 618 | you may expose Splash credentials to a remote website, as HttpAuthMiddleware 619 | sets credentials for all requests unconditionally. 620 | 621 | .. _HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth 622 | 623 | Why not use the Splash HTTP API directly? 624 | ========================================= 625 | 626 | The obvious alternative to scrapy-splash would be to send requests directly 627 | to the Splash `HTTP API`_. Take a look at the example below and make 628 | sure to read the observations after it: 629 | 630 | .. code:: python 631 | 632 | import json 633 | 634 | import scrapy 635 | from scrapy.http.headers import Headers 636 | 637 | RENDER_HTML_URL = "http://127.0.0.1:8050/render.html" 638 | 639 | class MySpider(scrapy.Spider): 640 | start_urls = ["http://example.com", "http://example.com/foo"] 641 | 642 | def start_requests(self): 643 | for url in self.start_urls: 644 | body = json.dumps({"url": url, "wait": 0.5}, sort_keys=True) 645 | headers = Headers({'Content-Type': 'application/json'}) 646 | yield scrapy.Request(RENDER_HTML_URL, self.parse, method="POST", 647 | body=body, headers=headers) 648 | 649 | def parse(self, response): 650 | # response.body is a result of render.html call; it 651 | # contains HTML processed by a browser. 652 | # ... 653 | 654 | 655 | It works and is easy enough, but there are some issues that you should be 656 | aware of: 657 | 658 | 1. There is a bit of boilerplate. 659 | 660 | 2. As seen by Scrapy, we're sending requests to ``RENDER_HTML_URL`` instead 661 | of the target URLs. It affects concurrency and politeness settings: 662 | ``CONCURRENT_REQUESTS_PER_DOMAIN``, ``DOWNLOAD_DELAY``, etc. could behave 663 | in unexpected ways since delays and concurrency settings are no longer 664 | per-domain. 665 | 666 | 3. As seen by Scrapy, response.url is the URL of the Splash server. 667 | scrapy-splash fixes it to be the URL of the requested page. 668 | The "real" URL is still available as ``response.real_url``. scrapy-splash also 669 | makes it possible to handle ``response.status`` and ``response.headers`` transparently 670 | on the Scrapy side. 671 | 672 | 4. Some options depend on each other - for example, if you use the timeout_ 673 | Splash option then you may want to set the ``download_timeout`` 674 | scrapy.Request meta key as well. 675 | 676 | 5. It is easy to get it subtly wrong - e.g. if you don't use the 677 | ``sort_keys=True`` argument when preparing the JSON body, the binary POST body 678 | content could vary even if all keys and values are the same, which means 679 | the dupefilter and cache will work incorrectly. 680 | 681 | 6. The default Scrapy duplicate filter doesn't take Splash specifics into 682 | account. For example, if a URL is sent in a JSON POST request body, 683 | Scrapy will compute the request fingerprint without canonicalizing this URL. 684 | 685 | 7. Splash Bad Request (HTTP 400) errors are hard to debug because by default 686 | response content is not displayed by Scrapy. SplashMiddleware logs the content
SplashMiddleware logs content 687 | of HTTP 400 Splash responses by default (it can be turned off by setting 688 | ``SPLASH_LOG_400 = False`` option). 689 | 690 | 8. Cookie handling is tedious to implement, and you can't use Scrapy 691 | built-in Cookie middleware to handle cookies when working with Splash. 692 | 693 | 9. Large Splash arguments which don't change with every request 694 | (e.g. ``lua_source``) may take a lot of space when saved to Scrapy disk 695 | request queues. ``scrapy-splash`` provides a way to store such static 696 | parameters only once. 697 | 698 | 10. Splash 2.1+ provides a way to save network traffic by caching large 699 | static arguments on server, but it requires client support: client should 700 | send proper ``save_args`` and ``load_args`` values and handle HTTP 498 701 | responses. 702 | 703 | scrapy-splash utilities allow to handle such edge cases and reduce 704 | the boilerplate. 705 | 706 | .. _HTTP API: http://splash.readthedocs.org/en/latest/api.html 707 | .. _timeout: http://splash.readthedocs.org/en/latest/api.html#arg-timeout 708 | 709 | 710 | Getting help 711 | ============ 712 | 713 | * for problems with rendering pages read "`Splash FAQ`_" page 714 | * for Scrapy-related bugs take a look at "`reporting Scrapy bugs`_" page 715 | 716 | Best approach to get any other help is to ask a question on `Stack Overflow`_ 717 | 718 | .. _reporting Scrapy bugs: https://doc.scrapy.org/en/master/contributing.html#reporting-bugs 719 | .. _Splash FAQ: http://splash.readthedocs.io/en/stable/faq.html#website-is-not-rendered-correctly 720 | .. _Stack Overflow: https://stackoverflow.com/questions/tagged/scrapy-splash?sort=frequent&pageSize=15&mixed=1 721 | 722 | 723 | Contributing 724 | ============ 725 | 726 | Source code and bug tracker are on github: 727 | https://github.com/scrapy-plugins/scrapy-splash 728 | 729 | To run tests, install "tox" Python package and then run ``tox`` command 730 | from the source checkout. 
731 | 732 | To run integration tests, start Splash and set SPLASH_URL env variable 733 | to Splash address before running ``tox`` command:: 734 | 735 | docker run -d --rm -p8050:8050 scrapinghub/splash:3.0 736 | SPLASH_URL=http://127.0.0.1:8050 tox -e py36 737 | -------------------------------------------------------------------------------- /example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = scrashtest.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrashtest 12 | -------------------------------------------------------------------------------- /example/scrashtest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-splash/72a8788212746b938e1e4d45aad56ff27857924a/example/scrashtest/__init__.py -------------------------------------------------------------------------------- /example/scrashtest/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'scrashtest' 4 | 5 | SPIDER_MODULES = ['scrashtest.spiders'] 6 | NEWSPIDER_MODULE = 'scrashtest.spiders' 7 | 8 | DOWNLOADER_MIDDLEWARES = { 9 | # Engine side 10 | 'scrapy_splash.SplashCookiesMiddleware': 723, 11 | 'scrapy_splash.SplashMiddleware': 725, 12 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 13 | # Downloader side 14 | } 15 | 16 | SPIDER_MIDDLEWARES = { 17 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 18 | } 19 | SPLASH_URL = 'http://127.0.0.1:8050/' 20 | # SPLASH_URL = 'http://192.168.59.103:8050/' 21 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 22 | HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' 23 | ROBOTSTXT_OBEY = True -------------------------------------------------------------------------------- /example/scrashtest/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-splash/72a8788212746b938e1e4d45aad56ff27857924a/example/scrashtest/spiders/__init__.py -------------------------------------------------------------------------------- /example/scrashtest/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | 5 | from scrapy_splash import SplashRequest 6 | 7 | 8 | class QuotesSpider(scrapy.Spider): 9 | name = "quotes" 10 | allowed_domains = ["toscrape.com"] 11 | start_urls = ['http://quotes.toscrape.com/'] 12 | 13 | #custom_settings = { 14 | #'SPLASH_USER': 'splash-user', 15 | #'SPLASH_PASS': 'splash-password', 16 | #} 17 | 18 | def parse(self, response): 19 | le = LinkExtractor() 20 | for link in le.extract_links(response): 21 | yield SplashRequest( 22 | link.url, 23 | self.parse_link, 24 | endpoint='render.json', 25 | args={ 26 | 'har': 1, 27 | 'html': 1, 28 | } 29 | ) 30 | 31 | def parse_link(self, response): 32 | print("PARSED", response.real_url, response.url) 33 | print(response.css("title").extract()) 34 | print(response.data["har"]["log"]["pages"]) 35 | print(response.headers.get('Content-Type')) 36 | 
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.bumpversion] 2 | current_version = "0.11.1" 3 | commit = true 4 | tag = true 5 | tag_name = "{new_version}" 6 | 7 | [[tool.bumpversion.files]] 8 | filename = 'CHANGES.rst' 9 | search = "\\(unreleased\\)$" 10 | replace = "({now:%Y-%m-%d})" 11 | regex = true 12 | 13 | [[tool.bumpversion.files]] 14 | filename = "setup.py" 15 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | doctest_optionflags = NORMALIZE_WHITESPACE ALLOW_UNICODE 3 | -------------------------------------------------------------------------------- /scrapy_splash/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | from .middleware import ( 5 | SplashMiddleware, 6 | SplashCookiesMiddleware, 7 | SplashDeduplicateArgsMiddleware, 8 | SlotPolicy, 9 | ) 10 | from .dupefilter import SplashAwareDupeFilter, splash_request_fingerprint 11 | from .cache import SplashAwareFSCacheStorage 12 | from .response import SplashResponse, SplashTextResponse, SplashJsonResponse 13 | from .request import SplashRequest, SplashFormRequest, SplashRequestFingerprinter 14 | -------------------------------------------------------------------------------- /scrapy_splash/cache.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To handle "splash" Request meta key correctly when HTTP cache is enabled 4 | Scrapy needs a custom caching backed. 5 | 6 | See https://github.com/scrapy/scrapy/issues/900 for more info. 7 | """ 8 | from __future__ import absolute_import 9 | import os 10 | from warnings import warn 11 | 12 | from scrapy.extensions.httpcache import FilesystemCacheStorage 13 | 14 | from .dupefilter import splash_request_fingerprint 15 | 16 | 17 | class SplashAwareFSCacheStorage(FilesystemCacheStorage): 18 | def __init__(self, settings): 19 | warn( 20 | ( 21 | "scrapy-splash.SplashAwareFSCacheStorage is deprecated. Set " 22 | "the REQUEST_FINGERPRINTER_CLASS Scrapy setting to " 23 | "\"scrapy_splash.SplashRequestFingerprinter\" instead." 24 | ), 25 | DeprecationWarning, 26 | stacklevel=2, 27 | ) 28 | super().__init__(settings) 29 | 30 | def _get_request_path(self, spider, request): 31 | key = splash_request_fingerprint(request) 32 | return os.path.join(self.cachedir, spider.name, key[0:2], key) 33 | -------------------------------------------------------------------------------- /scrapy_splash/cookies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Cookie-related utilities. 4 | """ 5 | from __future__ import absolute_import 6 | import time 7 | import calendar 8 | 9 | from six.moves.http_cookiejar import CookieJar, Cookie 10 | 11 | 12 | def jar_to_har(cookiejar): 13 | """ Convert CookieJar to HAR cookies format """ 14 | return [cookie_to_har(c) for c in cookiejar] 15 | 16 | 17 | def har_to_jar(cookiejar, har_cookies, request_cookies=None): 18 | """ Add HAR cookies to the cookiejar. 19 | If request_cookies is given, remove cookies absent from har_cookies 20 | but present in request_cookies (they were removed). 
""" 21 | har_cookie_keys = set() 22 | for c in har_cookies: 23 | cookie = har_to_cookie(c) 24 | cookiejar.set_cookie(cookie) 25 | har_cookie_keys.add(_cookie_key(cookie)) 26 | if request_cookies: 27 | for c in request_cookies: 28 | cookie = har_to_cookie(c) 29 | if _cookie_key(cookie) not in har_cookie_keys: 30 | # We sent it but it did not come back: remove it 31 | try: 32 | cookiejar.clear(cookie.domain, cookie.path, cookie.name) 33 | except KeyError: 34 | pass # It could have been already removed 35 | 36 | 37 | def _cookie_key(cookie): 38 | return (cookie.domain, cookie.path, cookie.name) 39 | 40 | 41 | def har_to_cookie(har_cookie): 42 | """ 43 | Convert a cookie dict in HAR format to a Cookie instance. 44 | 45 | >>> har_cookie = { 46 | ... "name": "TestCookie", 47 | ... "value": "Cookie Value", 48 | ... "path": "/foo", 49 | ... "domain": "www.janodvarko.cz", 50 | ... "expires": "2009-07-24T19:20:30Z", 51 | ... "httpOnly": True, 52 | ... "secure": True, 53 | ... "comment": "this is a test" 54 | ... } 55 | >>> cookie = har_to_cookie(har_cookie) 56 | >>> cookie.name 57 | 'TestCookie' 58 | >>> cookie.value 59 | 'Cookie Value' 60 | >>> cookie.port 61 | >>> cookie.domain 62 | 'www.janodvarko.cz' 63 | >>> cookie.path 64 | '/foo' 65 | >>> cookie.secure 66 | True 67 | >>> cookie.expires 68 | 1248463230 69 | >>> cookie.comment 70 | 'this is a test' 71 | >>> cookie.get_nonstandard_attr('HttpOnly') 72 | True 73 | """ 74 | 75 | expires_timestamp = None 76 | if har_cookie.get('expires'): 77 | expires = time.strptime(har_cookie['expires'], "%Y-%m-%dT%H:%M:%SZ") 78 | expires_timestamp = calendar.timegm(expires) 79 | 80 | kwargs = dict( 81 | version=har_cookie.get('version') or 0, 82 | name=har_cookie['name'], 83 | value=har_cookie['value'], 84 | port=None, 85 | domain=har_cookie.get('domain', ''), 86 | path=har_cookie.get('path', '/'), 87 | secure=har_cookie.get('secure', False), 88 | expires=expires_timestamp, 89 | discard=False, 90 | comment=har_cookie.get('comment'), 91 | comment_url=bool(har_cookie.get('comment')), 92 | rest={'HttpOnly': har_cookie.get('httpOnly')}, 93 | rfc2109=False, 94 | ) 95 | kwargs['port_specified'] = bool(kwargs['port']) 96 | kwargs['domain_specified'] = bool(kwargs['domain']) 97 | kwargs['domain_initial_dot'] = kwargs['domain'].startswith('.') 98 | kwargs['path_specified'] = bool(kwargs['path']) 99 | return Cookie(**kwargs) 100 | 101 | 102 | def cookie_to_har(cookie): 103 | """ 104 | Convert a Cookie instance to a dict in HAR cookie format. 105 | """ 106 | c = { 107 | 'name': cookie.name, 108 | 'value': cookie.value, 109 | 'secure': cookie.secure, 110 | } 111 | if cookie.path_specified: 112 | c['path'] = cookie.path 113 | 114 | if cookie.domain_specified: 115 | c['domain'] = cookie.domain 116 | 117 | if cookie.expires: 118 | tm = time.gmtime(cookie.expires) 119 | c['expires'] = time.strftime("%Y-%m-%dT%H:%M:%SZ", tm) 120 | 121 | http_only = cookie.get_nonstandard_attr('HttpOnly') 122 | if http_only is not None: 123 | c['httpOnly'] = bool(http_only) 124 | 125 | if cookie.comment: 126 | c['comment'] = cookie.comment 127 | 128 | return c 129 | -------------------------------------------------------------------------------- /scrapy_splash/dupefilter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To handle "splash" Request meta key properly a custom DupeFilter must be set. 4 | See https://github.com/scrapy/scrapy/issues/900 for more info. 
5 | """ 6 | from __future__ import absolute_import, annotations 7 | from copy import deepcopy 8 | import hashlib 9 | from weakref import WeakKeyDictionary 10 | from warnings import warn 11 | 12 | from scrapy.dupefilters import RFPDupeFilter 13 | 14 | from scrapy.utils.python import to_bytes 15 | from scrapy.utils.url import canonicalize_url 16 | from scrapy.utils.request import RequestFingerprinterProtocol 17 | 18 | from .utils import dict_hash 19 | 20 | 21 | _deprecated_fingerprint_cache = WeakKeyDictionary() 22 | 23 | 24 | def _serialize_headers( 25 | headers, request 26 | ): 27 | for header in headers: 28 | if header in request.headers: 29 | yield header 30 | for value in request.headers.getlist(header): 31 | yield value 32 | 33 | 34 | # From https://docs.scrapy.org/en/2.11/_modules/scrapy/utils/request.html 35 | # Needs to be added here since it was deleted in Scrapy 2.12 36 | def request_fingerprint( 37 | request, 38 | include_headers=None, 39 | keep_fragments=False, 40 | ): 41 | """ 42 | Return the request fingerprint as a hexadecimal string. 43 | 44 | The request fingerprint is a hash that uniquely identifies the resource the 45 | request points to. For example, take the following two urls: 46 | 47 | http://www.example.com/query?id=111&cat=222 48 | http://www.example.com/query?cat=222&id=111 49 | 50 | Even though those are two different URLs, both point to the same resource 51 | and are equivalent (i.e. they should return the same response). 52 | 53 | Another example is cookies used to store session ids. Suppose the 54 | following page is only accessible to authenticated users: 55 | 56 | http://www.example.com/members/offers.html 57 | 58 | Lots of sites use a cookie to store the session id, which adds a random 59 | component to the HTTP Request and thus should be ignored when calculating 60 | the fingerprint. 61 | 62 | For this reason, request headers are ignored by default when calculating 63 | the fingerprint. If you want to include specific headers use the 64 | include_headers argument, which is a list of Request headers to include. 65 | 66 | Also, servers usually ignore fragments in urls when handling requests, 67 | so they are also ignored by default when calculating the fingerprint. 68 | If you want to include them, set the keep_fragments argument to True 69 | (for instance when handling requests with a headless browser). 70 | """ 71 | processed_include_headers = None 72 | if include_headers: 73 | processed_include_headers = tuple( 74 | to_bytes(h.lower()) for h in sorted(include_headers) 75 | ) 76 | cache = _deprecated_fingerprint_cache.setdefault(request, {}) 77 | cache_key = (processed_include_headers, keep_fragments) 78 | if cache_key not in cache: 79 | fp = hashlib.sha1() 80 | fp.update(to_bytes(request.method)) 81 | fp.update( 82 | to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)) 83 | ) 84 | fp.update(request.body or b"") 85 | if processed_include_headers: 86 | for part in _serialize_headers(processed_include_headers, request): 87 | fp.update(part) 88 | cache[cache_key] = fp.hexdigest() 89 | return cache[cache_key] 90 | 91 | 92 | def splash_request_fingerprint(request, include_headers=None): 93 | """ Request fingerprint which takes 'splash' meta key into account """ 94 | warn( 95 | ( 96 | "scrapy_splash.splash_request_fingerprint is deprecated. Set " 97 | "the REQUEST_FINGERPRINTER_CLASS Scrapy setting to " 98 | "\"scrapy_splash.SplashRequestFingerprinter\" instead."
99 | ), 100 | DeprecationWarning, 101 | stacklevel=2, 102 | ) 103 | 104 | fp = request_fingerprint(request, include_headers=include_headers) 105 | if 'splash' not in request.meta: 106 | return fp 107 | 108 | splash_options = deepcopy(request.meta['splash']) 109 | args = splash_options.setdefault('args', {}) 110 | 111 | if 'url' in args: 112 | args['url'] = canonicalize_url(args['url'], keep_fragments=True) 113 | 114 | return dict_hash(splash_options, fp) 115 | 116 | 117 | class SplashAwareDupeFilter(RFPDupeFilter): 118 | """ 119 | DupeFilter that takes 'splash' meta key in account. 120 | It should be used with SplashMiddleware. 121 | """ 122 | 123 | def __init__( 124 | self, 125 | path: str | None = None, 126 | debug: bool = False, 127 | *, 128 | fingerprinter: RequestFingerprinterProtocol | None = None 129 | ): 130 | warn( 131 | ( 132 | "SplashAwareDupeFilter is deprecated. Set " 133 | "the REQUEST_FINGERPRINTER_CLASS Scrapy setting to " 134 | "\"scrapy_splash.SplashRequestFingerprinter\" instead." 135 | ), 136 | DeprecationWarning, 137 | stacklevel=2, 138 | ) 139 | super().__init__(path, debug, fingerprinter=fingerprinter) 140 | 141 | def request_fingerprint(self, request): 142 | return splash_request_fingerprint(request) 143 | -------------------------------------------------------------------------------- /scrapy_splash/middleware.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import copy 5 | import json 6 | import logging 7 | import warnings 8 | from collections import defaultdict 9 | 10 | from six.moves.urllib.parse import urljoin 11 | from six.moves.http_cookiejar import CookieJar 12 | 13 | from w3lib.http import basic_auth_header 14 | import scrapy 15 | from scrapy.exceptions import NotConfigured, IgnoreRequest 16 | from scrapy.http.headers import Headers 17 | from scrapy.http.response.text import TextResponse 18 | from scrapy import signals 19 | from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware 20 | 21 | from scrapy_splash.responsetypes import responsetypes 22 | from scrapy_splash.cookies import jar_to_har, har_to_jar 23 | from scrapy_splash.utils import ( 24 | scrapy_headers_to_unicode_dict, 25 | json_based_hash, 26 | parse_x_splash_saved_arguments_header, 27 | ) 28 | from scrapy_splash.response import get_splash_status, get_splash_headers 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | class SlotPolicy(object): 35 | PER_DOMAIN = 'per_domain' 36 | SINGLE_SLOT = 'single_slot' 37 | SCRAPY_DEFAULT = 'scrapy_default' 38 | 39 | _known = {PER_DOMAIN, SINGLE_SLOT, SCRAPY_DEFAULT} 40 | 41 | 42 | class SplashCookiesMiddleware(object): 43 | """ 44 | This downloader middleware maintains cookiejars for Splash requests. 45 | 46 | It gets cookies from 'cookies' field in Splash JSON responses 47 | and sends current cookies in 'cookies' JSON POST argument instead of 48 | sending them in http headers. 49 | 50 | It should process requests before SplashMiddleware, and process responses 51 | after SplashMiddleware. 
52 | """ 53 | def __init__(self, debug=False): 54 | self.jars = defaultdict(CookieJar) 55 | self.debug = debug 56 | 57 | @classmethod 58 | def from_crawler(cls, crawler): 59 | return cls(debug=crawler.settings.getbool('SPLASH_COOKIES_DEBUG')) 60 | 61 | def process_request(self, request, spider): 62 | """ 63 | For Splash requests add 'cookies' key with current 64 | cookies to ``request.meta['splash']['args']`` and remove cookie 65 | headers sent to Splash itself. 66 | """ 67 | if 'splash' not in request.meta: 68 | return 69 | 70 | if request.meta.get('_splash_processed'): 71 | request.headers.pop('Cookie', None) 72 | return 73 | 74 | splash_options = request.meta['splash'] 75 | 76 | splash_args = splash_options.setdefault('args', {}) 77 | if 'cookies' in splash_args: # cookies already set 78 | return 79 | 80 | if 'session_id' not in splash_options: 81 | return 82 | 83 | jar = self.jars[splash_options['session_id']] 84 | 85 | cookies = self._get_request_cookies(request) 86 | har_to_jar(jar, cookies) 87 | 88 | splash_args['cookies'] = jar_to_har(jar) 89 | self._debug_cookie(request, spider) 90 | 91 | def process_response(self, request, response, spider): 92 | """ 93 | For Splash JSON responses add all cookies from 94 | 'cookies' in a response to the cookiejar. 95 | """ 96 | from scrapy_splash import SplashJsonResponse 97 | if not isinstance(response, SplashJsonResponse): 98 | return response 99 | 100 | if 'cookies' not in response.data: 101 | return response 102 | 103 | if 'splash' not in request.meta: 104 | return response 105 | 106 | if not request.meta.get('_splash_processed'): 107 | warnings.warn("SplashCookiesMiddleware requires SplashMiddleware") 108 | return response 109 | 110 | splash_options = request.meta['splash'] 111 | session_id = splash_options.get('new_session_id', 112 | splash_options.get('session_id')) 113 | if session_id is None: 114 | return response 115 | 116 | jar = self.jars[session_id] 117 | request_cookies = splash_options['args'].get('cookies', []) 118 | har_to_jar(jar, response.data['cookies'], request_cookies) 119 | self._debug_set_cookie(response, spider) 120 | response.cookiejar = jar 121 | return response 122 | 123 | def _get_request_cookies(self, request): 124 | if isinstance(request.cookies, dict): 125 | return [ 126 | {'name': k, 'value': v} for k, v in request.cookies.items() 127 | ] 128 | return request.cookies or [] 129 | 130 | def _debug_cookie(self, request, spider): 131 | if self.debug: 132 | cl = request.meta['splash']['args']['cookies'] 133 | if cl: 134 | cookies = '\n'.join( 135 | 'Cookie: {}'.format(self._har_repr(c)) for c in cl) 136 | msg = 'Sending cookies to: {}\n{}'.format(request, cookies) 137 | logger.debug(msg, extra={'spider': spider}) 138 | 139 | def _debug_set_cookie(self, response, spider): 140 | if self.debug: 141 | cl = response.data['cookies'] 142 | if cl: 143 | cookies = '\n'.join( 144 | 'Set-Cookie: {}'.format(self._har_repr(c)) for c in cl) 145 | msg = 'Received cookies from: {}\n{}'.format(response, cookies) 146 | logger.debug(msg, extra={'spider': spider}) 147 | 148 | @staticmethod 149 | def _har_repr(har_cookie): 150 | return '{}={}'.format(har_cookie['name'], har_cookie['value']) 151 | 152 | 153 | class SplashDeduplicateArgsMiddleware(object): 154 | """ 155 | Spider middleware which allows not to store duplicate Splash argument 156 | values in request queue. It works together with SplashMiddleware downloader 157 | middleware. 
158 | """ 159 | local_values_key = '_splash_local_values' 160 | 161 | def process_spider_output(self, response, result, spider): 162 | for el in result: 163 | if isinstance(el, scrapy.Request): 164 | yield self._process_request(el, spider) 165 | else: 166 | yield el 167 | 168 | def process_start_requests(self, start_requests, spider): 169 | if not hasattr(spider, 'state'): 170 | spider.state = {} 171 | spider.state.setdefault(self.local_values_key, {}) # fingerprint => value dict 172 | 173 | for req in start_requests: 174 | yield self._process_request(req, spider) 175 | 176 | def _process_request(self, request, spider): 177 | """ 178 | Replace requested meta['splash']['args'] values with their fingerprints. 179 | This allows to store values only once in request queue, which helps 180 | with disk queue size. 181 | 182 | Downloader middleware should restore the values from fingerprints. 183 | """ 184 | if 'splash' not in request.meta: 185 | return request 186 | 187 | if '_replaced_args' in request.meta['splash']: 188 | # don't process re-scheduled requests 189 | # XXX: does it work as expected? 190 | warnings.warn("Unexpected request.meta['splash']['_replaced_args']") 191 | return request 192 | 193 | request.meta['splash']['_replaced_args'] = [] 194 | cache_args = request.meta['splash'].get('cache_args', []) 195 | args = request.meta['splash'].setdefault('args', {}) 196 | 197 | for name in cache_args: 198 | if name not in args: 199 | continue 200 | value = args[name] 201 | fp = 'LOCAL+' + json_based_hash(value) 202 | spider.state[self.local_values_key][fp] = value 203 | args[name] = fp 204 | request.meta['splash']['_replaced_args'].append(name) 205 | 206 | return request 207 | 208 | 209 | class SplashMiddleware(object): 210 | """ 211 | Scrapy downloader and spider middleware that passes requests 212 | through Splash when 'splash' Request.meta key is set. 213 | 214 | This middleware also works together with SplashDeduplicateArgsMiddleware 215 | spider middleware to allow not to store duplicate Splash argument values 216 | in request queue and not to send them multiple times to Splash 217 | (the latter requires Splash 2.1+). 
218 | """ 219 | default_splash_url = 'http://127.0.0.1:8050' 220 | default_endpoint = "render.json" 221 | splash_extra_timeout = 5.0 222 | default_policy = SlotPolicy.PER_DOMAIN 223 | rescheduling_priority_adjust = +100 224 | retry_498_priority_adjust = +50 225 | remote_keys_key = '_splash_remote_keys' 226 | 227 | def __init__(self, crawler, splash_base_url, slot_policy, log_400, auth): 228 | self.crawler = crawler 229 | self.splash_base_url = splash_base_url 230 | self.slot_policy = slot_policy 231 | self.log_400 = log_400 232 | self.crawler.signals.connect(self.spider_opened, signals.spider_opened) 233 | self.auth = auth 234 | 235 | @classmethod 236 | def from_crawler(cls, crawler): 237 | s = crawler.settings 238 | splash_base_url = s.get('SPLASH_URL', cls.default_splash_url) 239 | log_400 = s.getbool('SPLASH_LOG_400', True) 240 | slot_policy = s.get('SPLASH_SLOT_POLICY', cls.default_policy) 241 | if slot_policy not in SlotPolicy._known: 242 | raise NotConfigured("Incorrect slot policy: %r" % slot_policy) 243 | 244 | splash_user = s.get('SPLASH_USER', '') 245 | splash_pass = s.get('SPLASH_PASS', '') 246 | auth = None 247 | if splash_user or splash_pass: 248 | auth = basic_auth_header(splash_user, splash_pass) 249 | return cls(crawler, splash_base_url, slot_policy, log_400, auth) 250 | 251 | def spider_opened(self, spider): 252 | if _http_auth_enabled(spider): 253 | replace_downloader_middleware(self.crawler, RobotsTxtMiddleware, 254 | SafeRobotsTxtMiddleware) 255 | if not hasattr(spider, 'state'): 256 | spider.state = {} 257 | 258 | # local fingerprint => key returned by splash 259 | spider.state.setdefault(self.remote_keys_key, {}) 260 | 261 | @property 262 | def _argument_values(self): 263 | key = SplashDeduplicateArgsMiddleware.local_values_key 264 | return self.crawler.spider.state[key] 265 | 266 | @property 267 | def _remote_keys(self): 268 | return self.crawler.spider.state[self.remote_keys_key] 269 | 270 | def process_request(self, request, spider): 271 | if 'splash' not in request.meta: 272 | return 273 | splash_options = request.meta['splash'] 274 | 275 | if request.method not in {'GET', 'POST'}: 276 | logger.error( 277 | "Currently only GET and POST requests are supported by " 278 | "SplashMiddleware; %(request)s is dropped", 279 | {'request': request}, 280 | extra={'spider': spider} 281 | ) 282 | self.crawler.stats.inc_value('splash/dropped/method/{}'.format( 283 | request.method)) 284 | raise IgnoreRequest("SplashRequest doesn't support " 285 | "HTTP {} method".format(request.method)) 286 | 287 | if request.meta.get("_splash_processed"): 288 | # don't process the same request more than once 289 | return 290 | 291 | request.meta['_splash_processed'] = True 292 | 293 | slot_policy = splash_options.get('slot_policy', self.slot_policy) 294 | self._set_download_slot(request, request.meta, slot_policy) 295 | 296 | args = splash_options.setdefault('args', {}) 297 | 298 | if '_replaced_args' in splash_options: 299 | # restore arguments before sending request to the downloader 300 | load_args = {} 301 | save_args = [] 302 | local_arg_fingerprints = {} 303 | for name in splash_options['_replaced_args']: 304 | fp = args[name] 305 | # Use remote Splash argument cache: if Splash key 306 | # for a value is known then don't send the value to Splash; 307 | # if it is unknown then try to save the value on server using 308 | # ``save_args``. 
309 | if fp in self._remote_keys: 310 | load_args[name] = self._remote_keys[fp] 311 | del args[name] 312 | else: 313 | save_args.append(name) 314 | args[name] = self._argument_values[fp] 315 | 316 | local_arg_fingerprints[name] = fp 317 | 318 | if load_args: 319 | args['load_args'] = load_args 320 | if save_args: 321 | args['save_args'] = save_args 322 | splash_options['_local_arg_fingerprints'] = local_arg_fingerprints 323 | 324 | del splash_options['_replaced_args'] # ?? 325 | 326 | args.setdefault('url', request.url) 327 | if request.method == 'POST': 328 | args.setdefault('http_method', request.method) 329 | # XXX: non-UTF8 request bodies are not supported now 330 | args.setdefault('body', request.body.decode('utf8')) 331 | 332 | if not splash_options.get('dont_send_headers'): 333 | headers = scrapy_headers_to_unicode_dict(request.headers) 334 | if headers: 335 | # Headers set by HttpAuthMiddleware should be used for Splash, 336 | # not for the remote website (backwards compatibility). 337 | if _http_auth_enabled(spider): 338 | headers.pop('Authorization', None) 339 | args.setdefault('headers', headers) 340 | 341 | body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4) 342 | # print(body) 343 | 344 | if 'timeout' in args: 345 | # User requested a Splash timeout explicitly. 346 | # 347 | # We can't catch a case when user requested `download_timeout` 348 | # explicitly because a default value for `download_timeout` 349 | # is set by DownloadTimeoutMiddleware. 350 | # 351 | # As user requested Splash timeout explicitly, we shouldn't change 352 | # it. Another reason not to change the requested Splash timeout is 353 | # because it may cause a validation error on the remote end. 354 | # 355 | # But we can change Scrapy `download_timeout`: increase 356 | # it when it's too small. Decreasing `download_timeout` is not 357 | # safe. 
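            # For example, args['timeout'] == 90 gives timeout_expected == 95
            # (90 + splash_extra_timeout); download_timeout is only ever raised
            # to that value, never lowered.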
358 | 359 | timeout_requested = float(args['timeout']) 360 | timeout_expected = timeout_requested + self.splash_extra_timeout 361 | 362 | # no timeout means infinite timeout 363 | timeout_current = request.meta.get('download_timeout', 1e6) 364 | 365 | if timeout_expected > timeout_current: 366 | request.meta['download_timeout'] = timeout_expected 367 | 368 | endpoint = splash_options.setdefault('endpoint', self.default_endpoint) 369 | splash_base_url = splash_options.get('splash_url', self.splash_base_url) 370 | splash_url = urljoin(splash_base_url, endpoint) 371 | 372 | headers = Headers({'Content-Type': 'application/json'}) 373 | if self.auth is not None: 374 | headers['Authorization'] = self.auth 375 | headers.update(splash_options.get('splash_headers', {})) 376 | new_request = request.replace( 377 | url=splash_url, 378 | method='POST', 379 | body=body, 380 | headers=headers, 381 | priority=request.priority + self.rescheduling_priority_adjust 382 | ) 383 | new_request.meta['dont_obey_robotstxt'] = True 384 | self.crawler.stats.inc_value('splash/%s/request_count' % endpoint) 385 | return new_request 386 | 387 | def process_response(self, request, response, spider): 388 | if not request.meta.get("_splash_processed"): 389 | return response 390 | 391 | splash_options = request.meta['splash'] 392 | if not splash_options: 393 | return response 394 | 395 | # update stats 396 | endpoint = splash_options['endpoint'] 397 | self.crawler.stats.inc_value( 398 | 'splash/%s/response_count/%s' % (endpoint, response.status) 399 | ) 400 | 401 | # handle save_args/load_args 402 | self._process_x_splash_saved_arguments(request, response) 403 | if get_splash_status(response) == 498: 404 | logger.debug("Got HTTP 498 response for {}; " 405 | "sending arguments again.".format(request), 406 | extra={'spider': spider}) 407 | return self._498_retry_request(request, response) 408 | 409 | if splash_options.get('dont_process_response', False): 410 | return response 411 | 412 | response = self._change_response_class(request, response) 413 | 414 | if self.log_400 and get_splash_status(response) == 400: 415 | self._log_400(request, response, spider) 416 | 417 | return response 418 | 419 | def _change_response_class(self, request, response): 420 | from scrapy_splash import SplashResponse, SplashTextResponse 421 | if not isinstance(response, (SplashResponse, SplashTextResponse)): 422 | # create a custom Response subclass based on response Content-Type 423 | # XXX: usually request is assigned to response only when all 424 | # downloader middlewares are executed. Here it is set earlier. 425 | # Does it have any negative consequences? 426 | respcls = responsetypes.from_args(headers=response.headers) 427 | if isinstance(response, TextResponse) and respcls is SplashResponse: 428 | # Even if the headers say it's binary, it has already 429 | # been detected as a text response by scrapy (for example 430 | # because it was decoded successfully), so we should not 431 | # convert it to SplashResponse. 
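                # (SplashResponseTypes in responsetypes.py maps e.g. 'text/html'
                # to SplashTextResponse and 'application/json' to
                # SplashJsonResponse; anything unrecognised falls back to
                # SplashResponse.)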
432 | respcls = SplashTextResponse 433 | response = response.replace(cls=respcls, request=request) 434 | return response 435 | 436 | def _log_400(self, request, response, spider): 437 | from scrapy_splash import SplashJsonResponse 438 | if isinstance(response, SplashJsonResponse): 439 | logger.warning( 440 | "Bad request to Splash: %s" % response.data, 441 | {'request': request}, 442 | extra={'spider': spider} 443 | ) 444 | 445 | def _process_x_splash_saved_arguments(self, request, response): 446 | """ Keep track of arguments saved by Splash. """ 447 | saved_args = get_splash_headers(response).get(b'X-Splash-Saved-Arguments') 448 | if not saved_args: 449 | return 450 | saved_args = parse_x_splash_saved_arguments_header(saved_args) 451 | arg_fingerprints = request.meta['splash']['_local_arg_fingerprints'] 452 | for name, key in saved_args.items(): 453 | fp = arg_fingerprints[name] 454 | self._remote_keys[fp] = key 455 | 456 | def _498_retry_request(self, request, response): 457 | """ 458 | Return a retry request for HTTP 498 responses. HTTP 498 means 459 | load_args are not present on server; client should retry the request 460 | with full argument values instead of their hashes. 461 | """ 462 | meta = copy.deepcopy(request.meta) 463 | local_arg_fingerprints = meta['splash']['_local_arg_fingerprints'] 464 | args = meta['splash']['args'] 465 | args.pop('load_args', None) 466 | args['save_args'] = list(local_arg_fingerprints.keys()) 467 | 468 | for name, fp in local_arg_fingerprints.items(): 469 | args[name] = self._argument_values[fp] 470 | # print('remote_keys before:', self._remote_keys) 471 | self._remote_keys.pop(fp, None) 472 | # print('remote_keys after:', self._remote_keys) 473 | 474 | body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4) 475 | # print(body) 476 | request = request.replace( 477 | meta=meta, 478 | body=body, 479 | priority=request.priority+self.retry_498_priority_adjust 480 | ) 481 | return request 482 | 483 | def _set_download_slot(self, request, meta, slot_policy): 484 | if slot_policy == SlotPolicy.PER_DOMAIN: 485 | # Use the same download slot to (sort of) respect download 486 | # delays and concurrency options. 487 | meta['download_slot'] = self._get_slot_key(request) 488 | 489 | elif slot_policy == SlotPolicy.SINGLE_SLOT: 490 | # Use a single slot for all Splash requests 491 | meta['download_slot'] = '__splash__' 492 | 493 | elif slot_policy == SlotPolicy.SCRAPY_DEFAULT: 494 | # Use standard Scrapy concurrency setup 495 | pass 496 | 497 | def _get_slot_key(self, request_or_response): 498 | return self.crawler.engine.downloader._get_slot_key( 499 | request_or_response, None 500 | ) 501 | 502 | 503 | class SafeRobotsTxtMiddleware(RobotsTxtMiddleware): 504 | def process_request(self, request, spider): 505 | # disable robots.txt for Splash requests 506 | if _http_auth_enabled(spider) and 'splash' in request.meta: 507 | return 508 | return super(SafeRobotsTxtMiddleware, self).process_request( 509 | request, spider) 510 | 511 | 512 | def _http_auth_enabled(spider): 513 | # FIXME: this function should always return False if HttpAuthMiddleware is 514 | # not in a middleware list. 
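    # E.g. a spider that sets http_user = 'user' and http_pass = 'userpass'
    # (see ScrapyAuthSpider in tests/test_integration.py) makes this return
    # a truthy value.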
515 | return getattr(spider, 'http_user', '') or getattr(spider, 'http_pass', '') 516 | 517 | 518 | def replace_downloader_middleware(crawler, old_cls, new_cls): 519 | """ Replace downloader middleware with another one """ 520 | try: 521 | new_mw = new_cls.from_crawler(crawler) 522 | except NotConfigured: 523 | return 524 | 525 | mw_manager = crawler.engine.downloader.middleware 526 | mw_manager.middlewares = tuple([ 527 | mw if mw.__class__ is not old_cls else new_mw 528 | for mw in mw_manager.middlewares 529 | ]) 530 | for method_name, callbacks in mw_manager.methods.items(): 531 | for idx, meth in enumerate(callbacks): 532 | method_cls = meth.__self__.__class__ 533 | if method_cls is old_cls: 534 | new_meth = getattr(new_mw, method_name) 535 | # logger.debug("{} is replaced with {}".format(meth, new_meth)) 536 | callbacks[idx] = new_meth 537 | -------------------------------------------------------------------------------- /scrapy_splash/request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import copy 4 | import scrapy 5 | from scrapy.http import FormRequest 6 | from scrapy.utils.url import canonicalize_url 7 | 8 | from scrapy_splash import SlotPolicy 9 | from scrapy_splash.utils import to_unicode, dict_hash 10 | from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS 11 | from scrapy.utils.misc import load_object 12 | 13 | try: 14 | from scrapy.utils.misc import build_from_crawler 15 | except ImportError: # Scrapy < 2.12 16 | from scrapy.utils.misc import create_instance 17 | 18 | def build_from_crawler(objcls, crawler, /, *args, **kwargs): 19 | return create_instance(objcls, None, crawler, *args, **kwargs) 20 | 21 | # XXX: we can't implement SplashRequest without middleware support 22 | # because there is no way to set Splash URL based on settings 23 | # from inside SplashRequest. 24 | 25 | 26 | class SplashRequest(scrapy.Request): 27 | """ 28 | scrapy.Request subclass which instructs Scrapy to render 29 | the page using Splash. 30 | 31 | It requires SplashMiddleware to work. 
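    A minimal usage sketch (mirroring the spiders in tests/test_integration.py;
    the Lua script and callback names are placeholders):

        yield SplashRequest(url, self.parse_result, endpoint='execute',
                            args={'lua_source': script, 'wait': 0.5})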
32 | """ 33 | def __init__(self, 34 | url=None, 35 | callback=None, 36 | method='GET', 37 | endpoint='render.html', 38 | args=None, 39 | splash_url=None, 40 | slot_policy=SlotPolicy.PER_DOMAIN, 41 | splash_headers=None, 42 | dont_process_response=False, 43 | dont_send_headers=False, 44 | magic_response=True, 45 | session_id='default', 46 | http_status_from_error_code=True, 47 | cache_args=None, 48 | meta=None, 49 | **kwargs): 50 | 51 | if url is None: 52 | url = 'about:blank' 53 | url = to_unicode(url) 54 | 55 | meta = copy.deepcopy(meta) or {} 56 | splash_meta = meta.setdefault('splash', {}) 57 | splash_meta.setdefault('endpoint', endpoint) 58 | splash_meta.setdefault('slot_policy', slot_policy) 59 | if splash_url is not None: 60 | splash_meta['splash_url'] = splash_url 61 | if splash_headers is not None: 62 | splash_meta['splash_headers'] = splash_headers 63 | if dont_process_response: 64 | splash_meta['dont_process_response'] = True 65 | else: 66 | splash_meta.setdefault('magic_response', magic_response) 67 | if dont_send_headers: 68 | splash_meta['dont_send_headers'] = True 69 | if http_status_from_error_code: 70 | splash_meta['http_status_from_error_code'] = True 71 | if cache_args is not None: 72 | splash_meta['cache_args'] = cache_args 73 | 74 | if session_id is not None: 75 | if splash_meta['endpoint'].strip('/') == 'execute': 76 | splash_meta.setdefault('session_id', session_id) 77 | 78 | _args = {'url': url} # put URL to args in order to preserve #fragment 79 | _args.update(args or {}) 80 | _args.update(splash_meta.get('args', {})) 81 | splash_meta['args'] = _args 82 | 83 | # This is not strictly required, but it strengthens Splash 84 | # requests against AjaxCrawlMiddleware 85 | meta['ajax_crawlable'] = True 86 | 87 | super(SplashRequest, self).__init__(url, callback, method, meta=meta, 88 | **kwargs) 89 | 90 | @property 91 | def _processed(self): 92 | return self.meta.get('_splash_processed') 93 | 94 | @property 95 | def _splash_args(self): 96 | return self.meta.get('splash', {}).get('args', {}) 97 | 98 | @property 99 | def _original_url(self): 100 | return self._splash_args.get('url') 101 | 102 | @property 103 | def _original_method(self): 104 | return self._splash_args.get('http_method', 'GET') 105 | 106 | def __repr__(self): 107 | if not self._processed: 108 | return super().__repr__() 109 | return "<%s %s via %s>" % (self._original_method, self._original_url, self.url) 110 | 111 | 112 | class SplashFormRequest(SplashRequest, FormRequest): 113 | """ 114 | Use SplashFormRequest if you want to make a FormRequest via splash. 115 | Accepts the same arguments as SplashRequest, and also formdata, 116 | like FormRequest. First, FormRequest is initialized, and then it's 117 | url, method and body are passed to SplashRequest. 118 | Note that FormRequest calls escape_ajax on url (via Request._set_url). 
119 | """ 120 | def __init__(self, url=None, callback=None, method=None, formdata=None, 121 | body=None, **kwargs): 122 | # First init FormRequest to get url, body and method 123 | if formdata: 124 | FormRequest.__init__( 125 | self, url=url, method=method, formdata=formdata) 126 | url, method, body = self.url, self.method, self.body 127 | # Then pass all other kwargs to SplashRequest 128 | SplashRequest.__init__( 129 | self, url=url, callback=callback, method=method, body=body, 130 | **kwargs) 131 | 132 | 133 | class SplashRequestFingerprinter: 134 | @classmethod 135 | def from_crawler(cls, crawler): 136 | return cls(crawler) 137 | 138 | def __init__(self, crawler): 139 | self._base_request_fingerprinter = build_from_crawler( 140 | load_object( 141 | crawler.settings.get( 142 | "SCRAPY_SPLASH_REQUEST_FINGERPRINTER_BASE_CLASS", 143 | REQUEST_FINGERPRINTER_CLASS, 144 | ) 145 | ), 146 | crawler, 147 | ) 148 | 149 | def fingerprint(self, request): 150 | """ Request fingerprint which takes 'splash' meta key into account """ 151 | 152 | fp = self._base_request_fingerprinter.fingerprint(request) 153 | if 'splash' not in request.meta: 154 | return fp 155 | 156 | splash_options = copy.deepcopy(request.meta['splash']) 157 | args = splash_options.setdefault('args', {}) 158 | 159 | if 'url' in args: 160 | args['url'] = canonicalize_url(args['url'], keep_fragments=True) 161 | 162 | return dict_hash(splash_options, fp).encode() 163 | -------------------------------------------------------------------------------- /scrapy_splash/response.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import json 5 | import base64 6 | import re 7 | from warnings import warn 8 | 9 | from scrapy.http import Response, TextResponse 10 | from scrapy import Selector 11 | 12 | from scrapy_splash.utils import headers_to_scrapy 13 | 14 | 15 | def get_splash_status(resp): 16 | return getattr(resp, 'splash_response_status', resp.status) 17 | 18 | 19 | def get_splash_headers(resp): 20 | return getattr(resp, 'splash_response_headers', resp.headers) 21 | 22 | 23 | class _SplashResponseMixin(object): 24 | """ 25 | This mixin fixes response.url and adds response.real_url 26 | """ 27 | def __init__(self, url, *args, **kwargs): 28 | real_url = kwargs.pop('real_url', None) 29 | if real_url is not None: 30 | self.real_url = real_url 31 | else: 32 | self.real_url = None 33 | # FIXME: create a .request @property with a setter? 34 | # Scrapy doesn't pass request to Response constructor; 35 | # it is worked around in SplashMiddleware. 36 | request = kwargs['request'] 37 | splash_args = self._splash_args(request) 38 | _url = splash_args.get('url') 39 | if _url is not None: 40 | self.real_url = url 41 | url = _url 42 | self.splash_response_status = kwargs.pop('splash_response_status', 43 | None) 44 | self.splash_response_headers = kwargs.pop('splash_response_headers', 45 | None) 46 | super(_SplashResponseMixin, self).__init__(url, *args, **kwargs) 47 | if self.splash_response_status is None: 48 | self.splash_response_status = self.status 49 | if self.splash_response_headers is None: 50 | self.splash_response_headers = self.headers.copy() 51 | 52 | def replace(self, *args, **kwargs): 53 | """Create a new Response with the same attributes except for those 54 | given new values. 
55 | """ 56 | for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 57 | 'real_url', 'splash_response_status', 58 | 'splash_response_headers']: 59 | kwargs.setdefault(x, getattr(self, x)) 60 | cls = kwargs.pop('cls', self.__class__) 61 | return cls(*args, **kwargs) 62 | 63 | def _splash_options(self, request=None): 64 | if request is None: 65 | request = self.request 66 | return request.meta.get("splash", {}) 67 | 68 | def _splash_args(self, request=None): 69 | return self._splash_options(request).get('args', {}) 70 | 71 | 72 | class SplashResponse(_SplashResponseMixin, Response): 73 | """ 74 | This Response subclass sets response.url to the URL of a remote website 75 | instead of an URL of Splash server. "Real" response URL is still available 76 | as ``response.real_url``. 77 | """ 78 | 79 | 80 | class SplashTextResponse(_SplashResponseMixin, TextResponse): 81 | """ 82 | This TextResponse subclass sets response.url to the URL of a remote website 83 | instead of an URL of Splash server. "Real" response URL is still available 84 | as ``response.real_url``. 85 | """ 86 | def replace(self, *args, **kwargs): 87 | kwargs.setdefault('encoding', self.encoding) 88 | return _SplashResponseMixin.replace(self, *args, **kwargs) 89 | 90 | 91 | class SplashJsonResponse(SplashResponse): 92 | """ 93 | Splash Response with JSON data. It provides a convenient way to access 94 | parsed JSON response using ``response.data`` attribute and exposes 95 | current Splash cookiejar when it is available. 96 | 97 | If Scrapy-Splash response magic is enabled in request 98 | (['splash']['magic_response'] is not False), several other response 99 | attributes (headers, body, url, status code) are set automatically: 100 | 101 | * response.url is set to the value of 'url' key, original url is 102 | available as ``responce.real_url``; 103 | * response.headers are filled from 'headers' keys; original headers are 104 | available as ``response.splash_response_headers``; 105 | * response.status is set from the value of 'http_status' key; original 106 | status is available as ``response.splash_response_status``; 107 | * response.body is set to the value of 'html' key, 108 | or to base64-decoded value of 'body' key; 109 | """ 110 | def __init__(self, *args, **kwargs): 111 | self.cookiejar = None 112 | self._cached_ubody = None 113 | self._cached_data = None 114 | self._cached_selector = None 115 | kwargs.pop('encoding', None) # encoding is always utf-8 116 | super(SplashJsonResponse, self).__init__(*args, **kwargs) 117 | 118 | # FIXME: it assumes self.request is set 119 | if self._splash_options().get('magic_response', True): 120 | self._load_from_json() 121 | 122 | @property 123 | def data(self): 124 | if self._cached_data is None: 125 | self._cached_data = json.loads(self._ubody) 126 | return self._cached_data 127 | 128 | @property 129 | def text(self): 130 | return self._ubody 131 | 132 | def body_as_unicode(self): 133 | warn( 134 | ( 135 | "The body_as_unicode() method is deprecated, use the text " 136 | "property instead." 
137 | ), 138 | DeprecationWarning, 139 | stacklevel=2, 140 | ) 141 | return self._ubody 142 | 143 | @property 144 | def _ubody(self): 145 | if self._cached_ubody is None: 146 | self._cached_ubody = self.body.decode(self.encoding) 147 | return self._cached_ubody 148 | 149 | @property 150 | def encoding(self): 151 | return 'utf8' 152 | 153 | @property 154 | def selector(self): 155 | if self._cached_selector is None: 156 | self._cached_selector = Selector(text=self.text, type='html') 157 | return self._cached_selector 158 | 159 | def xpath(self, query): 160 | return self.selector.xpath(query) 161 | 162 | def css(self, query): 163 | return self.selector.css(query) 164 | 165 | def _load_from_json(self): 166 | """ Fill response attributes from JSON results """ 167 | 168 | # response.status 169 | if 'http_status' in self.data: 170 | self.status = int(self.data['http_status']) 171 | elif self._splash_options().get('http_status_from_error_code', False): 172 | if 'error' in self.data: 173 | try: 174 | error = self.data['info']['error'] 175 | except KeyError: 176 | error = '' 177 | http_code_m = re.match(r'http(\d{3})', error) 178 | if http_code_m: 179 | self.status = int(http_code_m.group(1)) 180 | 181 | # response.url 182 | if 'url' in self.data: 183 | self._url = self.data['url'] 184 | 185 | # response.body 186 | if 'body' in self.data: 187 | self._body = base64.b64decode(self.data['body']) 188 | self._cached_ubody = self._body.decode(self.encoding) 189 | elif 'html' in self.data: 190 | self._cached_ubody = self.data['html'] 191 | self._body = self._cached_ubody.encode(self.encoding) 192 | self.headers[b"Content-Type"] = b"text/html; charset=utf-8" 193 | 194 | # response.headers 195 | if 'headers' in self.data: 196 | self.headers = headers_to_scrapy(self.data['headers']) 197 | -------------------------------------------------------------------------------- /scrapy_splash/responsetypes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | from scrapy.http import Response 5 | from scrapy.responsetypes import ResponseTypes 6 | 7 | import scrapy_splash 8 | 9 | 10 | class SplashResponseTypes(ResponseTypes): 11 | CLASSES = { 12 | 'text/html': 'scrapy_splash.response.SplashTextResponse', 13 | 'application/atom+xml': 'scrapy_splash.response.SplashTextResponse', 14 | 'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse', 15 | 'application/rss+xml': 'scrapy_splash.response.SplashTextResponse', 16 | 'application/xhtml+xml': 'scrapy_splash.response.SplashTextResponse', 17 | 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashTextResponse', 18 | 'application/xml': 'scrapy_splash.response.SplashTextResponse', 19 | 'application/json': 'scrapy_splash.response.SplashJsonResponse', 20 | 'application/x-json': 'scrapy_splash.response.SplashJsonResponse', 21 | 'application/javascript': 'scrapy_splash.response.SplashTextResponse', 22 | 'application/x-javascript': 'scrapy_splash.response.SplashTextResponse', 23 | 'text/xml': 'scrapy_splash.response.SplashTextResponse', 24 | 'text/*': 'scrapy_splash.response.SplashTextResponse', 25 | } 26 | 27 | def from_args(self, headers=None, url=None, filename=None, body=None): 28 | """Guess the most appropriate Response class based on 29 | the given arguments.""" 30 | cls = super(SplashResponseTypes, self).from_args( 31 | headers=headers, 32 | url=url, 33 | filename=filename, 34 | body=body 35 | ) 36 | if cls is Response: 37 | cls = 
scrapy_splash.SplashResponse 38 | return cls 39 | 40 | 41 | responsetypes = SplashResponseTypes() 42 | -------------------------------------------------------------------------------- /scrapy_splash/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import json 4 | import hashlib 5 | import six 6 | 7 | from scrapy.http import Headers 8 | from scrapy.utils.python import to_unicode, to_bytes 9 | 10 | 11 | def dict_hash(obj, start=''): 12 | """ Return a hash for a dict, based on its contents """ 13 | h = hashlib.sha1(to_bytes(start)) 14 | h.update(to_bytes(obj.__class__.__name__)) 15 | if isinstance(obj, dict): 16 | for key, value in sorted(obj.items()): 17 | h.update(to_bytes(key)) 18 | h.update(to_bytes(dict_hash(value))) 19 | elif isinstance(obj, (list, tuple)): 20 | for el in obj: 21 | h.update(to_bytes(dict_hash(el))) 22 | else: 23 | # basic types 24 | if isinstance(obj, bool): 25 | value = str(int(obj)) 26 | elif isinstance(obj, (six.integer_types, float)): 27 | value = str(obj) 28 | elif isinstance(obj, (six.text_type, bytes)): 29 | value = obj 30 | elif obj is None: 31 | value = b'' 32 | else: 33 | raise ValueError("Unsupported value type: %s" % obj.__class__) 34 | h.update(to_bytes(value)) 35 | return h.hexdigest() 36 | 37 | 38 | def _process(value, sha=False): 39 | if isinstance(value, (six.text_type, bytes)): 40 | if sha: 41 | return hashlib.sha1(to_bytes(value)).hexdigest() 42 | return 'h', hash(value) 43 | if isinstance(value, dict): 44 | return {_process(k, sha=True): _process(v, sha) for k, v in value.items()} 45 | if isinstance(value, (list, tuple)): 46 | return [_process(v, sha) for v in value] 47 | return value 48 | 49 | 50 | def _fast_hash(value): 51 | """ 52 | Return a hash for any JSON-serializable value. 53 | Hash is not guaranteed to be the same in different Python processes, 54 | but it is very fast to compute for data structures with large string 55 | values. 56 | """ 57 | return _json_based_hash(_process(value)) 58 | 59 | 60 | _hash_cache = {} # fast hash => hash 61 | def json_based_hash(value): 62 | """ 63 | Return a hash for any JSON-serializable value. 64 | 65 | >>> json_based_hash({"foo": "bar", "baz": [1, 2]}) 66 | '0570066939bea46c610bfdc35b20f37ef09d05ed' 67 | """ 68 | fp = _fast_hash(value) 69 | if fp not in _hash_cache: 70 | _hash_cache[fp] = _json_based_hash(_process(value, sha=True)) 71 | return _hash_cache[fp] 72 | 73 | 74 | def _json_based_hash(value): 75 | v = json.dumps(value, sort_keys=True, ensure_ascii=False).encode('utf8') 76 | return hashlib.sha1(v).hexdigest() 77 | 78 | 79 | def headers_to_scrapy(headers): 80 | """ 81 | Return scrapy.http.Headers instance from headers data. 82 | 3 data formats are supported: 83 | 84 | * {name: value, ...} dict; 85 | * [(name, value), ...] list; 86 | * [{'name': name, 'value': value'}, ...] list (HAR headers format). 87 | """ 88 | if isinstance(headers or {}, dict): 89 | return Headers(headers or {}) 90 | 91 | if isinstance(headers[0], dict): 92 | return Headers([ 93 | (d['name'], d.get('value', '')) 94 | for d in headers 95 | ]) 96 | 97 | return Headers(headers) 98 | 99 | 100 | def scrapy_headers_to_unicode_dict(headers): 101 | """ 102 | Convert scrapy.http.Headers instance to a dictionary 103 | suitable for JSON encoding. 
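    Multiple values for the same header are joined with a comma.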
104 | """ 105 | return { 106 | to_unicode(key): to_unicode(b','.join(value)) 107 | for key, value in headers.items() 108 | } 109 | 110 | 111 | def parse_x_splash_saved_arguments_header(value): 112 | """ 113 | Parse X-Splash-Saved-Arguments header value. 114 | 115 | >>> value = u"name1=9a6747fc6259aa374ab4e1bb03074b6ec672cf99;name2=ba001160ef96fe2a3f938fea9e6762e204a562b3" 116 | >>> dct = parse_x_splash_saved_arguments_header(value) 117 | >>> sorted(list(dct.keys())) 118 | ['name1', 'name2'] 119 | >>> dct['name1'] 120 | '9a6747fc6259aa374ab4e1bb03074b6ec672cf99' 121 | >>> dct['name2'] 122 | 'ba001160ef96fe2a3f938fea9e6762e204a562b3' 123 | 124 | Binary header values are also supported: 125 | >>> dct2 = parse_x_splash_saved_arguments_header(value.encode('utf8')) 126 | >>> dct2 == dct 127 | True 128 | """ 129 | value = to_unicode(value) 130 | return dict(kv.split('=', 1) for kv in value.split(";")) 131 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup 3 | 4 | setup( 5 | name='scrapy-splash', 6 | version='0.11.1', 7 | url='https://github.com/scrapy-plugins/scrapy-splash', 8 | description='JavaScript support for Scrapy using Splash', 9 | long_description=open('README.rst').read() + "\n\n" + open("CHANGES.rst").read(), 10 | author='Scrapy developers', 11 | maintainer='Mikhail Korobov', 12 | maintainer_email='kmike84@gmail.com', 13 | license='BSD', 14 | packages=['scrapy_splash'], 15 | zip_safe=False, 16 | classifiers=[ 17 | 'Development Status :: 4 - Beta', 18 | 'License :: OSI Approved :: BSD License', 19 | 'Programming Language :: Python', 20 | 'Programming Language :: Python :: 2', 21 | 'Programming Language :: Python :: 2.7', 22 | 'Programming Language :: Python :: 3', 23 | 'Programming Language :: Python :: 3.4', 24 | 'Programming Language :: Python :: 3.5', 25 | 'Programming Language :: Python :: 3.6', 26 | 'Framework :: Scrapy', 27 | 'Intended Audience :: Developers', 28 | 'Operating System :: OS Independent', 29 | 'Topic :: Internet :: WWW/HTTP', 30 | 'Topic :: Software Development :: Libraries :: Application Frameworks', 31 | 'Topic :: Software Development :: Libraries :: Python Modules', 32 | ], 33 | install_requires=['scrapy>=2.4', 'six'], 34 | ) 35 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from .mockserver import MockServer 5 | from .resources import SplashProtected 6 | 7 | 8 | @pytest.fixture() 9 | def settings(): 10 | """ Default scrapy-splash settings """ 11 | s = dict( 12 | # collect scraped items to .collected_items attribute 13 | ITEM_PIPELINES={ 14 | 'tests.utils.CollectorPipeline': 100, 15 | }, 16 | 17 | # scrapy-splash settings 18 | SPLASH_URL=os.environ.get('SPLASH_URL'), 19 | DOWNLOADER_MIDDLEWARES={ 20 | # Engine side 21 | 
'scrapy_splash.SplashCookiesMiddleware': 723, 22 | 'scrapy_splash.SplashMiddleware': 725, 23 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 24 | # Downloader side 25 | }, 26 | SPIDER_MIDDLEWARES={ 27 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 28 | }, 29 | REQUEST_FINGERPRINTER_CLASS='scrapy_splash.SplashRequestFingerprinter', 30 | ) 31 | return s 32 | 33 | 34 | @pytest.fixture() 35 | def settings_auth(settings): 36 | with MockServer(SplashProtected) as s: 37 | print("splash url:", s.root_url) 38 | settings['SPLASH_URL'] = s.root_url 39 | yield settings 40 | -------------------------------------------------------------------------------- /tests/mockserver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse, socket, sys, time 3 | from subprocess import Popen, PIPE 4 | from importlib import import_module 5 | 6 | from twisted.internet import reactor 7 | from twisted.web.server import Site 8 | 9 | 10 | def get_ephemeral_port(): 11 | s = socket.socket() 12 | s.bind(("", 0)) 13 | return s.getsockname()[1] 14 | 15 | 16 | class MockServer(): 17 | def __init__(self, resource, port=None): 18 | self.resource = '{}.{}'.format(resource.__module__, resource.__name__) 19 | self.proc = None 20 | host = socket.gethostbyname(socket.gethostname()) 21 | self.port = port or get_ephemeral_port() 22 | self.root_url = 'http://%s:%d' % (host, self.port) 23 | 24 | def __enter__(self): 25 | self.proc = Popen( 26 | [sys.executable, '-u', '-m', 'tests.mockserver', 27 | self.resource, '--port', str(self.port)], 28 | stdout=PIPE) 29 | self.proc.stdout.readline() 30 | return self 31 | 32 | def __exit__(self, exc_type, exc_value, traceback): 33 | self.proc.kill() 34 | self.proc.wait() 35 | time.sleep(0.2) 36 | 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('resource') 41 | parser.add_argument('--port', type=int) 42 | args = parser.parse_args() 43 | module_name, name = args.resource.rsplit('.', 1) 44 | sys.path.append('.') 45 | resource = getattr(import_module(module_name), name)() 46 | http_port = reactor.listenTCP(args.port, Site(resource)) 47 | def print_listening(): 48 | host = http_port.getHost() 49 | print('Mock server {} running at http://{}:{}'.format( 50 | resource, host.host, host.port)) 51 | reactor.callWhenRunning(print_listening) 52 | reactor.run() 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /tests/resources.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from six.moves.urllib.parse import urlparse 4 | 5 | from twisted.web.resource import Resource 6 | from zope.interface import implementer 7 | from twisted.web import resource, guard, proxy 8 | from twisted.cred.portal import IRealm, Portal 9 | from twisted.cred.checkers import InMemoryUsernamePasswordDatabaseDontUse 10 | 11 | from scrapy_splash.utils import to_bytes 12 | 13 | 14 | class HtmlResource(Resource): 15 | isLeaf = True 16 | content_type = 'text/html' 17 | html = '' 18 | extra_headers = {} 19 | status_code = 200 20 | 21 | def render_GET(self, request): 22 | request.setHeader(b'content-type', to_bytes(self.content_type)) 23 | for name, value in self.extra_headers.items(): 24 | request.setHeader(to_bytes(name), to_bytes(value)) 25 | request.setResponseCode(self.status_code) 26 | return to_bytes(self.html) 27 | 28 | 
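# The resources below subclass HtmlResource and only override `html`,
# `extra_headers` or `status_code`; render_GET above does the actual serving.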
29 | class HelloWorld(HtmlResource): 30 | html = """ 31 | 32 | """ 33 | extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'} 34 | 35 | 36 | class HelloWorldDisallowByRobots(HelloWorld): 37 | """ Disallow itself via robots.txt """ 38 | isLeaf = False 39 | 40 | def getChild(self, name, request): 41 | if name == b"robots.txt": 42 | return self.RobotsTxt() 43 | return self 44 | 45 | class RobotsTxt(Resource): 46 | isLeaf = True 47 | def render_GET(self, request): 48 | return b'User-Agent: *\nDisallow: /\n' 49 | 50 | 51 | class HelloWorldDisallowAuth(HelloWorldDisallowByRobots): 52 | """ Disallow itself via robots.txt if a request to robots.txt 53 | contains basic auth header. """ 54 | class RobotsTxt(HelloWorldDisallowByRobots.RobotsTxt): 55 | def render_GET(self, request): 56 | if request.requestHeaders.hasHeader('Authorization'): 57 | return super(HelloWorldDisallowAuth.RobotsTxt, self).render_GET(request) 58 | request.setResponseCode(404) 59 | return b'' 60 | 61 | 62 | class Http400Resource(HtmlResource): 63 | status_code = 400 64 | html = "Website returns HTTP 400 error" 65 | 66 | 67 | class ManyCookies(Resource, object): 68 | class SetMyCookie(HtmlResource): 69 | html = "hello!" 70 | extra_headers = {'Set-Cookie': 'login=1'} 71 | 72 | def __init__(self): 73 | super(ManyCookies, self).__init__() 74 | self.putChild(b'', HelloWorld()) 75 | self.putChild(b'login', self.SetMyCookie()) 76 | 77 | 78 | def splash_proxy(): 79 | splash_url = os.environ.get('SPLASH_URL') 80 | p = urlparse(splash_url) 81 | return lambda: proxy.ReverseProxyResource(p.hostname, int(p.port), b'') 82 | 83 | 84 | def password_protected(resource_cls, username, password): 85 | # Sorry, but this is nuts. A zillion of classes, arbitrary 86 | # unicode / bytes requirements at random places. Is there a simpler 87 | # way to get HTTP Basic Auth working in Twisted? 
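    # Usage (see the bottom of this module): password_protected(HelloWorld,
    # 'user', b'userpass') returns a zero-argument factory that builds the
    # guarded resource.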
88 | @implementer(IRealm) 89 | class SimpleRealm(object): 90 | def requestAvatar(self, avatarId, mind, *interfaces): 91 | if resource.IResource in interfaces: 92 | return resource.IResource, resource_cls(), lambda: None 93 | raise NotImplementedError() 94 | 95 | creds = {username: password} 96 | checkers = [InMemoryUsernamePasswordDatabaseDontUse(**creds)] 97 | return lambda: guard.HTTPAuthSessionWrapper( 98 | Portal(SimpleRealm(), checkers), 99 | [guard.BasicCredentialFactory(b'example.com')]) 100 | 101 | 102 | HelloWorldProtected = password_protected(HelloWorld, 'user', b'userpass') 103 | HelloWorldProtected.__name__ = 'HelloWorldProtected' 104 | HelloWorldProtected.__module__ = __name__ 105 | 106 | SplashProtected = password_protected(splash_proxy(), 'user', b'userpass') 107 | SplashProtected.__name__ = 'SplashProtected' 108 | SplashProtected.__module__ = __name__ 109 | -------------------------------------------------------------------------------- /tests/test_cookies.py: -------------------------------------------------------------------------------- 1 | from scrapy_splash.cookies import har_to_cookie, cookie_to_har 2 | 3 | 4 | # See also doctests in scrapy_splash.cookies module 5 | 6 | 7 | def test_cookie_to_har(): 8 | har_cookie = { 9 | "name": "TestCookie", 10 | "value": "Cookie Value", 11 | "path": "/foo", 12 | "domain": "www.janodvarko.cz", 13 | "expires": "2009-07-24T19:20:30Z", 14 | "httpOnly": True, 15 | "secure": True, 16 | "comment": "this is a test" 17 | } 18 | assert cookie_to_har(har_to_cookie(har_cookie)) == har_cookie 19 | cookie = har_to_cookie(har_cookie) 20 | assert vars(cookie) == vars(har_to_cookie(cookie_to_har(cookie))) 21 | -------------------------------------------------------------------------------- /tests/test_fingerprints.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from copy import deepcopy 4 | 5 | import pytest 6 | import scrapy 7 | 8 | from scrapy_splash import SplashRequest 9 | from scrapy_splash.dupefilter import request_fingerprint, splash_request_fingerprint 10 | from scrapy_splash.utils import dict_hash 11 | 12 | from .test_middleware import _get_mw 13 | from .utils import make_crawler 14 | from scrapy_splash.request import SplashRequestFingerprinter 15 | 16 | 17 | def test_dict_hash(): 18 | h1 = dict_hash({"foo": "bar", "bar": "baz"}) 19 | h2 = dict_hash({"foo": "bar", "bar": "baz"}) 20 | assert h1 == h2 21 | 22 | h3 = dict_hash({"egg": "spam"}) 23 | assert h3 != h2 24 | 25 | 26 | def test_dict_hash_nested(): 27 | h1 = dict_hash({"foo": "bar", "bar": {"baz": "spam"}}) 28 | h2 = dict_hash({"foo": "bar", "bar": {"baz": "spam"}}) 29 | assert h1 == h2 30 | 31 | h3 = dict_hash({"foo": "bar", "bar": {"baz": "egg"}}) 32 | h4 = dict_hash({"foo": "bar", "bar": {"bam": "spam"}}) 33 | assert h3 != h2 34 | assert h4 != h2 35 | 36 | 37 | def test_dict_hash_non_strings(): 38 | h1 = dict_hash({"foo": "bar", "float": 1.1, "int": 2, "bool": False, 39 | "seq": ["x", "y", (2, 3.7, {"x": 5, "y": [6, 7]})]}) 40 | h2 = dict_hash({"foo": "bar", "float": 1.2, "int": 2, "bool": False}) 41 | assert h1 != h2 42 | 43 | 44 | def test_dict_hash_invalid(): 45 | with pytest.raises(ValueError): 46 | dict_hash({"foo": scrapy}) 47 | 48 | 49 | def test_request_fingerprint_nosplash(): 50 | r1 = scrapy.Request("http://example.com") 51 | r2 = scrapy.Request("http://example.com", meta={"foo": "bar"}) 52 | assert request_fingerprint(r1) == splash_request_fingerprint(r1) 53 | 
assert request_fingerprint(r1) == request_fingerprint(r2) 54 | assert request_fingerprint(r1) == splash_request_fingerprint(r2) 55 | 56 | 57 | def assert_fingerprints_match(r1, r2): 58 | assert splash_request_fingerprint(r1) == splash_request_fingerprint(r2) 59 | 60 | 61 | def assert_fingerprints_dont_match(r1, r2): 62 | assert splash_request_fingerprint(r1) != splash_request_fingerprint(r2) 63 | 64 | 65 | def test_request_fingerprint_splash(): 66 | r1 = scrapy.Request("http://example.com") 67 | r2 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1}}}) 68 | r3 = scrapy.Request("http://example.com", meta={"splash": {"args": {"png": 1}}}) 69 | r4 = scrapy.Request("http://example.com", meta={"foo": "bar", "splash": {"args": {"html": 1}}}) 70 | r5 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1, "wait": 1.0}}}) 71 | 72 | assert request_fingerprint(r1) == request_fingerprint(r2) 73 | assert_fingerprints_dont_match(r1, r2) 74 | assert_fingerprints_dont_match(r1, r3) 75 | assert_fingerprints_dont_match(r1, r4) 76 | assert_fingerprints_dont_match(r1, r5) 77 | assert_fingerprints_dont_match(r2, r3) 78 | 79 | # only "splash" contents is taken into account 80 | assert_fingerprints_match(r2, r4) 81 | 82 | 83 | def assert_fingerprints_match_fingerprinter(fingerprinter, r1, r2): 84 | assert fingerprinter.fingerprint(r1) == fingerprinter.fingerprint(r2) 85 | 86 | 87 | def assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r2): 88 | assert fingerprinter.fingerprint(r1) != fingerprinter.fingerprint(r2) 89 | 90 | 91 | class TestSpider(scrapy.Spider): 92 | name = 'test_spider' 93 | 94 | 95 | def test_splash_request_fingerprinter(): 96 | crawler = make_crawler(TestSpider, {}) 97 | fingerprinter = SplashRequestFingerprinter(crawler) 98 | 99 | r1 = scrapy.Request("http://example.com") 100 | r2 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1}}}) 101 | r3 = scrapy.Request("http://example.com", meta={"splash": {"args": {"png": 1}}}) 102 | r4 = scrapy.Request("http://example.com", meta={"foo": "bar", "splash": {"args": {"html": 1}}}) 103 | r5 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1, "wait": 1.0}}}) 104 | 105 | assert request_fingerprint(r1) == request_fingerprint(r2) 106 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r2) 107 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r3) 108 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r4) 109 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r5) 110 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r2, r3) 111 | 112 | # only "splash" contents is taken into account 113 | assert_fingerprints_match_fingerprinter(fingerprinter, r2, r4) 114 | 115 | 116 | @pytest.fixture() 117 | def splash_middleware(): 118 | return _get_mw() 119 | 120 | 121 | @pytest.fixture 122 | def splash_mw_process(splash_middleware): 123 | def _process(r): 124 | r_copy = r.replace(meta=deepcopy(r.meta)) 125 | return splash_middleware.process_request(r_copy, None) or r 126 | return _process 127 | 128 | 129 | @pytest.fixture() 130 | def requests(): 131 | url1 = "http://example.com/foo?x=1&y=2" 132 | url2 = "http://example.com/foo?y=2&x=1" 133 | url3 = "http://example.com/foo?x=1&y=2&z=3" 134 | url4 = "http://example.com/foo?x=1&y=2#id2" 135 | url5 = "http://example.com/foo?x=1&y=2#!id2" 136 | request_kwargs = [ 137 | dict(url=url1), # 0 138 | dict(url=url1, method='POST'), # 1 139 | dict(url=url1, 
endpoint='render.har'), # 2 140 | dict(url=url2), # 3 141 | dict(url=url1, args={'wait': 0.5}), # 4 142 | dict(url=url2, args={'wait': 0.5}), # 5 143 | dict(url=url3), # 6 144 | dict(url=url2, method='POST'), # 7 145 | dict(args={'wait': 0.5}), # 8 146 | dict(args={'wait': 0.5}), # 9 147 | dict(args={'wait': 0.7}), # 10 148 | dict(url=url4), # 11 149 | ] 150 | splash_requests = [SplashRequest(**kwargs) for kwargs in request_kwargs] 151 | scrapy_requests = [ 152 | scrapy.Request(url=url1), # 12 153 | scrapy.Request(url=url2), # 13 154 | scrapy.Request(url=url4), # 14 155 | scrapy.Request(url=url5), # 15 156 | ] 157 | return splash_requests + scrapy_requests 158 | 159 | 160 | @pytest.mark.parametrize(["i", "dupe_indices"], [ 161 | (0, {3}), 162 | (1, {7}), 163 | (2, set()), 164 | (3, {0}), 165 | (4, {5}), 166 | (5, {4}), 167 | (6, set()), 168 | (7, {1}), 169 | (8, {9}), 170 | (9, {8}), 171 | (10, set()), 172 | (11, set()), 173 | (12, {13, 14}), 174 | (13, {12, 14}), 175 | (14, {13, 12}), 176 | (15, set()), 177 | ]) 178 | def test_duplicates(i, dupe_indices, requests, splash_mw_process): 179 | def assert_not_filtered(r1, r2): 180 | assert_fingerprints_dont_match(r1, r2) 181 | assert_fingerprints_dont_match( 182 | splash_mw_process(r1), 183 | splash_mw_process(r2), 184 | ) 185 | 186 | def assert_filtered(r1, r2): 187 | # request is filtered if it is filtered either 188 | # before rescheduling or after 189 | fp1 = splash_request_fingerprint(r1) 190 | fp2 = splash_request_fingerprint(r2) 191 | if fp1 != fp2: 192 | assert_fingerprints_match( 193 | splash_mw_process(r1), 194 | splash_mw_process(r2), 195 | ) 196 | 197 | dupe_indices = set(dupe_indices) 198 | dupe_indices.add(i) 199 | non_dupe_indices = set(range(len(requests))) - dupe_indices 200 | 201 | for j in dupe_indices: 202 | assert_filtered(requests[i], requests[j]) 203 | for j in non_dupe_indices: 204 | assert_not_filtered(requests[i], requests[j]) 205 | -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | import scrapy 4 | from pkg_resources import parse_version 5 | from pytest_twisted import inlineCallbacks 6 | from w3lib.url import canonicalize_url 7 | from w3lib.http import basic_auth_header 8 | 9 | from scrapy_splash import SplashRequest 10 | from .utils import crawl_items, requires_splash 11 | from .resources import ( 12 | HelloWorld, 13 | Http400Resource, 14 | ManyCookies, 15 | HelloWorldProtected, 16 | HelloWorldDisallowByRobots, 17 | HelloWorldDisallowAuth, 18 | ) 19 | 20 | 21 | DEFAULT_SCRIPT = """ 22 | function main(splash) 23 | splash:init_cookies(splash.args.cookies) 24 | splash:go{ 25 | splash.args.url, 26 | headers=splash.args.headers, 27 | http_method=splash.args.http_method, 28 | body=splash.args.body, 29 | } 30 | local wait = 0.01 31 | if splash.args.wait ~= nil then 32 | wait = splash.args.wait 33 | end 34 | assert(splash:wait(wait)) 35 | 36 | local entries = splash:history() 37 | local last_response = entries[#entries].response 38 | return { 39 | url = splash:url(), 40 | headers = last_response.headers, 41 | http_status = last_response.status, 42 | cookies = splash:get_cookies(), 43 | html = splash:html(), 44 | args = splash.args, 45 | jsvalue = splash:evaljs("1+2"), 46 | } 47 | end 48 | """ 49 | 50 | 51 | class ResponseSpider(scrapy.Spider): 52 | """ Make a request to URL, return Scrapy response """ 53 | custom_settings = { 54 | 
'HTTPERROR_ALLOW_ALL': True, 55 | 'ROBOTSTXT_OBEY': True, 56 | } 57 | url = None 58 | 59 | def start_requests(self): 60 | yield SplashRequest(self.url) 61 | 62 | def parse(self, response): 63 | yield {'response': response} 64 | 65 | 66 | class LuaSpider(ResponseSpider): 67 | """ Make a request to URL using default Lua script """ 68 | headers = None 69 | splash_headers = None 70 | 71 | def start_requests(self): 72 | yield SplashRequest(self.url, 73 | endpoint='execute', 74 | args={'lua_source': DEFAULT_SCRIPT}, 75 | headers=self.headers, 76 | splash_headers=self.splash_headers) 77 | 78 | 79 | class ScrapyAuthSpider(LuaSpider): 80 | """ Spider with incorrect (old, insecure) auth method """ 81 | http_user = 'user' 82 | http_pass = 'userpass' 83 | http_auth_domain = None 84 | 85 | 86 | class NonSplashSpider(ResponseSpider): 87 | """ Spider which uses HTTP auth and doesn't use Splash """ 88 | http_user = 'user' 89 | http_pass = 'userpass' 90 | http_auth_domain = None 91 | 92 | def start_requests(self): 93 | yield scrapy.Request(self.url) 94 | 95 | 96 | def assert_single_response(items): 97 | assert len(items) == 1 98 | return items[0]['response'] 99 | 100 | 101 | @requires_splash 102 | @inlineCallbacks 103 | def test_basic(settings): 104 | items, url, crawler = yield crawl_items(ResponseSpider, HelloWorld, 105 | settings) 106 | resp = assert_single_response(items) 107 | assert resp.url == url 108 | assert resp.css('body::text').extract_first().strip() == "hello world!" 109 | 110 | 111 | @requires_splash 112 | @inlineCallbacks 113 | def test_reload(settings): 114 | 115 | class ReloadSpider(ResponseSpider): 116 | """ Make two requests to URL, store both responses. 117 | This spider activates both start_requests and parse methods, 118 | and checks that dupefilter takes fragment into account. """ 119 | 120 | def parse(self, response): 121 | yield {'response': response} 122 | yield SplashRequest(self.url + '#foo') 123 | 124 | items, url, crawler = yield crawl_items(ReloadSpider, HelloWorld, settings) 125 | assert len(items) == 2 126 | assert crawler.stats.get_value('dupefilter/filtered') == 1 127 | resp = items[0]['response'] 128 | assert resp.url == url 129 | assert resp.css('body::text').extract_first().strip() == "hello world!" 130 | assert resp.status == resp.splash_response_status == 200 131 | assert resp.headers == resp.splash_response_headers 132 | assert resp.splash_response_headers['Content-Type'] == b"text/html; charset=utf-8" 133 | 134 | resp2 = items[1]['response'] 135 | assert resp2.body == resp.body 136 | assert resp2 is not resp 137 | assert resp2.url == resp.url + "#foo" 138 | 139 | 140 | @requires_splash 141 | @inlineCallbacks 142 | def test_basic_lua(settings): 143 | 144 | class LuaScriptSpider(ResponseSpider): 145 | """ Make a request using a Lua script similar to the one from README 146 | """ 147 | def start_requests(self): 148 | yield SplashRequest(self.url + "#foo", endpoint='execute', 149 | args={'lua_source': DEFAULT_SCRIPT, 'foo': 'bar'}) 150 | 151 | 152 | items, url, crawler = yield crawl_items(LuaScriptSpider, HelloWorld, 153 | settings) 154 | resp = assert_single_response(items) 155 | assert resp.url == url + "/#foo" 156 | assert resp.status == resp.splash_response_status == 200 157 | assert resp.css('body::text').extract_first().strip() == "hello world!" 
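    # resp is a SplashJsonResponse: .data exposes the table returned by
    # DEFAULT_SCRIPT, where jsvalue comes from splash:evaljs("1+2")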
158 | assert resp.data['jsvalue'] == 3 159 | assert resp.headers['X-MyHeader'] == b'my value' 160 | assert resp.headers['Content-Type'] == b'text/html' 161 | assert resp.splash_response_headers['Content-Type'] == b'application/json' 162 | assert resp.data['args']['foo'] == 'bar' 163 | 164 | 165 | @requires_splash 166 | @inlineCallbacks 167 | def test_bad_request(settings): 168 | class BadRequestSpider(ResponseSpider): 169 | def start_requests(self): 170 | yield SplashRequest(self.url, endpoint='execute', 171 | args={'lua_source': DEFAULT_SCRIPT, 'wait': 'bar'}) 172 | 173 | items, url, crawler = yield crawl_items(BadRequestSpider, HelloWorld, 174 | settings) 175 | resp = assert_single_response(items) 176 | assert resp.status == 400 177 | assert resp.splash_response_status == 400 178 | 179 | items, url, crawler = yield crawl_items(LuaSpider, Http400Resource, 180 | settings) 181 | resp = assert_single_response(items) 182 | assert resp.status == 400 183 | assert resp.splash_response_status == 200 184 | 185 | 186 | @requires_splash 187 | @inlineCallbacks 188 | def test_cache_args(settings): 189 | 190 | class CacheArgsSpider(ResponseSpider): 191 | def _request(self, url): 192 | return SplashRequest(url, endpoint='execute', 193 | args={'lua_source': DEFAULT_SCRIPT, 'x': 'yy'}, 194 | cache_args=['lua_source']) 195 | 196 | def start_requests(self): 197 | yield self._request(self.url) 198 | 199 | def parse(self, response): 200 | yield {'response': response} 201 | yield self._request(self.url + "#foo") 202 | 203 | 204 | items, url, crawler = yield crawl_items(CacheArgsSpider, HelloWorld, 205 | settings) 206 | assert len(items) == 2 207 | resp = items[0]['response'] 208 | assert b"function main(splash)" in resp.request.body 209 | assert b"yy" in resp.request.body 210 | print(resp.body, resp.request.body) 211 | 212 | resp = items[1]['response'] 213 | assert b"function main(splash)" not in resp.request.body 214 | assert b"yy" in resp.request.body 215 | print(resp.body, resp.request.body) 216 | 217 | 218 | @requires_splash 219 | @inlineCallbacks 220 | def test_cookies(settings): 221 | 222 | # 64K for headers is over Twisted limit, 223 | # so if these headers are sent to Splash request would fail. 224 | BOMB = 'x' * 64000 225 | 226 | class LuaScriptSpider(ResponseSpider): 227 | """ Cookies must be sent to website, not to Splash """ 228 | custom_settings = { 229 | 'SPLASH_COOKIES_DEBUG': True, 230 | 'COOKIES_DEBUG': True, 231 | } 232 | 233 | def start_requests(self): 234 | # cookies set without Splash should be still 235 | # sent to a remote website. FIXME: this is not the case. 236 | yield scrapy.Request(self.url + "/login", self.parse, 237 | cookies={'x-set-scrapy': '1'}) 238 | 239 | def parse(self, response): 240 | yield SplashRequest(self.url + "#egg", self.parse_1, 241 | endpoint='execute', 242 | args={'lua_source': DEFAULT_SCRIPT}, 243 | cookies={'x-set-splash': '1'}) 244 | 245 | def parse_1(self, response): 246 | yield {'response': response} 247 | yield SplashRequest(self.url + "#foo", self.parse_2, 248 | endpoint='execute', 249 | args={'lua_source': DEFAULT_SCRIPT}) 250 | 251 | def parse_2(self, response): 252 | yield {'response': response} 253 | yield scrapy.Request(self.url, self.parse_3) 254 | 255 | def parse_3(self, response): 256 | # Splash (Twisted) drops requests with huge http headers, 257 | # but this one should work, as cookies are not sent 258 | # to Splash itself. 
259 | yield {'response': response} 260 | yield SplashRequest(self.url + "#bar", self.parse_4, 261 | endpoint='execute', 262 | args={'lua_source': DEFAULT_SCRIPT}, 263 | cookies={'bomb': BOMB}) 264 | 265 | def parse_4(self, response): 266 | yield {'response': response} 267 | 268 | 269 | def _cookie_dict(har_cookies): 270 | return {c['name']: c['value'] for c in har_cookies} 271 | 272 | items, url, crawler = yield crawl_items(LuaScriptSpider, ManyCookies, 273 | settings) 274 | assert len(items) == 4 275 | 276 | # cookie should be sent to remote website, not to Splash 277 | resp = items[0]['response'] 278 | splash_request_headers = resp.request.headers 279 | cookies = resp.data['args']['cookies'] 280 | print(splash_request_headers) 281 | print(cookies) 282 | assert _cookie_dict(cookies) == { 283 | # 'login': '1', # FIXME 284 | 'x-set-splash': '1' 285 | } 286 | assert splash_request_headers.get(b'Cookie') is None 287 | 288 | # new cookie should be also sent to remote website, not to Splash 289 | resp2 = items[1]['response'] 290 | splash_request_headers = resp2.request.headers 291 | headers = resp2.data['args']['headers'] 292 | cookies = resp2.data['args']['cookies'] 293 | assert canonicalize_url(headers['Referer']) == canonicalize_url(url) 294 | assert _cookie_dict(cookies) == { 295 | # 'login': '1', 296 | 'x-set-splash': '1', 297 | 'sessionid': 'ABCD' 298 | } 299 | print(splash_request_headers) 300 | print(headers) 301 | print(cookies) 302 | assert splash_request_headers.get(b'Cookie') is None 303 | 304 | # TODO/FIXME: Cookies fetched when working with Splash should be picked up 305 | # by Scrapy 306 | resp3 = items[2]['response'] 307 | splash_request_headers = resp3.request.headers 308 | cookie_header = splash_request_headers.get(b'Cookie') 309 | assert b'x-set-scrapy=1' in cookie_header 310 | assert b'login=1' in cookie_header 311 | assert b'x-set-splash=1' in cookie_header 312 | # assert b'sessionid=ABCD' in cookie_header # FIXME 313 | 314 | # cookie bomb shouldn't cause problems 315 | resp4 = items[3]['response'] 316 | splash_request_headers = resp4.request.headers 317 | cookies = resp4.data['args']['cookies'] 318 | assert _cookie_dict(cookies) == { 319 | # 'login': '1', 320 | 'x-set-splash': '1', 321 | 'sessionid': 'ABCD', 322 | 'bomb': BOMB, 323 | } 324 | assert splash_request_headers.get(b'Cookie') is None 325 | 326 | 327 | @requires_splash 328 | @inlineCallbacks 329 | def test_access_http_auth(settings): 330 | # website is protected 331 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 332 | settings) 333 | response = assert_single_response(items) 334 | assert response.status == 401 335 | assert response.splash_response_status == 200 336 | 337 | # header can be used to access it 338 | AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')} 339 | kwargs = {'headers': AUTH_HEADERS} 340 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 341 | settings, kwargs) 342 | response = assert_single_response(items) 343 | assert 'hello' in response.text 344 | assert response.status == 200 345 | assert response.splash_response_status == 200 346 | 347 | 348 | @requires_splash 349 | @inlineCallbacks 350 | def test_protected_splash_no_auth(settings_auth): 351 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, 352 | settings_auth) 353 | response = assert_single_response(items) 354 | assert 'Unauthorized' in response.text 355 | assert 'hello' not in response.text 356 | assert response.status == 401 357 | assert 
response.splash_response_status == 401 358 | 359 | 360 | @requires_splash 361 | @inlineCallbacks 362 | def test_protected_splash_manual_headers_auth(settings_auth): 363 | AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')} 364 | kwargs = {'splash_headers': AUTH_HEADERS} 365 | 366 | # auth via splash_headers should work 367 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, 368 | settings_auth, kwargs) 369 | response = assert_single_response(items) 370 | assert 'hello' in response.text 371 | assert response.status == 200 372 | assert response.splash_response_status == 200 373 | 374 | # but only for Splash, not for a remote website 375 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 376 | settings_auth, kwargs) 377 | response = assert_single_response(items) 378 | assert 'hello' not in response.text 379 | assert response.status == 401 380 | assert response.splash_response_status == 200 381 | 382 | 383 | @requires_splash 384 | @inlineCallbacks 385 | def test_protected_splash_settings_auth(settings_auth): 386 | settings_auth['SPLASH_USER'] = 'user' 387 | settings_auth['SPLASH_PASS'] = 'userpass' 388 | 389 | # settings works 390 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, 391 | settings_auth) 392 | response = assert_single_response(items) 393 | assert 'Unauthorized' not in response.text 394 | assert 'hello' in response.text 395 | assert response.status == 200 396 | assert response.splash_response_status == 200 397 | 398 | # they can be overridden via splash_headers 399 | bad_auth = {'splash_headers': {'Authorization': 'foo'}} 400 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, 401 | settings_auth, bad_auth) 402 | response = assert_single_response(items) 403 | assert response.status == 401 404 | assert response.splash_response_status == 401 405 | 406 | # auth error on remote website 407 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 408 | settings_auth) 409 | response = assert_single_response(items) 410 | assert response.status == 401 411 | assert response.splash_response_status == 200 412 | 413 | # auth both for Splash and for the remote website 414 | REMOTE_AUTH = {'Authorization': basic_auth_header('user', 'userpass')} 415 | remote_auth_kwargs = {'headers': REMOTE_AUTH} 416 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 417 | settings_auth, remote_auth_kwargs) 418 | response = assert_single_response(items) 419 | assert response.status == 200 420 | assert response.splash_response_status == 200 421 | assert 'hello' in response.text 422 | 423 | # enable remote auth, but not splash auth - request should fail 424 | del settings_auth['SPLASH_USER'] 425 | del settings_auth['SPLASH_PASS'] 426 | items, url, crawler = yield crawl_items(LuaSpider, 427 | HelloWorldProtected, 428 | settings_auth, remote_auth_kwargs) 429 | response = assert_single_response(items) 430 | assert response.status == 401 431 | assert response.splash_response_status == 401 432 | 433 | 434 | @requires_splash 435 | @inlineCallbacks 436 | def test_protected_splash_httpauth_middleware(settings_auth): 437 | # httpauth middleware should enable auth for Splash, for backwards 438 | # compatibility reasons 439 | items, url, crawler = yield crawl_items(ScrapyAuthSpider, HelloWorld, 440 | settings_auth) 441 | response = assert_single_response(items) 442 | assert 'Unauthorized' not in response.text 443 | assert 'hello' in response.text 444 | assert response.status == 200 445 | assert 
response.splash_response_status == 200 446 | 447 | # but not for a remote website 448 | items, url, crawler = yield crawl_items(ScrapyAuthSpider, 449 | HelloWorldProtected, 450 | settings_auth) 451 | response = assert_single_response(items) 452 | assert 'hello' not in response.text 453 | assert response.status == 401 454 | assert response.splash_response_status == 200 455 | 456 | # headers shouldn't be sent to robots.txt file 457 | items, url, crawler = yield crawl_items(ScrapyAuthSpider, 458 | HelloWorldDisallowAuth, 459 | settings_auth) 460 | response = assert_single_response(items) 461 | assert 'hello' in response.text 462 | assert response.status == 200 463 | assert response.splash_response_status == 200 464 | 465 | # httpauth shouldn't be disabled for non-Splash requests 466 | items, url, crawler = yield crawl_items(NonSplashSpider, 467 | HelloWorldProtected, 468 | settings_auth) 469 | response = assert_single_response(items) 470 | assert 'hello' in response.text 471 | assert response.status == 200 472 | assert not hasattr(response, 'splash_response_status') 473 | 474 | 475 | @pytest.mark.xfail( 476 | parse_version(scrapy.__version__) < parse_version("1.1"), 477 | reason="https://github.com/scrapy/scrapy/issues/1471", 478 | strict=True, 479 | run=True, 480 | ) 481 | @requires_splash 482 | @inlineCallbacks 483 | def test_robotstxt_can_work(settings_auth): 484 | 485 | def assert_robots_disabled(items): 486 | response = assert_single_response(items) 487 | assert response.status == response.splash_response_status == 200 488 | assert b'hello' in response.body 489 | 490 | def assert_robots_enabled(items, crawler): 491 | assert len(items) == 0 492 | assert crawler.stats.get_value('downloader/exception_type_count/scrapy.exceptions.IgnoreRequest') == 1 493 | 494 | def _crawl_items(spider, resource): 495 | return crawl_items( 496 | spider, 497 | resource, 498 | settings_auth, 499 | url_path='/', # https://github.com/scrapy/protego/issues/17 500 | ) 501 | 502 | # when old auth method is used, robots.txt should be disabled 503 | items, url, crawler = yield _crawl_items(ScrapyAuthSpider, 504 | HelloWorldDisallowByRobots) 505 | assert_robots_disabled(items) 506 | 507 | # but robots.txt should still work for non-Splash requests 508 | items, url, crawler = yield _crawl_items(NonSplashSpider, 509 | HelloWorldDisallowByRobots) 510 | assert_robots_enabled(items, crawler) 511 | 512 | # robots.txt should work when a proper auth method is used 513 | settings_auth['SPLASH_USER'] = 'user' 514 | settings_auth['SPLASH_PASS'] = 'userpass' 515 | items, url, crawler = yield _crawl_items(LuaSpider, 516 | HelloWorldDisallowByRobots) 517 | assert_robots_enabled(items, crawler) 518 | 519 | # disable robotstxt middleware - robots middleware shouldn't work 520 | class DontObeyRobotsSpider(LuaSpider): 521 | custom_settings = { 522 | 'HTTPERROR_ALLOW_ALL': True, 523 | 'ROBOTSTXT_OBEY': False, 524 | } 525 | items, url, crawler = yield _crawl_items(DontObeyRobotsSpider, 526 | HelloWorldDisallowByRobots) 527 | assert_robots_disabled(items) 528 | 529 | # disable robotstxt middleware via request meta 530 | class MetaDontObeyRobotsSpider(ResponseSpider): 531 | def start_requests(self): 532 | yield SplashRequest(self.url, 533 | endpoint='execute', 534 | meta={'dont_obey_robotstxt': True}, 535 | args={'lua_source': DEFAULT_SCRIPT}) 536 | 537 | items, url, crawler = yield _crawl_items(MetaDontObeyRobotsSpider, 538 | HelloWorldDisallowByRobots) 539 | assert_robots_disabled(items) 540 | 
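# --- Editor's note (not part of the original test module) ---------------------
# The integration spiders above drive Splash through the ``execute`` endpoint,
# passing a Lua script as ``lua_source``.  Below is a minimal sketch of the same
# pattern in an ordinary project spider; the spider name, start URL and the
# simplified Lua script are illustrative assumptions, not code from this repo.

import scrapy
from scrapy_splash import SplashRequest

EXAMPLE_LUA = """
function main(splash)
    assert(splash:go(splash.args.url))
    assert(splash:wait(splash.args.wait))
    return {html = splash:html(), url = splash:url()}
end
"""

class ExampleLuaSpider(scrapy.Spider):
    name = "example_lua"                           # hypothetical name
    start_urls = ["http://quotes.toscrape.com/"]   # hypothetical target

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, endpoint="execute",
                                args={"lua_source": EXAMPLE_LUA, "wait": 0.5})

    def parse(self, response):
        # response is a SplashJsonResponse: ``response.data`` holds the Lua
        # return table and, with the default magic_response=True, the response
        # body/text are rebuilt from data["html"], so CSS selectors work.
        yield {"url": response.data["url"],
               "title": response.css("title::text").get()}
# ------------------------------------------------------------------------------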
-------------------------------------------------------------------------------- /tests/test_middleware.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import copy 4 | import json 5 | import base64 6 | 7 | import scrapy 8 | from scrapy.core.engine import ExecutionEngine 9 | from scrapy.utils.test import get_crawler 10 | from scrapy.http import Response, TextResponse, JsonResponse 11 | from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware 12 | 13 | import scrapy_splash 14 | from scrapy_splash.utils import to_unicode 15 | from scrapy_splash import ( 16 | SplashRequest, 17 | SplashMiddleware, 18 | SlotPolicy, 19 | SplashCookiesMiddleware, 20 | SplashDeduplicateArgsMiddleware, 21 | ) 22 | 23 | 24 | def _get_crawler(settings_dict): 25 | settings_dict = settings_dict.copy() 26 | settings_dict['DOWNLOAD_HANDLERS'] = {'s3': None} # for faster test running 27 | crawler = get_crawler(settings_dict=settings_dict) 28 | if not hasattr(crawler, 'logformatter'): 29 | crawler.logformatter = None 30 | crawler.engine = ExecutionEngine(crawler, lambda _: None) 31 | # spider = crawler._create_spider("foo") 32 | return crawler 33 | 34 | 35 | def _get_mw(settings_dict=None): 36 | crawler = _get_crawler(settings_dict or {}) 37 | return SplashMiddleware.from_crawler(crawler) 38 | 39 | 40 | def _get_cookie_mw(): 41 | return SplashCookiesMiddleware(debug=True) 42 | 43 | 44 | def test_nosplash(): 45 | mw = _get_mw() 46 | cookie_mw = _get_cookie_mw() 47 | req = scrapy.Request("http://example.com") 48 | old_meta = copy.deepcopy(req.meta) 49 | 50 | assert cookie_mw.process_request(req, None) is None 51 | assert mw.process_request(req, None) is None 52 | assert old_meta == req.meta 53 | 54 | # response is not changed 55 | response = Response("http://example.com", request=req) 56 | response2 = mw.process_response(req, response, None) 57 | response3 = cookie_mw.process_response(req, response, None) 58 | assert response2 is response 59 | assert response3 is response 60 | assert response3.url == "http://example.com" 61 | 62 | 63 | def test_splash_request(): 64 | mw = _get_mw() 65 | cookie_mw = _get_cookie_mw() 66 | 67 | req = SplashRequest("http://example.com?foo=bar&url=1&wait=100") 68 | assert repr(req) == "<GET http://example.com?foo=bar&url=1&wait=100>" 69 | 70 | # check request preprocessing 71 | req2 = cookie_mw.process_request(req, None) or req 72 | req2 = mw.process_request(req2, None) or req2 73 | 74 | assert req2 is not None 75 | assert req2 is not req 76 | assert req2.url == "http://127.0.0.1:8050/render.html" 77 | assert req2.headers == {b'Content-Type': [b'application/json']} 78 | assert req2.method == 'POST' 79 | assert isinstance(req2, SplashRequest) 80 | assert repr(req2) == "<POST http://127.0.0.1:8050/render.html>" 81 | 82 | expected_body = {'url': req.url} 83 | assert json.loads(to_unicode(req2.body)) == expected_body 84 | 85 | # check response post-processing 86 | response = TextResponse("http://127.0.0.1:8050/render.html", 87 | # Scrapy doesn't pass request to constructor 88 | # request=req2, 89 | headers={b'Content-Type': b'text/html'}, 90 | body=b"<html><body>Hello</body></html>") 91 | response2 = mw.process_response(req2, response, None) 92 | response2 = cookie_mw.process_response(req2, response2, None) 93 | assert isinstance(response2, scrapy_splash.SplashTextResponse) 94 | assert response2 is not response 95 | assert response2.real_url == req2.url 96 | assert response2.url == req.url 97 | assert response2.body == b"<html><body>Hello</body></html>" 98 | assert response2.css("body").extract_first() == "<body>Hello</body>"
99 | assert response2.headers == {b'Content-Type': [b'text/html']} 100 | 101 | # check .replace method 102 | response3 = response2.replace(status=404) 103 | assert response3.status == 404 104 | assert isinstance(response3, scrapy_splash.SplashTextResponse) 105 | for attr in ['url', 'real_url', 'headers', 'body']: 106 | assert getattr(response3, attr) == getattr(response2, attr) 107 | 108 | 109 | def test_dont_process_response(): 110 | mw = _get_mw() 111 | req = SplashRequest("http://example.com/", 112 | endpoint="render.html", 113 | dont_process_response=True, 114 | ) 115 | req2 = mw.process_request(req, None) 116 | resp = Response("http://example.com/") 117 | resp2 = mw.process_response(req2, resp, None) 118 | assert resp2.__class__ is Response 119 | assert resp2 is resp 120 | 121 | 122 | def test_splash_request_parameters(): 123 | mw = _get_mw() 124 | cookie_mw = _get_cookie_mw() 125 | 126 | def cb(): 127 | pass 128 | 129 | req = SplashRequest("http://example.com/#!start", cb, 'POST', 130 | body="foo=bar", 131 | splash_url="http://mysplash.example.com", 132 | slot_policy=SlotPolicy.SINGLE_SLOT, 133 | endpoint="execute", 134 | splash_headers={'X-My-Header': 'value'}, 135 | args={ 136 | "lua_source": "function main() end", 137 | "myarg": 3.0, 138 | }, 139 | magic_response=False, 140 | headers={'X-My-Header': 'value'} 141 | ) 142 | req2 = cookie_mw.process_request(req, None) or req 143 | req2 = mw.process_request(req2, None) or req2 144 | 145 | assert req2.meta['ajax_crawlable'] is True 146 | assert req2.meta['splash'] == { 147 | 'endpoint': 'execute', 148 | 'splash_url': "http://mysplash.example.com", 149 | 'slot_policy': SlotPolicy.SINGLE_SLOT, 150 | 'splash_headers': {'X-My-Header': 'value'}, 151 | 'magic_response': False, 152 | 'session_id': 'default', 153 | 'http_status_from_error_code': True, 154 | 'args': { 155 | 'url': "http://example.com/#!start", 156 | 'http_method': 'POST', 157 | 'body': 'foo=bar', 158 | 'cookies': [], 159 | 'lua_source': 'function main() end', 160 | 'myarg': 3.0, 161 | 'headers': { 162 | 'X-My-Header': 'value', 163 | } 164 | }, 165 | } 166 | assert req2.callback == cb 167 | assert req2.headers == { 168 | b'Content-Type': [b'application/json'], 169 | b'X-My-Header': [b'value'], 170 | } 171 | 172 | # check response post-processing 173 | res = { 174 | 'html': 'Hello', 175 | 'num_divs': 0.0, 176 | } 177 | res_body = json.dumps(res) 178 | response = TextResponse("http://mysplash.example.com/execute", 179 | # Scrapy doesn't pass request to constructor 180 | # request=req2, 181 | headers={b'Content-Type': b'application/json'}, 182 | body=res_body.encode('utf8')) 183 | response2 = mw.process_response(req2, response, None) 184 | response2 = cookie_mw.process_response(req2, response2, None) 185 | assert isinstance(response2, scrapy_splash.SplashJsonResponse) 186 | assert response2 is not response 187 | assert response2.real_url == req2.url 188 | assert response2.url == req.meta['splash']['args']['url'] 189 | assert response2.data == res 190 | assert response2.body == res_body.encode('utf8') 191 | assert response2.text == response2.text == res_body 192 | assert response2.encoding == 'utf8' 193 | assert response2.headers == {b'Content-Type': [b'application/json']} 194 | assert response2.splash_response_headers == response2.headers 195 | assert response2.status == response2.splash_response_status == 200 196 | 197 | 198 | def test_magic_response(): 199 | mw = _get_mw() 200 | cookie_mw = _get_cookie_mw() 201 | 202 | req = SplashRequest('http://example.com/', 203 | 
endpoint='execute', 204 | args={'lua_source': 'function main() end'}, 205 | magic_response=True, 206 | cookies=[{'name': 'foo', 'value': 'bar'}]) 207 | req = cookie_mw.process_request(req, None) or req 208 | req = mw.process_request(req, None) or req 209 | 210 | resp_data = { 211 | 'url': "http://exmaple.com/#id42", 212 | 'html': 'Hello 404', 213 | 'http_status': 404, 214 | 'headers': [ 215 | {'name': 'Content-Type', 'value': "text/html"}, 216 | {'name': 'X-My-Header', 'value': "foo"}, 217 | {'name': 'Set-Cookie', 'value': "bar=baz"}, 218 | ], 219 | 'cookies': [ 220 | {'name': 'foo', 'value': 'bar'}, 221 | {'name': 'bar', 'value': 'baz', 'domain': '.example.com'}, 222 | {'name': 'session', 'value': '12345', 'path': '/', 223 | 'expires': '2055-07-24T19:20:30Z'}, 224 | ], 225 | } 226 | resp = TextResponse("http://mysplash.example.com/execute", 227 | headers={b'Content-Type': b'application/json'}, 228 | body=json.dumps(resp_data).encode('utf8')) 229 | resp2 = mw.process_response(req, resp, None) 230 | resp2 = cookie_mw.process_response(req, resp2, None) 231 | assert isinstance(resp2, scrapy_splash.SplashJsonResponse) 232 | assert resp2.data == resp_data 233 | assert resp2.body == b'Hello 404' 234 | assert resp2.text == 'Hello 404' 235 | assert resp2.headers == { 236 | b'Content-Type': [b'text/html'], 237 | b'X-My-Header': [b'foo'], 238 | b'Set-Cookie': [b'bar=baz'], 239 | } 240 | assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']} 241 | assert resp2.status == 404 242 | assert resp2.splash_response_status == 200 243 | assert resp2.url == "http://exmaple.com/#id42" 244 | assert len(resp2.cookiejar) == 3 245 | cookies = [c for c in resp2.cookiejar] 246 | assert {(c.name, c.value) for c in cookies} == { 247 | ('bar', 'baz'), 248 | ('foo', 'bar'), 249 | ('session', '12345') 250 | } 251 | 252 | # send second request using the same session and check the resulting cookies 253 | req = SplashRequest('http://example.com/foo', 254 | endpoint='execute', 255 | args={'lua_source': 'function main() end'}, 256 | magic_response=True, 257 | cookies={'spam': 'ham'}) 258 | req = cookie_mw.process_request(req, None) or req 259 | req = mw.process_request(req, None) or req 260 | 261 | resp_data = { 262 | 'html': 'Hello', 263 | 'headers': [ 264 | {'name': 'Content-Type', 'value': "text/html"}, 265 | {'name': 'X-My-Header', 'value': "foo"}, 266 | {'name': 'Set-Cookie', 'value': "bar=baz"}, 267 | ], 268 | 'cookies': [ 269 | {'name': 'spam', 'value': 'ham'}, 270 | {'name': 'egg', 'value': 'spam'}, 271 | {'name': 'bar', 'value': 'baz', 'domain': '.example.com'}, 272 | #{'name': 'foo', 'value': ''}, -- this won't be in response 273 | {'name': 'session', 'value': '12345', 'path': '/', 274 | 'expires': '2056-07-24T19:20:30Z'}, 275 | ], 276 | } 277 | resp = TextResponse("http://mysplash.example.com/execute", 278 | headers={b'Content-Type': b'application/json'}, 279 | body=json.dumps(resp_data).encode('utf8')) 280 | resp2 = mw.process_response(req, resp, None) 281 | resp2 = cookie_mw.process_response(req, resp2, None) 282 | assert isinstance(resp2, scrapy_splash.SplashJsonResponse) 283 | assert resp2.data == resp_data 284 | cookies = [c for c in resp2.cookiejar] 285 | assert {c.name for c in cookies} == {'session', 'egg', 'bar', 'spam'} 286 | for c in cookies: 287 | if c.name == 'session': 288 | assert c.expires == 2731692030 289 | if c.name == 'spam': 290 | assert c.value == 'ham' 291 | 292 | 293 | def test_cookies(): 294 | mw = _get_mw() 295 | cookie_mw = _get_cookie_mw() 296 | 297 | def 
request_with_cookies(cookies): 298 | req = SplashRequest( 299 | 'http://example.com/foo', 300 | endpoint='execute', 301 | args={'lua_source': 'function main() end'}, 302 | magic_response=True, 303 | cookies=cookies) 304 | req = cookie_mw.process_request(req, None) or req 305 | req = mw.process_request(req, None) or req 306 | return req 307 | 308 | def response_with_cookies(req, cookies): 309 | resp_data = { 310 | 'html': 'Hello', 311 | 'headers': [], 312 | 'cookies': cookies, 313 | } 314 | resp = TextResponse( 315 | 'http://mysplash.example.com/execute', 316 | headers={b'Content-Type': b'application/json'}, 317 | body=json.dumps(resp_data).encode('utf8')) 318 | resp = mw.process_response(req, resp, None) 319 | resp = cookie_mw.process_response(req, resp, None) 320 | return resp 321 | 322 | # Concurent requests 323 | req1 = request_with_cookies({'spam': 'ham'}) 324 | req2 = request_with_cookies({'bom': 'bam'}) 325 | resp1 = response_with_cookies(req1, [ 326 | {'name': 'spam', 'value': 'ham'}, 327 | {'name': 'spam_x', 'value': 'ham_x'}, 328 | ]) 329 | resp2 = response_with_cookies(req2, [ 330 | {'name': 'spam', 'value': 'ham'}, # because req2 was made after req1 331 | {'name': 'bom_x', 'value': 'bam_x'}, 332 | ]) 333 | assert resp1.cookiejar is resp2.cookiejar 334 | cookies = {c.name: c.value for c in resp1.cookiejar} 335 | assert cookies == {'spam': 'ham', 'spam_x': 'ham_x', 'bom_x': 'bam_x'} 336 | 337 | # Removing already removed 338 | req1 = request_with_cookies({'spam': 'ham'}) 339 | req2 = request_with_cookies({'spam': 'ham', 'pom': 'pam'}) 340 | resp2 = response_with_cookies(req2, [ 341 | {'name': 'pom', 'value': 'pam'}, 342 | ]) 343 | resp1 = response_with_cookies(req1, []) 344 | assert resp1.cookiejar is resp2.cookiejar 345 | cookies = {c.name: c.value for c in resp1.cookiejar} 346 | assert cookies == {'pom': 'pam'} 347 | 348 | 349 | def test_magic_response2(): 350 | # check 'body' handling and another 'headers' format 351 | mw = _get_mw() 352 | req = SplashRequest('http://example.com/', magic_response=True, 353 | headers={'foo': 'bar'}, dont_send_headers=True) 354 | req = mw.process_request(req, None) or req 355 | assert 'headers' not in req.meta['splash']['args'] 356 | 357 | resp_data = { 358 | 'body': base64.b64encode(b"binary data").decode('ascii'), 359 | 'headers': {'Content-Type': 'text/plain'}, 360 | } 361 | resp = TextResponse("http://mysplash.example.com/execute", 362 | headers={b'Content-Type': b'application/json'}, 363 | body=json.dumps(resp_data).encode('utf8')) 364 | resp2 = mw.process_response(req, resp, None) 365 | assert resp2.data == resp_data 366 | assert resp2.body == b'binary data' 367 | assert resp2.headers == {b'Content-Type': [b'text/plain']} 368 | assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']} 369 | assert resp2.status == resp2.splash_response_status == 200 370 | assert resp2.url == "http://example.com/" 371 | 372 | 373 | def test_unicode_url(): 374 | mw = _get_mw() 375 | req = SplashRequest( 376 | # note unicode URL 377 | u"http://example.com/", endpoint='execute') 378 | req2 = mw.process_request(req, None) or req 379 | res = {'html': 'Hello'} 380 | res_body = json.dumps(res) 381 | response = TextResponse("http://mysplash.example.com/execute", 382 | # Scrapy doesn't pass request to constructor 383 | # request=req2, 384 | headers={b'Content-Type': b'application/json'}, 385 | body=res_body.encode('utf8')) 386 | response2 = mw.process_response(req2, response, None) 387 | assert response2.url == "http://example.com/" 388 | 389 | 
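# --- Editor's note (not part of the original test module) ---------------------
# The tests above verify how SplashMiddleware turns a SplashRequest into a JSON
# POST to the Splash HTTP API.  Before the middleware runs, everything given to
# SplashRequest is stored under request.meta['splash'], which several asserts
# above rely on.  A tiny standalone sketch (URL and Lua snippet are made up):

from scrapy_splash import SplashRequest as _SketchSplashRequest

_sketch_req = _SketchSplashRequest(
    "http://example.com/page",
    endpoint="execute",
    args={"lua_source": "function main(splash) return splash:html() end",
          "wait": 0.5},
)
# Endpoint and args live in meta until the downloader middleware rewrites the
# request into a POST to <SPLASH_URL>/execute with these args as JSON body.
assert _sketch_req.meta["splash"]["endpoint"] == "execute"
assert _sketch_req.meta["splash"]["args"]["wait"] == 0.5
assert _sketch_req.meta["splash"]["args"]["url"] == "http://example.com/page"
# ------------------------------------------------------------------------------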
390 | def test_magic_response_http_error(): 391 | mw = _get_mw() 392 | req = SplashRequest('http://example.com/foo') 393 | req = mw.process_request(req, None) or req 394 | 395 | resp_data = { 396 | "info": { 397 | "error": "http404", 398 | "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404", 399 | "line_number": 3, 400 | "type": "LUA_ERROR", 401 | "source": "[string \"function main(splash)\r...\"]" 402 | }, 403 | "description": "Error happened while executing Lua script", 404 | "error": 400, 405 | "type": "ScriptError" 406 | } 407 | resp = TextResponse("http://mysplash.example.com/execute", status=400, 408 | headers={b'Content-Type': b'application/json'}, 409 | body=json.dumps(resp_data).encode('utf8')) 410 | resp = mw.process_response(req, resp, None) 411 | assert resp.data == resp_data 412 | assert resp.status == 404 413 | assert resp.splash_response_status == 400 414 | assert resp.url == "http://example.com/foo" 415 | 416 | 417 | def test_change_response_class_to_text(): 418 | mw = _get_mw() 419 | req = SplashRequest('http://example.com/', magic_response=True) 420 | req = mw.process_request(req, None) or req 421 | # Such response can come when downloading a file, 422 | # or returning splash:html(): the headers say it's binary, 423 | # but it can be decoded so it becomes a TextResponse. 424 | resp = TextResponse('http://mysplash.example.com/execute', 425 | headers={b'Content-Type': b'application/pdf'}, 426 | body=b'ascii binary data', 427 | encoding='utf-8') 428 | resp2 = mw.process_response(req, resp, None) 429 | assert isinstance(resp2, TextResponse) 430 | assert resp2.url == 'http://example.com/' 431 | assert resp2.headers == {b'Content-Type': [b'application/pdf']} 432 | assert resp2.body == b'ascii binary data' 433 | 434 | 435 | def test_change_response_class_to_json_binary(): 436 | mw = _get_mw() 437 | # We set magic_response to False, because it's not a kind of data we would 438 | # expect from splash: we just return binary data. 439 | # If we set magic_response to True, the middleware will fail, 440 | # but this is ok because magic_response presumes we are expecting 441 | # a valid splash json response. 442 | req = SplashRequest('http://example.com/', magic_response=False) 443 | req = mw.process_request(req, None) or req 444 | resp = Response('http://mysplash.example.com/execute', 445 | headers={b'Content-Type': b'application/json'}, 446 | body=b'non-decodable data: \x98\x11\xe7\x17\x8f', 447 | ) 448 | resp2 = mw.process_response(req, resp, None) 449 | assert isinstance(resp2, Response) 450 | assert resp2.url == 'http://example.com/' 451 | assert resp2.headers == {b'Content-Type': [b'application/json']} 452 | assert resp2.body == b'non-decodable data: \x98\x11\xe7\x17\x8f' 453 | 454 | 455 | def test_magic_response_caching(tmpdir): 456 | # prepare middlewares 457 | spider = scrapy.Spider(name='foo') 458 | crawler = _get_crawler({ 459 | 'HTTPCACHE_DIR': str(tmpdir.join('cache')), 460 | 'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage', 461 | 'HTTPCACHE_ENABLED': True 462 | }) 463 | cache_mw = HttpCacheMiddleware.from_crawler(crawler) 464 | mw = _get_mw() 465 | cookie_mw = _get_cookie_mw() 466 | 467 | def _get_req(): 468 | return SplashRequest( 469 | url="http://example.com", 470 | endpoint='execute', 471 | magic_response=True, 472 | args={'lua_source': 'function main(splash) end'}, 473 | ) 474 | 475 | # Emulate Scrapy middleware chain. 
476 | 477 | # first call 478 | req = _get_req() 479 | req = cookie_mw.process_request(req, spider) or req 480 | req = mw.process_request(req, spider) or req 481 | req = cache_mw.process_request(req, spider) or req 482 | assert isinstance(req, scrapy.Request) # first call; the cache is empty 483 | 484 | resp_data = { 485 | 'html': "<html><body>Hello</body></html>", 486 | 'render_time': 0.5, 487 | } 488 | resp_body = json.dumps(resp_data).encode('utf8') 489 | resp = TextResponse("http://example.com", 490 | headers={b'Content-Type': b'application/json'}, 491 | body=resp_body) 492 | 493 | resp2 = cache_mw.process_response(req, resp, spider) 494 | resp3 = mw.process_response(req, resp2, spider) 495 | resp3 = cookie_mw.process_response(req, resp3, spider) 496 | 497 | assert resp3.text == "<html><body>Hello</body></html>" 498 | assert resp3.css("body").extract_first() == "<body>Hello</body>" 499 | assert resp3.data['render_time'] == 0.5 500 | 501 | # second call 502 | req = _get_req() 503 | req = cookie_mw.process_request(req, spider) or req 504 | req = mw.process_request(req, spider) or req 505 | cached_resp = cache_mw.process_request(req, spider) or req 506 | 507 | # response should be from cache: 508 | assert cached_resp.__class__ is JsonResponse 509 | assert cached_resp.body == resp_body 510 | resp2_1 = cache_mw.process_response(req, cached_resp, spider) 511 | resp3_1 = mw.process_response(req, resp2_1, spider) 512 | resp3_1 = cookie_mw.process_response(req, resp3_1, spider) 513 | 514 | assert isinstance(resp3_1, scrapy_splash.SplashJsonResponse) 515 | assert resp3_1.body == b"<html><body>Hello</body></html>" 516 | assert resp3_1.text == "<html><body>Hello</body></html>" 517 | assert resp3_1.css("body").extract_first() == "<body>Hello</body>" 518 | assert resp3_1.data['render_time'] == 0.5 519 | assert resp3_1.headers[b'Content-Type'] == b'text/html; charset=utf-8' 520 | 521 | 522 | def test_cache_args(): 523 | spider = scrapy.Spider(name='foo') 524 | mw = _get_mw() 525 | mw.crawler.spider = spider 526 | mw.spider_opened(spider) 527 | dedupe_mw = SplashDeduplicateArgsMiddleware() 528 | 529 | # ========= Send first request - it should use save_args: 530 | lua_source = 'function main(splash) end' 531 | req = SplashRequest('http://example.com/foo', 532 | endpoint='execute', 533 | args={'lua_source': lua_source}, 534 | cache_args=['lua_source']) 535 | 536 | assert req.meta['splash']['args']['lua_source'] == lua_source 537 | # <---- spider 538 | req, = list(dedupe_mw.process_start_requests([req], spider)) 539 | # ----> scheduler 540 | assert req.meta['splash']['args']['lua_source'] != lua_source 541 | assert list(mw._argument_values.values()) == [lua_source] 542 | assert list(mw._argument_values.keys()) == [req.meta['splash']['args']['lua_source']] 543 | # <---- scheduler 544 | # process request before sending it to the downloader 545 | req = mw.process_request(req, spider) or req 546 | # -----> downloader 547 | assert req.meta['splash']['args']['lua_source'] == lua_source 548 | assert req.meta['splash']['args']['save_args'] == ['lua_source'] 549 | assert 'load_args' not in req.meta['splash']['args'] 550 | assert req.meta['splash']['_local_arg_fingerprints'] == { 551 | 'lua_source': list(mw._argument_values.keys())[0] 552 | } 553 | # <---- downloader 554 | resp_body = b'{}' 555 | resp = TextResponse("http://example.com", 556 | headers={ 557 | b'Content-Type': b'application/json', 558 | b'X-Splash-Saved-Arguments': b'lua_source=ba001160ef96fe2a3f938fea9e6762e204a562b3' 559 | }, 560 | body=resp_body) 561 | resp = mw.process_response(req, resp, None) 562 | 563 | # ============ Send second request - it should use load_args 564 |
req2 = SplashRequest('http://example.com/bar', 565 | endpoint='execute', 566 | args={'lua_source': lua_source}, 567 | cache_args=['lua_source']) 568 | req2, item = list(dedupe_mw.process_spider_output(resp, [req2, {'key': 'value'}], spider)) 569 | assert item == {'key': 'value'} 570 | # ----> scheduler 571 | assert req2.meta['splash']['args']['lua_source'] != lua_source 572 | # <---- scheduler 573 | # process request before sending it to the downloader 574 | req2 = mw.process_request(req2, spider) or req2 575 | # -----> downloader 576 | assert req2.meta['splash']['args']['load_args'] == {"lua_source": "ba001160ef96fe2a3f938fea9e6762e204a562b3"} 577 | assert "lua_source" not in req2.meta['splash']['args'] 578 | assert "save_args" not in req2.meta['splash']['args'] 579 | assert json.loads(req2.body.decode('utf8')) == { 580 | 'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'}, 581 | 'url': 'http://example.com/bar' 582 | } 583 | # <---- downloader 584 | resp = TextResponse("http://example.com/bar", 585 | headers={b'Content-Type': b'application/json'}, 586 | body=b'{}') 587 | resp = mw.process_response(req, resp, spider) 588 | 589 | # =========== Third request is dispatched to another server where 590 | # =========== arguments are expired: 591 | req3 = SplashRequest('http://example.com/baz', 592 | endpoint='execute', 593 | args={'lua_source': lua_source}, 594 | cache_args=['lua_source']) 595 | req3, = list(dedupe_mw.process_spider_output(resp, [req3], spider)) 596 | # ----> scheduler 597 | assert req3.meta['splash']['args']['lua_source'] != lua_source 598 | # <---- scheduler 599 | req3 = mw.process_request(req3, spider) or req3 600 | # -----> downloader 601 | assert json.loads(req3.body.decode('utf8')) == { 602 | 'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'}, 603 | 'url': 'http://example.com/baz' 604 | } 605 | # <---- downloader 606 | 607 | resp_body = json.dumps({ 608 | "type": "ExpiredArguments", 609 | "description": "Arguments stored with ``save_args`` are expired", 610 | "info": {"expired": ["html"]}, 611 | "error": 498 612 | }) 613 | resp = TextResponse("127.0.0.1:8050", 614 | headers={b'Content-Type': b'application/json'}, 615 | status=498, 616 | body=resp_body.encode('utf8')) 617 | req4 = mw.process_response(req3, resp, spider) 618 | assert isinstance(req4, SplashRequest) 619 | 620 | # process this request again 621 | req4, = list(dedupe_mw.process_spider_output(resp, [req4], spider)) 622 | req4 = mw.process_request(req4, spider) or req4 623 | 624 | # it should become save_args request after all middlewares 625 | assert json.loads(req4.body.decode('utf8')) == { 626 | 'lua_source': 'function main(splash) end', 627 | 'save_args': ['lua_source'], 628 | 'url': 'http://example.com/baz' 629 | } 630 | assert mw._remote_keys == {} 631 | 632 | 633 | def test_splash_request_no_url(): 634 | mw = _get_mw() 635 | lua_source = "function main(splash) return {result='ok'} end" 636 | req1 = SplashRequest(meta={'splash': { 637 | 'args': {'lua_source': lua_source}, 638 | 'endpoint': 'execute', 639 | }}) 640 | req = mw.process_request(req1, None) 641 | assert req.url == 'http://127.0.0.1:8050/execute' 642 | assert json.loads(to_unicode(req.body)) == { 643 | 'url': 'about:blank', 644 | 'lua_source': lua_source 645 | } 646 | 647 | 648 | def test_post_request(): 649 | mw = _get_mw() 650 | for body in [b'', b'foo=bar']: 651 | req1 = scrapy.Request("http://example.com", 652 | method="POST", 653 | body=body, 654 | meta={'splash': {'endpoint': 'render.html'}}) 655 | 
req = mw.process_request(req1, None) 656 | assert json.loads(to_unicode(req.body)) == { 657 | 'url': 'http://example.com', 658 | 'http_method': 'POST', 659 | 'body': to_unicode(body), 660 | } 661 | 662 | 663 | def test_override_splash_url(): 664 | mw = _get_mw() 665 | req1 = scrapy.Request("http://example.com", meta={ 666 | 'splash': { 667 | 'endpoint': 'render.png', 668 | 'splash_url': 'http://splash.example.com' 669 | } 670 | }) 671 | req = mw.process_request(req1, None) 672 | req = mw.process_request(req, None) or req 673 | assert req.url == 'http://splash.example.com/render.png' 674 | assert json.loads(to_unicode(req.body)) == {'url': req1.url} 675 | 676 | 677 | def test_url_with_fragment(): 678 | mw = _get_mw() 679 | url = "http://example.com#id1" 680 | req = scrapy.Request("http://example.com", meta={ 681 | 'splash': {'args': {'url': url}} 682 | }) 683 | req = mw.process_request(req, None) or req 684 | assert json.loads(to_unicode(req.body)) == {'url': url} 685 | 686 | 687 | def test_splash_request_url_with_fragment(): 688 | mw = _get_mw() 689 | url = "http://example.com#id1" 690 | req = SplashRequest(url) 691 | req = mw.process_request(req, None) or req 692 | assert json.loads(to_unicode(req.body)) == {'url': url} 693 | 694 | 695 | def test_float_wait_arg(): 696 | mw = _get_mw() 697 | req1 = scrapy.Request("http://example.com", meta={ 698 | 'splash': { 699 | 'endpoint': 'render.html', 700 | 'args': {'wait': 0.5} 701 | } 702 | }) 703 | req = mw.process_request(req1, None) 704 | assert json.loads(to_unicode(req.body)) == {'url': req1.url, 'wait': 0.5} 705 | 706 | 707 | def test_slot_policy_single_slot(): 708 | mw = _get_mw() 709 | meta = {'splash': { 710 | 'slot_policy': scrapy_splash.SlotPolicy.SINGLE_SLOT 711 | }} 712 | 713 | req1 = scrapy.Request("http://example.com/path?key=value", meta=meta) 714 | req1 = mw.process_request(req1, None) 715 | 716 | req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta) 717 | req2 = mw.process_request(req2, None) 718 | 719 | assert req1.meta.get('download_slot') 720 | assert req1.meta['download_slot'] == req2.meta['download_slot'] 721 | 722 | 723 | def test_slot_policy_per_domain(): 724 | mw = _get_mw() 725 | meta = {'splash': { 726 | 'slot_policy': scrapy_splash.SlotPolicy.PER_DOMAIN 727 | }} 728 | 729 | req1 = scrapy.Request("http://example.com/path?key=value", meta=meta) 730 | req1 = mw.process_request(req1, None) 731 | 732 | req2 = scrapy.Request("http://example.com/path2", meta=meta) 733 | req2 = mw.process_request(req2, None) 734 | 735 | req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta) 736 | req3 = mw.process_request(req3, None) 737 | 738 | assert req1.meta.get('download_slot') 739 | assert req3.meta.get('download_slot') 740 | 741 | assert req1.meta['download_slot'] == req2.meta['download_slot'] 742 | assert req1.meta['download_slot'] != req3.meta['download_slot'] 743 | 744 | 745 | def test_slot_policy_scrapy_default(): 746 | mw = _get_mw() 747 | req = scrapy.Request("http://example.com", meta={'splash': { 748 | 'slot_policy': scrapy_splash.SlotPolicy.SCRAPY_DEFAULT 749 | }}) 750 | req = mw.process_request(req, None) 751 | assert 'download_slot' not in req.meta 752 | 753 | 754 | def test_adjust_timeout(): 755 | mw = _get_mw() 756 | req1 = scrapy.Request("http://example.com", meta={ 757 | 'splash': {'args': {'timeout': 60, 'html': 1}}, 758 | 759 | # download_timeout is always present, 760 | # it is set by DownloadTimeoutMiddleware 761 | 'download_timeout': 30, 762 | }) 763 | req1 = 
mw.process_request(req1, None) 764 | assert req1.meta['download_timeout'] > 60 765 | 766 | req2 = scrapy.Request("http://example.com", meta={ 767 | 'splash': {'args': {'html': 1}}, 768 | 'download_timeout': 30, 769 | }) 770 | req2 = mw.process_request(req2, None) 771 | assert req2.meta['download_timeout'] == 30 772 | 773 | 774 | def test_auth(): 775 | def assert_auth_header(user, pwd, header): 776 | mw = _get_mw({'SPLASH_USER': user, 'SPLASH_PASS': pwd}) 777 | req = mw.process_request(SplashRequest("http://example.com"), None) 778 | assert 'Authorization' in req.headers 779 | assert req.headers['Authorization'] == header 780 | 781 | def assert_no_auth_header(user, pwd): 782 | if user is not None or pwd is not None: 783 | mw = _get_mw({'SPLASH_USER': user, 'SPLASH_PASS': pwd}) 784 | else: 785 | mw = _get_mw() 786 | req = mw.process_request(SplashRequest("http://example.com"), None) 787 | assert 'Authorization' not in req.headers 788 | 789 | assert_auth_header('root', '', b'Basic cm9vdDo=') 790 | assert_auth_header('root', 'pwd', b'Basic cm9vdDpwd2Q=') 791 | assert_auth_header('', 'pwd', b'Basic OnB3ZA==') 792 | 793 | assert_no_auth_header('', '') 794 | assert_no_auth_header(None, None) -------------------------------------------------------------------------------- /tests/test_request.py: -------------------------------------------------------------------------------- 1 | try: 2 | from urllib.parse import parse_qs 3 | except ImportError: 4 | from urlparse import parse_qs 5 | 6 | from scrapy.http import HtmlResponse 7 | from scrapy_splash import SplashRequest, SplashFormRequest 8 | 9 | 10 | def test_meta_None(): 11 | req1 = SplashRequest('http://example.com') 12 | req2 = SplashRequest('http://example.com', meta=None) 13 | assert req1.meta == req2.meta 14 | 15 | 16 | def test_splash_form_request(): 17 | req = SplashFormRequest( 18 | 'http://example.com', formdata={'foo': 'bar'}) 19 | assert req.method == 'POST' 20 | assert req.body == b'foo=bar' 21 | assert req.meta['splash']['args']['url'] == 'http://example.com' 22 | 23 | req = SplashFormRequest( 24 | 'http://example.com', method='GET', formdata={'foo': 'bar'}, 25 | endpoint='execute') 26 | assert req.method == 'GET' 27 | assert req.body == b'' 28 | assert req.url == req.meta['splash']['args']['url'] ==\ 29 | 'http://example.com?foo=bar' 30 | assert req.meta['splash']['endpoint'] == 'execute' 31 | 32 | 33 | def test_form_request_from_response(): 34 | # Copied from scrapy tests (test_from_response_submit_not_first_clickable) 35 | def _buildresponse(body, **kwargs): 36 | kwargs.setdefault('body', body) 37 | kwargs.setdefault('url', 'http://example.com') 38 | kwargs.setdefault('encoding', 'utf-8') 39 | return HtmlResponse(**kwargs) 40 | response = _buildresponse( 41 | """
<form action="get.php" method="GET"> 42 | <input type="submit" name="clickable1" value="clicked1"> 43 | <input type="hidden" name="one" value="1"> 44 | <input type="hidden" name="two" value="3"> 45 | <input type="submit" name="clickable2" value="clicked2"> 46 | </form>
""") 47 | req = SplashFormRequest.from_response( 48 | response, formdata={'two': '2'}, clickdata={'name': 'clickable2'}) 49 | assert req.method == 'GET' 50 | assert req.meta['splash']['args']['url'] == req.url 51 | fs = parse_qs(req.url.partition('?')[2], True) 52 | assert fs['clickable2'] == ['clicked2'] 53 | assert 'clickable1' not in fs 54 | assert fs['one'] == ['1'] 55 | assert fs['two'] == ['2'] 56 | 57 | 58 | def test_splash_request_meta(): 59 | meta = {'foo': 'bar'} 60 | req = SplashRequest('http://example.com', meta=meta) 61 | assert 'splash' in req.meta 62 | assert req.meta['foo'] == 'bar' 63 | assert meta == {'foo': 'bar'} 64 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import json 4 | 5 | from hypothesis import given, assume 6 | from hypothesis import strategies as st 7 | from scrapy.http import Headers 8 | from scrapy_splash.utils import ( 9 | headers_to_scrapy, 10 | _fast_hash, 11 | json_based_hash, 12 | dict_hash 13 | ) 14 | 15 | 16 | def test_headers_to_scrapy(): 17 | assert headers_to_scrapy(None) == Headers() 18 | assert headers_to_scrapy({}) == Headers() 19 | assert headers_to_scrapy([]) == Headers() 20 | 21 | html_headers = Headers({'Content-Type': 'text/html'}) 22 | 23 | assert headers_to_scrapy({'Content-Type': 'text/html'}) == html_headers 24 | assert headers_to_scrapy([('Content-Type', 'text/html')]) == html_headers 25 | assert headers_to_scrapy([{'name': 'Content-Type', 'value': 'text/html'}]) == html_headers 26 | 27 | 28 | _primitive = ( 29 | st.floats(allow_infinity=False, allow_nan=False) | 30 | st.booleans() | 31 | st.text() | 32 | st.none() | 33 | st.integers() 34 | ) 35 | _data = st.recursive(_primitive, 36 | lambda children: ( 37 | children | 38 | st.lists(children) | 39 | st.tuples(children) | 40 | st.dictionaries(st.text(), children) | 41 | st.tuples(st.just('h'), children) 42 | ), 43 | max_leaves=5, 44 | ) 45 | _data_notuples = st.recursive(_primitive, 46 | lambda children: ( 47 | children | 48 | st.lists(children) | 49 | st.dictionaries(st.text(), children) 50 | ), 51 | max_leaves=5, 52 | ) 53 | 54 | 55 | @given(_data, _data) 56 | def test_fast_hash(val1, val2): 57 | def _dump(v): 58 | return json.dumps(v, sort_keys=True) 59 | assume(_dump(val1) != _dump(val2)) 60 | assert _fast_hash(val1) == _fast_hash(val1) 61 | assert _fast_hash(val1) != _fast_hash(val2) 62 | 63 | 64 | @given(_data, _data) 65 | def test_dict_hash(val1, val2): 66 | assume(val1 != val2) 67 | assert dict_hash(val1) == dict_hash(val1) 68 | assert dict_hash(val1) != dict_hash(val2) 69 | 70 | 71 | @given(_data_notuples, _data_notuples) 72 | def test_json_based_hash(val1, val2): 73 | assume(val1 != val2) 74 | assert json_based_hash(val1) == json_based_hash(val1) 75 | assert json_based_hash(val1) != json_based_hash(val2) 76 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import pytest 4 | from pytest_twisted import inlineCallbacks 5 | from twisted.internet.defer import returnValue 6 | from scrapy.crawler import Crawler 7 | 8 | from .mockserver import MockServer 9 | 10 | 11 | requires_splash = pytest.mark.skipif( 12 | not os.environ.get('SPLASH_URL', ''), 13 | reason="set SPLASH_URL environment variable to 
run integrational tests" 14 | ) 15 | 16 | 17 | @inlineCallbacks 18 | def crawl_items( 19 | spider_cls, 20 | resource_cls, 21 | settings, 22 | spider_kwargs=None, 23 | url_path="", 24 | ): 25 | """ Use spider_cls to crawl resource_cls. URL of the resource is passed 26 | to the spider as ``url`` argument. 27 | Return ``(items, resource_url, crawler)`` tuple. 28 | """ 29 | spider_kwargs = {} if spider_kwargs is None else spider_kwargs 30 | crawler = make_crawler(spider_cls, settings) 31 | with MockServer(resource_cls) as s: 32 | print("mock server", s.root_url) 33 | root_url = s.root_url + url_path 34 | yield crawler.crawl(url=root_url, **spider_kwargs) 35 | items = getattr(crawler.spider, 'collected_items', []) 36 | result = items, root_url, crawler 37 | returnValue(result) 38 | 39 | 40 | def make_crawler(spider_cls, settings): 41 | if not getattr(spider_cls, 'name', None): 42 | class Spider(spider_cls): 43 | name = 'test_spider' 44 | Spider.__name__ = spider_cls.__name__ 45 | Spider.__module__ = spider_cls.__module__ 46 | spider_cls = Spider 47 | return Crawler(spider_cls, settings) 48 | 49 | 50 | class CollectorPipeline: 51 | def process_item(self, item, spider): 52 | if not hasattr(spider, 'collected_items'): 53 | spider.collected_items = [] 54 | spider.collected_items.append(item) 55 | return item 56 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py37,py38,py39,py310,py311 8 | 9 | [common] 10 | deps = 11 | pytest >= 3.3.2 12 | pytest-cov >= 2.5.1 13 | pytest-twisted >= 1.6 14 | pytest-xdist >= 1.22 15 | hypothesis >= 3.44.14 16 | hypothesis-pytest 17 | service_identity 18 | 19 | [testenv] 20 | passenv = SPLASH_URL 21 | deps = 22 | {[common]deps} 23 | scrapy 24 | commands = 25 | pip install -e . 26 | py.test --doctest-modules --doctest-glob '*.py,*.rst' --cov=scrapy_splash --cov-report=xml {posargs:README.rst scrapy_splash tests} 27 | --------------------------------------------------------------------------------
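As a closing illustration of what the property-based tests in tests/test_utils.py assert about the hashing helpers: they check determinism and sensitivity to content, the two properties that make these helpers usable for request fingerprinting. A concrete, hedged sketch of the same two properties (the argument values below are made up):

    from scrapy_splash.utils import dict_hash, json_based_hash

    args = {"lua_source": "function main(splash) end", "wait": 0.5}

    # Deterministic: hashing the same value twice gives the same digest.
    assert dict_hash(args) == dict_hash(args)
    assert json_based_hash(args) == json_based_hash(args)

    # Content-sensitive: a different value gives a different digest.
    assert dict_hash(args) != dict_hash({"lua_source": "function main(splash) end",
                                         "wait": 0.7})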