├── .coveragerc ├── .github └── workflows │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── CHANGES.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── example ├── scrapy.cfg └── scrashtest │ ├── __init__.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── quotes.py ├── pyproject.toml ├── pytest.ini ├── scrapy_splash ├── __init__.py ├── cache.py ├── cookies.py ├── dupefilter.py ├── middleware.py ├── request.py ├── response.py ├── responsetypes.py └── utils.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── mockserver.py ├── resources.py ├── test_cookies.py ├── test_fingerprints.py ├── test_integration.py ├── test_middleware.py ├── test_request.py ├── test_utils.py └── utils.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = true 3 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | push: 4 | tags: 5 | - '[0-9]+.[0-9]+.[0-9]+' 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | environment: 10 | name: pypi 11 | url: https://pypi.org/p/${{ github.event.repository.name }} 12 | permissions: 13 | id-token: write 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: 3.13 19 | - run: | 20 | python -m pip install --upgrade build 21 | python -m build 22 | - name: Publish to PyPI 23 | uses: pypa/gh-action-pypi-publish@release/v1 24 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 7 | runs-on: ${{ matrix.os || 'ubuntu-latest' }} 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | include: 12 | - python-version: '3.9' 13 | - python-version: '3.10' 14 | - python-version: '3.11' 15 | - python-version: '3.12' 16 | - python-version: '3.13' 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | 21 | - name: Run Splash 22 | run: | 23 | docker run --rm -d -p 8050:8050 --network host scrapinghub/splash 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v2 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Run tests 31 | env: 32 | TOXENV: py 33 | run: | 34 | pip install -U tox 35 | SPLASH_URL=http://127.0.0.1:8050 tox 36 | 37 | - name: Upload coverage report 38 | uses: codecov/codecov-action@v5 39 | with: 40 | token: ${{ secrets.CODECOV_TOKEN }} 41 | 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .tox 3 | build 4 | dist 5 | scrapyjs.egg-info 6 | scrapy_splash.egg-info 7 | .cache 8 | .coverage 9 | .scrapy 10 | htmlcov 11 | .hypothesis 12 | .ipynb_checkpoints 13 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | Changes 2 | ======= 3 | 4 | 0.11.1 (2025-02-11) 5 | ------------------- 6 | 7 | * Fixed ``SplashAwareDupeFilter`` failing to initialize. 
8 | 9 | 0.11.0 (2025-02-06) 10 | ------------------- 11 | 12 | * Scrapy 2.4 or higher is now required. 13 | 14 | * The ``url`` parameter of ``SplashRequest`` is once again optional, reverting 15 | a backward-incompatible change from scrapy-splash 0.9.0. 16 | 17 | 0.10.1 (2025-01-27) 18 | ------------------- 19 | 20 | * Fixed ``SplashAwareDupeFilter`` failing to initialize. 21 | 22 | * Improved the README. 23 | 24 | 0.10.0 (2025-01-21) 25 | ------------------- 26 | 27 | * Removed official support for Python 3.7 and 3.8, and added official support 28 | for Python 3.12 and 3.13. 29 | 30 | * Added support for Scrapy 2.12+. 31 | 32 | This includes deprecating ``SplashAwareDupeFilter`` and 33 | ``SplashAwareFSCacheStorage`` in favor of the corresponding built-in, default 34 | Scrapy components, and instead using the new ``SplashRequestFingerprinter`` 35 | component to ensure request fingerprinting for Splash requests stays the 36 | same, now for every Scrapy component doing request fingerprinting and not 37 | only for duplicate filtering and HTTP caching. 38 | 39 | 0.9.0 (2023-02-03) 40 | ------------------ 41 | 42 | * Removed official support for Python 2.7, 3.4, 3.5 and 3.6, and added official 43 | support for Python 3.9, 3.10 and 3.11. 44 | 45 | * Deprecated ``SplashJsonResponse.body_as_unicode()``, to be replaced by 46 | ``SplashJsonResponse.text``. 47 | 48 | * Removed calls to obsolete ``to_native_str``, removed in Scrapy 2.8. 49 | 50 | 0.8.0 (2021-10-05) 51 | ------------------ 52 | 53 | * **Security bug fix:** 54 | 55 | If you use HttpAuthMiddleware_ (i.e. the ``http_user`` and ``http_pass`` 56 | spider attributes) for Splash authentication, any non-Splash request will 57 | expose your credentials to the request target. This includes ``robots.txt`` 58 | requests sent by Scrapy when the ``ROBOTSTXT_OBEY`` setting is set to 59 | ``True``. 60 | 61 | Use the new ``SPLASH_USER`` and ``SPLASH_PASS`` settings instead to set 62 | your Splash authentication credentials safely. 63 | 64 | .. _HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth 65 | 66 | * Responses now expose the HTTP status code and headers from Splash as 67 | ``response.splash_response_status`` and 68 | ``response.splash_response_headers`` (#158) 69 | 70 | * The ``meta`` argument passed to the ``scrapy_splash.request.SplashRequest`` 71 | constructor is no longer modified (#164) 72 | 73 | * Website responses with 400 or 498 as HTTP status code are no longer 74 | handled as the equivalent Splash responses (#158) 75 | 76 | * Cookies are no longer sent to Splash itself (#156) 77 | 78 | * ``scrapy_splash.utils.dict_hash`` now also works with ``obj=None`` 79 | (``225793b``) 80 | 81 | * Our test suite now includes integration tests (#156) and tests can be run 82 | in parallel (``6fb8c41``) 83 | 84 | * There’s a new ‘Getting help’ section in the ``README.rst`` file (#161, 85 | #162), the documentation about ``SPLASH_SLOT_POLICY`` has been improved 86 | (#157) and a typo as been fixed (#121) 87 | 88 | * Made some internal improvements (``ee5000d``, ``25de545``, ``2aaa79d``) 89 | 90 | 91 | 0.7.2 (2017-03-30) 92 | ------------------ 93 | 94 | * fixed issue with response type detection. 95 | 96 | 0.7.1 (2016-12-20) 97 | ------------------ 98 | 99 | * Scrapy 1.0.x support is back; 100 | * README updates. 
101 | 102 | 0.7 (2016-05-16) 103 | ---------------- 104 | 105 | * ``SPLASH_COOKIES_DEBUG`` setting allows to log cookies 106 | sent and received to/from Splash in ``cookies`` request/response fields. 107 | It is similar to Scrapy's builtin ``COOKIES_DEBUG``, but works for 108 | Splash requests; 109 | * README cleanup. 110 | 111 | 0.6.1 (2016-04-29) 112 | ------------------ 113 | 114 | * Warning about HTTP methods is no longer logged for non-Splash requests. 115 | 116 | 0.6 (2016-04-20) 117 | ---------------- 118 | 119 | * ``SplashAwareDupeFilter`` and ``splash_request_fingerprint`` are improved: 120 | they now canonicalize URLs and take URL fragments in account; 121 | * ``cache_args`` value fingerprints are now calculated faster. 122 | 123 | 0.5 (2016-04-18) 124 | ---------------- 125 | 126 | * ``cache_args`` SplashRequest argument and 127 | ``request.meta['splash']['cache_args']`` key allow to save network traffic 128 | and disk storage by not storing duplicate Splash arguments in disk request 129 | queues and not sending them to Splash multiple times. This feature requires 130 | Splash 2.1+. 131 | 132 | To upgrade from v0.4 enable ``SplashDeduplicateArgsMiddleware`` in settings.py:: 133 | 134 | SPIDER_MIDDLEWARES = { 135 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 136 | } 137 | 138 | 0.4 (2016-04-14) 139 | ---------------- 140 | 141 | * SplashFormRequest class is added; it is a variant of FormRequest which uses 142 | Splash; 143 | * Splash parameters are no longer stored in request.meta twice; this change 144 | should decrease disk queues data size; 145 | * SplashMiddleware now increases request priority when rescheduling the request; 146 | this should decrease disk queue data size and help with stale cookie 147 | problems. 148 | 149 | 0.3 (2016-04-11) 150 | ---------------- 151 | 152 | Package is renamed from ``scrapyjs`` to ``scrapy-splash``. 153 | 154 | An easiest way to upgrade is to replace ``scrapyjs`` imports with 155 | ``scrapy_splash`` and update ``settings.py`` with new defaults 156 | (check the README). 157 | 158 | There are many new helpers to handle JavaScript rendering transparently; 159 | the recommended way is now to use ``scrapy_splash.SplashRequest`` instead 160 | of ``request.meta['splash']``. Please make sure to read the README if 161 | you're upgrading from scrapyjs - you may be able to drop some code from your 162 | project, especially if you want to access response html, handle cookies 163 | and headers. 164 | 165 | * new SplashRequest class; it can be used as a replacement for scrapy.Request 166 | to provide a better integration with Splash; 167 | * added support for POST requests; 168 | * SplashResponse, SplashTextResponse and SplashJsonResponse allow to 169 | handle Splash responses transparently, taking care of response.url, 170 | response.body, response.headers and response.status. SplashJsonResponse 171 | allows to access decoded response JSON data as ``response.data``. 
172 | * cookie handling improvements: it is possible to handle Scrapy and Splash 173 | cookies transparently; current cookiejar is exposed as response.cookiejar; 174 | * headers are passed to Splash by default; 175 | * URLs with fragments are handled automatically when using SplashRequest; 176 | * logging is improved: ``SplashRequest.__repr__`` shows both requested URL 177 | and Splash URL; 178 | * in case of Splash HTTP 400 errors the response is logged by default; 179 | * an issue with dupefilters is fixed: previously the order of keys in 180 | JSON request body could vary, making requests appear as non-duplicates; 181 | * it is now possible to pass custom headers to Splash server itself; 182 | * test coverage reports are enabled. 183 | 184 | 0.2 (2016-03-26) 185 | ---------------- 186 | 187 | * Scrapy 1.0 and 1.1 support; 188 | * Python 3 support; 189 | * documentation improvements; 190 | * project is moved to https://github.com/scrapy-plugins/scrapy-splash. 191 | 192 | 0.1.1 (2015-03-16) 193 | ------------------ 194 | 195 | Fixed fingerprint calculation for non-string meta values. 196 | 197 | 0.1 (2015-02-28) 198 | ---------------- 199 | 200 | Initial release 201 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapy developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Scrapy-Splash nor the names of its contributors may 15 | be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.rst 3 | include tox.ini 4 | recursive-include tests *.py 5 | recursive-include example *.py *.cfg 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============================================== 2 | Scrapy & JavaScript integration through Splash 3 | ============================================== 4 | 5 | .. image:: https://img.shields.io/pypi/v/scrapy-splash.svg 6 | :target: https://pypi.python.org/pypi/scrapy-splash 7 | :alt: PyPI Version 8 | 9 | .. image:: https://github.com/scrapy-plugins/scrapy-splash/workflows/Tests/badge.svg 10 | :target: https://github.com/scrapy-plugins/scrapy-splash/actions/workflows/tests.yml 11 | :alt: Test Status 12 | 13 | .. image:: http://codecov.io/github/scrapy-plugins/scrapy-splash/coverage.svg?branch=master 14 | :target: http://codecov.io/github/scrapy-plugins/scrapy-splash?branch=master 15 | :alt: Code Coverage 16 | 17 | This library provides Scrapy_ and JavaScript integration using Splash_. 18 | The license is BSD 3-clause. 19 | 20 | .. _Scrapy: https://github.com/scrapy/scrapy 21 | .. _Splash: https://github.com/scrapinghub/splash 22 | 23 | Installation 24 | ============ 25 | 26 | Install scrapy-splash using pip:: 27 | 28 | $ pip install scrapy-splash 29 | 30 | Scrapy-Splash uses Splash_ HTTP API, so you also need a Splash instance. 31 | Usually to install & run Splash, something like this is enough:: 32 | 33 | $ docker run -p 8050:8050 scrapinghub/splash 34 | 35 | Check Splash `install docs`_ for more info. 36 | 37 | .. _install docs: http://splash.readthedocs.org/en/latest/install.html 38 | 39 | 40 | Configuration 41 | ============= 42 | 43 | 1. Add the Splash server address to ``settings.py`` of your Scrapy project 44 | like this:: 45 | 46 | SPLASH_URL = 'http://192.168.59.103:8050' 47 | 48 | 2. Enable the Splash middleware by adding it to ``DOWNLOADER_MIDDLEWARES`` 49 | in your ``settings.py`` file and changing HttpCompressionMiddleware 50 | priority: 51 | 52 | .. code:: python 53 | 54 | DOWNLOADER_MIDDLEWARES = { 55 | 'scrapy_splash.SplashCookiesMiddleware': 723, 56 | 'scrapy_splash.SplashMiddleware': 725, 57 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 58 | } 59 | 60 | Order `723` is just before `HttpProxyMiddleware` (750) in default 61 | scrapy settings. 62 | 63 | HttpCompressionMiddleware priority should be changed in order to allow 64 | advanced response processing; see https://github.com/scrapy/scrapy/issues/1895 65 | for details. 66 | 67 | 3. Enable ``SplashDeduplicateArgsMiddleware`` by adding it to 68 | ``SPIDER_MIDDLEWARES`` in your ``settings.py``: 69 | 70 | .. code:: python 71 | 72 | SPIDER_MIDDLEWARES = { 73 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 74 | } 75 | 76 | This middleware is needed to support ``cache_args`` feature; it allows 77 | to save disk space by not storing duplicate Splash arguments multiple 78 | times in a disk request queue. If Splash 2.1+ is used the middleware 79 | also allows to save network traffic by not sending these duplicate 80 | arguments to Splash server multiple times. 81 | 82 | 4. Set a custom ``REQUEST_FINGERPRINTER_CLASS``: 83 | 84 | .. 
code:: python 85 | 86 | REQUEST_FINGERPRINTER_CLASS = 'scrapy_splash.SplashRequestFingerprinter' 87 | 88 | 89 | There are also some additional options available. 90 | Put them into your ``settings.py`` if you want to change the defaults: 91 | 92 | * ``SPLASH_COOKIES_DEBUG`` is ``False`` by default. 93 | Set to ``True`` to enable cookie debugging in the ``SplashCookiesMiddleware``. 94 | This option is similar to ``COOKIES_DEBUG`` 95 | for the built-in Scrapy cookies middleware: it logs sent and received cookies 96 | for all requests. 97 | * ``SPLASH_LOG_400`` is ``True`` by default - it instructs scrapy-splash to log all 400 errors 98 | from Splash. They are important because they show errors that occurred 99 | when executing the Splash script. Set it to ``False`` to disable this logging. 100 | * ``SPLASH_SLOT_POLICY`` is ``scrapy_splash.SlotPolicy.PER_DOMAIN`` (as object, not just a string) by default. 101 | It specifies how concurrency & politeness are maintained for Splash requests, 102 | and specifies the default value of the ``slot_policy`` argument for 103 | ``SplashRequest``, which is described below. 104 | * ``SCRAPY_SPLASH_REQUEST_FINGERPRINTER_BASE_CLASS`` is ``scrapy.settings.default_settings.REQUEST_FINGERPRINTER_CLASS`` by default. This changes the base class the Fingerprinter uses to get a fingerprint. 105 | 106 | 107 | Usage 108 | ===== 109 | 110 | Requests 111 | -------- 112 | 113 | The easiest way to render requests with Splash is to 114 | use ``scrapy_splash.SplashRequest``: 115 | 116 | .. code:: python 117 | 118 | yield SplashRequest(url, self.parse_result, 119 | args={ 120 | # optional; parameters passed to Splash HTTP API 121 | 'wait': 0.5, 122 | 123 | # 'url' is prefilled from request url 124 | # 'http_method' is set to 'POST' for POST requests 125 | # 'body' is set to request body for POST requests 126 | }, 127 | endpoint='render.json', # optional; default is render.html 128 | splash_url='', # optional; overrides SPLASH_URL 129 | slot_policy=scrapy_splash.SlotPolicy.PER_DOMAIN, # optional 130 | ) 131 | 132 | Alternatively, you can use a regular scrapy.Request and 133 | the ``'splash'`` Request `meta` key: 134 | 135 | .. code:: python 136 | 137 | yield scrapy.Request(url, self.parse_result, meta={ 138 | 'splash': { 139 | 'args': { 140 | # set rendering arguments here 141 | 'html': 1, 142 | 'png': 1, 143 | 144 | # 'url' is prefilled from request url 145 | # 'http_method' is set to 'POST' for POST requests 146 | # 'body' is set to request body for POST requests 147 | }, 148 | 149 | # optional parameters 150 | 'endpoint': 'render.json', # optional; default is render.json 151 | 'splash_url': '', # optional; overrides SPLASH_URL 152 | 'slot_policy': scrapy_splash.SlotPolicy.PER_DOMAIN, 153 | 'splash_headers': {}, # optional; a dict with headers sent to Splash 154 | 'dont_process_response': True, # optional, default is False 155 | 'dont_send_headers': True, # optional, default is False 156 | 'magic_response': False, # optional, default is True 157 | } 158 | }) 159 | 160 | Use the ``request.meta['splash']`` API in middlewares or when scrapy.Request 161 | subclasses are used (there is also ``SplashFormRequest`` described below). 162 | For example, ``meta['splash']`` makes it possible to create a middleware which enables 163 | Splash for all outgoing requests by default. 164 | 165 | ``SplashRequest`` is a convenient utility to fill ``request.meta['splash']``; 166 | it should be easier to use in most cases.
For each ``request.meta['splash']`` 167 | key there is a corresponding ``SplashRequest`` keyword argument: for example, 168 | to set ``meta['splash']['args']`` use ``SplashRequest(..., args=myargs)``. 169 | 170 | * ``meta['splash']['args']`` contains arguments sent to Splash. 171 | scrapy-splash adds some default keys/values to ``args``: 172 | 173 | * 'url' is set to request.url; 174 | * 'http_method' is set to 'POST' for POST requests; 175 | * 'body' is set to request.body for POST requests. 176 | 177 | You can override default values by setting them explicitly. 178 | 179 | Note that by default Scrapy escapes URL fragments using the AJAX escaping scheme. 180 | If you want to pass a URL with a fragment to Splash then set ``url`` 181 | in the ``args`` dict manually. This is handled automatically if you use 182 | ``SplashRequest``, but you need to keep that in mind if you use the raw 183 | ``meta['splash']`` API. 184 | 185 | Splash 1.8+ is required to handle POST requests; in earlier Splash versions 186 | 'http_method' and 'body' arguments are ignored. If you work with the ``/execute`` 187 | endpoint and want to support POST requests, you have to handle the 188 | ``http_method`` and ``body`` arguments in your Lua script manually. 189 | 190 | * ``meta['splash']['cache_args']`` is a list of argument names to cache 191 | on the Splash side. These arguments are sent to Splash only once, then cached 192 | values are used; this saves network traffic and decreases request 193 | queue disk usage. Use ``cache_args`` only for large arguments 194 | which don't change with each request; ``lua_source`` is a good candidate 195 | (if you don't use string formatting to build it). Splash 2.1+ is required 196 | for this feature to work. 197 | 198 | * ``meta['splash']['endpoint']`` is the Splash endpoint to use. 199 | In the case of SplashRequest, 200 | ``render.html`` 201 | is used by default. If you're using a raw scrapy.Request then 202 | ``render.json`` 203 | is the default (for historical reasons). It is better to always pass the endpoint 204 | explicitly. 205 | 206 | See the Splash `HTTP API docs`_ for a full list of available endpoints 207 | and parameters. 208 | 209 | .. _HTTP API docs: http://splash.readthedocs.org/en/latest/api.html 210 | 211 | * ``meta['splash']['splash_url']`` overrides the Splash URL set 212 | in ``settings.py``. 213 | 214 | * ``meta['splash']['splash_headers']`` makes it possible to add or change headers 215 | which are sent to the Splash server. Note that this option **is not** for 216 | setting headers which are sent to the remote website. 217 | 218 | * ``meta['splash']['slot_policy']`` customizes how 219 | concurrency & politeness are maintained for Splash requests. 220 | 221 | Currently there are 3 policies available: 222 | 223 | 1. ``scrapy_splash.SlotPolicy.PER_DOMAIN`` (default) - send Splash requests to 224 | downloader slots based on the URL being rendered. It is useful if you want 225 | to maintain per-domain politeness & concurrency settings. 226 | 227 | 2. ``scrapy_splash.SlotPolicy.SINGLE_SLOT`` - send all Splash requests to 228 | a single downloader slot. It is useful if you want to throttle requests 229 | to Splash. 230 | 231 | 3. ``scrapy_splash.SlotPolicy.SCRAPY_DEFAULT`` - don't do anything with slots. 232 | It is similar to the ``SINGLE_SLOT`` policy, but can be different if you access 233 | other services on the same address as Splash. 234 | 235 | * ``meta['splash']['dont_process_response']`` - when set to True, 236 | SplashMiddleware won't change the response to a custom scrapy.Response 237 | subclass.
By default for Splash requests one of SplashResponse, 238 | SplashTextResponse or SplashJsonResponse is passed to the callback. 239 | 240 | * ``meta['splash']['dont_send_headers']``: by default scrapy-splash passes 241 | request headers to Splash in 'headers' JSON POST field. For all render.xxx 242 | endpoints it means Scrapy header options are respected by default 243 | (http://splash.readthedocs.org/en/stable/api.html#arg-headers). In Lua 244 | scripts you can use ``headers`` argument of ``splash:go`` to apply the 245 | passed headers: ``splash:go{url, headers=splash.args.headers}``. 246 | 247 | Set 'dont_send_headers' to True if you don't want to pass ``headers`` 248 | to Splash. 249 | 250 | * ``meta['splash']['http_status_from_error_code']`` - set response.status 251 | to HTTP error code when ``assert(splash:go(..))`` fails; it requires 252 | ``meta['splash']['magic_response']=True``. ``http_status_from_error_code`` 253 | option is False by default if you use raw meta API; 254 | SplashRequest sets it to True by default. 255 | 256 | * ``meta['splash']['magic_response']`` - when set to True and a JSON 257 | response is received from Splash, several attributes of the response 258 | (headers, body, url, status code) are filled using data returned in JSON: 259 | 260 | * response.headers are filled from 'headers' keys; 261 | * response.url is set to the value of 'url' key; 262 | * response.body is set to the value of 'html' key, 263 | or to base64-decoded value of 'body' key; 264 | * response.status is set to the value of 'http_status' key. 265 | When ``meta['splash']['http_status_from_error_code']`` is True 266 | and ``assert(splash:go(..))`` fails with an HTTP error 267 | response.status is also set to HTTP error code. 268 | 269 | Original URL, status and headers are available as ``response.real_url``, 270 | ``response.splash_response_status`` and ``response.splash_response_headers``. 271 | 272 | This option is set to True by default if you use SplashRequest. 273 | ``render.json`` and ``execute`` endpoints may not have all the necessary 274 | keys/values in the response. 275 | For non-JSON endpoints, only url is filled, regardless of the 276 | ``magic_response`` setting. 277 | 278 | 279 | Use ``scrapy_splash.SplashFormRequest`` if you want to make a ``FormRequest`` 280 | via splash. It accepts the same arguments as ``SplashRequest``, 281 | and also ``formdata``, like ``FormRequest`` from scrapy:: 282 | 283 | >>> from scrapy_splash import SplashFormRequest 284 | >>> SplashFormRequest('http://example.com', formdata={'foo': 'bar'}) 285 | 286 | 287 | ``SplashFormRequest.from_response`` is also supported, and works as described 288 | in `scrapy documentation `_. 289 | 290 | Responses 291 | --------- 292 | 293 | scrapy-splash returns Response subclasses for Splash requests: 294 | 295 | * SplashResponse is returned for binary Splash responses - e.g. for 296 | /render.png responses; 297 | * SplashTextResponse is returned when the result is text - e.g. for 298 | /render.html responses; 299 | * SplashJsonResponse is returned when the result is a JSON object - e.g. 300 | for /render.json responses or /execute responses when script returns 301 | a Lua table. 302 | 303 | To use standard Response classes set ``meta['splash']['dont_process_response']=True`` 304 | or pass ``dont_process_response=True`` argument to SplashRequest. 305 | 306 | All these responses set ``response.url`` to the URL of the original request 307 | (i.e. 
to the URL of a website you want to render), not to the URL of the 308 | requested Splash endpoint. The "true" URL is still available as 309 | ``response.real_url``. 310 | 311 | SplashJsonResponse provides extra features: 312 | 313 | * the ``response.data`` attribute contains response data decoded from JSON; 314 | you can access it like ``response.data['html']``. 315 | 316 | * If Splash session handling is configured, you can access current cookies 317 | as ``response.cookiejar``; it is a CookieJar instance. 318 | 319 | * If Scrapy-Splash response magic is enabled for the request (the default), 320 | several response attributes (headers, body, url, status code) 321 | are set automatically from the original response body: 322 | 323 | * response.headers are filled from 'headers' keys; 324 | * response.url is set to the value of 'url' key; 325 | * response.body is set to the value of 'html' key, 326 | or to base64-decoded value of 'body' key; 327 | * response.status is set from the value of 'http_status' key. 328 | 329 | When ``response.body`` is updated in SplashJsonResponse 330 | (either from 'html' or from 'body' keys) the familiar ``response.css`` 331 | and ``response.xpath`` methods are available. 332 | 333 | To turn off special handling of JSON result keys either set 334 | ``meta['splash']['magic_response']=False`` or pass the ``magic_response=False`` 335 | argument to SplashRequest. 336 | 337 | Session Handling 338 | ================ 339 | 340 | Splash itself is stateless - each request starts from a clean state. 341 | In order to support sessions, the following is required: 342 | 343 | 1. client (Scrapy) must send current cookies to Splash; 344 | 2. Splash script should make requests using these cookies and update 345 | them from HTTP response headers or JavaScript code; 346 | 3. updated cookies should be sent back to the client; 347 | 4. client should merge current cookies with the updated cookies. 348 | 349 | For (2) and (3) Splash provides ``splash:get_cookies()`` and 350 | ``splash:init_cookies()`` methods which can be used in Splash Lua scripts. 351 | 352 | scrapy-splash provides helpers for (1) and (4): to send current cookies 353 | in the 'cookies' field and merge cookies back from the 'cookies' response field, 354 | set ``request.meta['splash']['session_id']`` to the session 355 | identifier. If you only want a single session, use the same ``session_id`` for 356 | all requests; any value like '1' or 'foo' is fine. 357 | 358 | For scrapy-splash session handling to work you must use the ``/execute`` endpoint 359 | and a Lua script which accepts a 'cookies' argument and returns a 'cookies' 360 | field in the result: 361 | 362 | .. code:: lua 363 | 364 | function main(splash) 365 | splash:init_cookies(splash.args.cookies) 366 | 367 | -- ... your script 368 | 369 | return { 370 | cookies = splash:get_cookies(), 371 | -- ... other results, e.g. html 372 | } 373 | end 374 | 375 | SplashRequest sets ``session_id`` automatically for the ``/execute`` endpoint, 376 | i.e. cookie handling is enabled by default if you use SplashRequest, 377 | the ``/execute`` endpoint and a compatible Lua rendering script. 378 | 379 | If you want to start from the same set of cookies, but then 'fork' sessions, 380 | set ``request.meta['splash']['new_session_id']`` in addition to 381 | ``session_id``. Request cookies will be fetched from the ``session_id`` cookiejar, 382 | but response cookies will be merged back into the ``new_session_id`` cookiejar.
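For illustration, here is a minimal sketch of such a session 'fork' using the raw ``meta['splash']`` API described above; it assumes ``script`` holds a cookie-aware Lua script like the one shown earlier, and the callback name and session identifiers ('1', 'fork-1') are arbitrary placeholders:

.. code:: python

    # A sketch: request cookies are read from the '1' cookiejar, while cookies
    # returned by Splash are merged back into the 'fork-1' cookiejar.
    yield scrapy.Request(url, self.parse_result, meta={
        'splash': {
            'endpoint': 'execute',
            'args': {'lua_source': script},  # assumed: accepts/returns 'cookies'
            'session_id': '1',
            'new_session_id': 'fork-1',
        }
    })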
383 | 384 | The standard Scrapy ``cookies`` argument can be used with ``SplashRequest`` 385 | to add cookies to the current Splash cookiejar. 386 | 387 | Examples 388 | ======== 389 | 390 | Get HTML contents: 391 | 392 | .. code:: python 393 | 394 | import scrapy 395 | from scrapy_splash import SplashRequest 396 | 397 | class MySpider(scrapy.Spider): 398 | name = "MySpider" 399 | start_urls = ["http://example.com", "http://example.com/foo"] 400 | 401 | def start_requests(self): 402 | for url in self.start_urls: 403 | yield SplashRequest(url, self.parse, args={'wait': 0.5}) 404 | 405 | def parse(self, response): 406 | # response.body is a result of render.html call; it 407 | # contains HTML processed by a browser. 408 | # ... 409 | 410 | Get HTML contents and a screenshot: 411 | 412 | .. code:: python 413 | 414 | import json 415 | import base64 416 | import scrapy 417 | from scrapy_splash import SplashRequest 418 | 419 | class MySpider(scrapy.Spider): 420 | 421 | # ... 422 | splash_args = { 423 | 'wait': 1, 424 | 'html': 1, 425 | 'png': 1, 426 | 'width': 600, 427 | 'render_all': 1, 428 | } 429 | yield SplashRequest(url, self.parse_result, endpoint='render.json', 430 | args=splash_args) 431 | 432 | # ... 433 | def parse_result(self, response): 434 | # magic responses are turned ON by default, 435 | # so the result under 'html' key is available as response.body 436 | html = response.body 437 | 438 | # you can also query the html result as usual 439 | title = response.css('title').extract_first() 440 | 441 | # full decoded JSON data is available as response.data: 442 | png_bytes = base64.b64decode(response.data['png']) 443 | 444 | # ... 445 | 446 | Run a simple `Splash Lua Script`_: 447 | 448 | .. code:: python 449 | 450 | import json 451 | import base64 452 | from scrapy_splash import SplashRequest 453 | 454 | 455 | class MySpider(scrapy.Spider): 456 | 457 | # ... 458 | script = """ 459 | function main(splash) 460 | assert(splash:go(splash.args.url)) 461 | return splash:evaljs("document.title") 462 | end 463 | """ 464 | yield SplashRequest(url, self.parse_result, endpoint='execute', 465 | args={'lua_source': script}) 466 | 467 | # ... 468 | def parse_result(self, response): 469 | doc_title = response.text 470 | # ... 471 | 472 | 473 | A more complex `Splash Lua Script`_ example - get a screenshot of an HTML 474 | element by its CSS selector (it requires Splash 2.1+). 475 | Note how arguments are passed to the script: 476 | 477 | .. code:: python 478 | 479 | import json 480 | import base64 481 | from scrapy_splash import SplashRequest 482 | 483 | script = """ 484 | -- Arguments: 485 | -- * url - URL to render; 486 | -- * css - CSS selector to render; 487 | -- * pad - screenshot padding size.
488 | 489 | -- this function adds padding around region 490 | function pad(r, pad) 491 | return {r[1]-pad, r[2]-pad, r[3]+pad, r[4]+pad} 492 | end 493 | 494 | -- main script 495 | function main(splash) 496 | 497 | -- this function returns element bounding box 498 | local get_bbox = splash:jsfunc([[ 499 | function(css) { 500 | var el = document.querySelector(css); 501 | var r = el.getBoundingClientRect(); 502 | return [r.left, r.top, r.right, r.bottom]; 503 | } 504 | ]]) 505 | 506 | assert(splash:go(splash.args.url)) 507 | assert(splash:wait(0.5)) 508 | 509 | -- don't crop image by a viewport 510 | splash:set_viewport_full() 511 | 512 | local region = pad(get_bbox(splash.args.css), splash.args.pad) 513 | return splash:png{region=region} 514 | end 515 | """ 516 | 517 | class MySpider(scrapy.Spider): 518 | 519 | 520 | # ... 521 | yield SplashRequest(url, self.parse_element_screenshot, 522 | endpoint='execute', 523 | args={ 524 | 'lua_source': script, 525 | 'pad': 32, 526 | 'css': 'a.title' 527 | } 528 | ) 529 | 530 | # ... 531 | def parse_element_screenshot(self, response): 532 | image_data = response.body # binary image data in PNG format 533 | # ... 534 | 535 | 536 | Use a Lua script to get an HTML response with cookies, headers, body 537 | and method set to correct values; ``lua_source`` argument value is cached 538 | on Splash server and is not sent with each request (it requires Splash 2.1+): 539 | 540 | .. code:: python 541 | 542 | import scrapy 543 | from scrapy_splash import SplashRequest 544 | 545 | script = """ 546 | function main(splash) 547 | splash:init_cookies(splash.args.cookies) 548 | assert(splash:go{ 549 | splash.args.url, 550 | headers=splash.args.headers, 551 | http_method=splash.args.http_method, 552 | body=splash.args.body, 553 | }) 554 | assert(splash:wait(0.5)) 555 | 556 | local entries = splash:history() 557 | local last_response = entries[#entries].response 558 | return { 559 | url = splash:url(), 560 | headers = last_response.headers, 561 | http_status = last_response.status, 562 | cookies = splash:get_cookies(), 563 | html = splash:html(), 564 | } 565 | end 566 | """ 567 | 568 | class MySpider(scrapy.Spider): 569 | 570 | 571 | # ... 572 | yield SplashRequest(url, self.parse_result, 573 | endpoint='execute', 574 | cache_args=['lua_source'], 575 | args={'lua_source': script}, 576 | headers={'X-My-Header': 'value'}, 577 | ) 578 | 579 | def parse_result(self, response): 580 | # here response.body contains result HTML; 581 | # response.headers are filled with headers from last 582 | # web page loaded to Splash; 583 | # cookies from all responses and from JavaScript are collected 584 | # and put into Set-Cookie response header, so that Scrapy 585 | # can remember them. 586 | 587 | 588 | 589 | .. _Splash Lua Script: http://splash.readthedocs.org/en/latest/scripting-tutorial.html 590 | 591 | 592 | HTTP Basic Auth 593 | =============== 594 | 595 | If you need to use HTTP Basic Authentication to access Splash, use the 596 | ``SPLASH_USER`` and ``SPLASH_PASS`` optional settings:: 597 | 598 | SPLASH_USER = 'user' 599 | SPLASH_PASS = 'userpass' 600 | 601 | Another option is ``meta['splash']['splash_headers']``: it allows to set 602 | custom headers which are sent to Splash server; add Authorization header 603 | to ``splash_headers`` if you want to change credentials per-request:: 604 | 605 | import scrapy 606 | from w3lib.http import basic_auth_header 607 | 608 | class MySpider(scrapy.Spider): 609 | # ... 
610 | def start_requests(self): 611 | auth = basic_auth_header('user', 'userpass') 612 | yield SplashRequest(url, self.parse, 613 | splash_headers={'Authorization': auth}) 614 | 615 | **WARNING:** Don't use `HttpAuthMiddleware`_ 616 | (i.e. ``http_user`` / ``http_pass`` spider attributes) for Splash 617 | authentication: if you ever send a non-Splash request from your spider, 618 | you may expose Splash credentials to a remote website, as HttpAuthMiddleware 619 | sets credentials for all requests unconditionally. 620 | 621 | .. _HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth 622 | 623 | Why not use the Splash HTTP API directly? 624 | ========================================= 625 | 626 | The obvious alternative to scrapy-splash would be to send requests directly 627 | to the Splash `HTTP API`_. Take a look at the example below and make 628 | sure to read the observations after it: 629 | 630 | .. code:: python 631 | 632 | import json 633 | 634 | import scrapy 635 | from scrapy.http.headers import Headers 636 | 637 | RENDER_HTML_URL = "http://127.0.0.1:8050/render.html" 638 | 639 | class MySpider(scrapy.Spider): 640 | start_urls = ["http://example.com", "http://example.com/foo"] 641 | 642 | def start_requests(self): 643 | for url in self.start_urls: 644 | body = json.dumps({"url": url, "wait": 0.5}, sort_keys=True) 645 | headers = Headers({'Content-Type': 'application/json'}) 646 | yield scrapy.Request(RENDER_HTML_URL, self.parse, method="POST", 647 | body=body, headers=headers) 648 | 649 | def parse(self, response): 650 | # response.body is a result of render.html call; it 651 | # contains HTML processed by a browser. 652 | # ... 653 | 654 | 655 | It works and is easy enough, but there are some issues that you should be 656 | aware of: 657 | 658 | 1. There is a bit of boilerplate. 659 | 660 | 2. As seen by Scrapy, we're sending requests to ``RENDER_HTML_URL`` instead 661 | of the target URLs. It affects concurrency and politeness settings: 662 | ``CONCURRENT_REQUESTS_PER_DOMAIN``, ``DOWNLOAD_DELAY``, etc. could behave 663 | in unexpected ways since delays and concurrency settings are no longer 664 | per-domain. 665 | 666 | 3. As seen by Scrapy, response.url is the URL of the Splash server. 667 | scrapy-splash fixes it to be the URL of the requested page. 668 | The "real" URL is still available as ``response.real_url``. scrapy-splash also 669 | makes it possible to handle ``response.status`` and ``response.headers`` transparently 670 | on the Scrapy side. 671 | 672 | 4. Some options depend on each other - for example, if you use the timeout_ 673 | Splash option then you may want to set the ``download_timeout`` 674 | scrapy.Request meta key as well. 675 | 676 | 5. It is easy to get it subtly wrong - e.g. if you don't use the 677 | ``sort_keys=True`` argument when preparing the JSON body, the binary POST body 678 | content could vary even if all keys and values are the same, which means 679 | the dupefilter and cache will work incorrectly. 680 | 681 | 6. The default Scrapy duplicate filter doesn't take Splash specifics into 682 | account. For example, if a URL is sent in a JSON POST request body, 683 | Scrapy will compute the request fingerprint without canonicalizing this URL. 684 | 685 | 7. Splash Bad Request (HTTP 400) errors are hard to debug because by default 686 | response content is not displayed by Scrapy. SplashMiddleware logs the content
SplashMiddleware logs content 687 | of HTTP 400 Splash responses by default (it can be turned off by setting 688 | ``SPLASH_LOG_400 = False`` option). 689 | 690 | 8. Cookie handling is tedious to implement, and you can't use Scrapy 691 | built-in Cookie middleware to handle cookies when working with Splash. 692 | 693 | 9. Large Splash arguments which don't change with every request 694 | (e.g. ``lua_source``) may take a lot of space when saved to Scrapy disk 695 | request queues. ``scrapy-splash`` provides a way to store such static 696 | parameters only once. 697 | 698 | 10. Splash 2.1+ provides a way to save network traffic by caching large 699 | static arguments on server, but it requires client support: client should 700 | send proper ``save_args`` and ``load_args`` values and handle HTTP 498 701 | responses. 702 | 703 | scrapy-splash utilities allow to handle such edge cases and reduce 704 | the boilerplate. 705 | 706 | .. _HTTP API: http://splash.readthedocs.org/en/latest/api.html 707 | .. _timeout: http://splash.readthedocs.org/en/latest/api.html#arg-timeout 708 | 709 | 710 | Getting help 711 | ============ 712 | 713 | * for problems with rendering pages read "`Splash FAQ`_" page 714 | * for Scrapy-related bugs take a look at "`reporting Scrapy bugs`_" page 715 | 716 | Best approach to get any other help is to ask a question on `Stack Overflow`_ 717 | 718 | .. _reporting Scrapy bugs: https://doc.scrapy.org/en/master/contributing.html#reporting-bugs 719 | .. _Splash FAQ: http://splash.readthedocs.io/en/stable/faq.html#website-is-not-rendered-correctly 720 | .. _Stack Overflow: https://stackoverflow.com/questions/tagged/scrapy-splash?sort=frequent&pageSize=15&mixed=1 721 | 722 | 723 | Contributing 724 | ============ 725 | 726 | Source code and bug tracker are on github: 727 | https://github.com/scrapy-plugins/scrapy-splash 728 | 729 | To run tests, install "tox" Python package and then run ``tox`` command 730 | from the source checkout. 
731 | 732 | To run integration tests, start Splash and set SPLASH_URL env variable 733 | to Splash address before running ``tox`` command:: 734 | 735 | docker run -d --rm -p8050:8050 scrapinghub/splash:3.0 736 | SPLASH_URL=http://127.0.0.1:8050 tox -e py36 737 | -------------------------------------------------------------------------------- /example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = scrashtest.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scrashtest 12 | -------------------------------------------------------------------------------- /example/scrashtest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-splash/72a8788212746b938e1e4d45aad56ff27857924a/example/scrashtest/__init__.py -------------------------------------------------------------------------------- /example/scrashtest/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'scrashtest' 4 | 5 | SPIDER_MODULES = ['scrashtest.spiders'] 6 | NEWSPIDER_MODULE = 'scrashtest.spiders' 7 | 8 | DOWNLOADER_MIDDLEWARES = { 9 | # Engine side 10 | 'scrapy_splash.SplashCookiesMiddleware': 723, 11 | 'scrapy_splash.SplashMiddleware': 725, 12 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 13 | # Downloader side 14 | } 15 | 16 | SPIDER_MIDDLEWARES = { 17 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 18 | } 19 | SPLASH_URL = 'http://127.0.0.1:8050/' 20 | # SPLASH_URL = 'http://192.168.59.103:8050/' 21 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 22 | HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' 23 | ROBOTSTXT_OBEY = True -------------------------------------------------------------------------------- /example/scrashtest/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-splash/72a8788212746b938e1e4d45aad56ff27857924a/example/scrashtest/spiders/__init__.py -------------------------------------------------------------------------------- /example/scrashtest/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | 5 | from scrapy_splash import SplashRequest 6 | 7 | 8 | class QuotesSpider(scrapy.Spider): 9 | name = "quotes" 10 | allowed_domains = ["toscrape.com"] 11 | start_urls = ['http://quotes.toscrape.com/'] 12 | 13 | #custom_settings = { 14 | #'SPLASH_USER': 'splash-user', 15 | #'SPLASH_PASS': 'splash-password', 16 | #} 17 | 18 | def parse(self, response): 19 | le = LinkExtractor() 20 | for link in le.extract_links(response): 21 | yield SplashRequest( 22 | link.url, 23 | self.parse_link, 24 | endpoint='render.json', 25 | args={ 26 | 'har': 1, 27 | 'html': 1, 28 | } 29 | ) 30 | 31 | def parse_link(self, response): 32 | print("PARSED", response.real_url, response.url) 33 | print(response.css("title").extract()) 34 | print(response.data["har"]["log"]["pages"]) 35 | print(response.headers.get('Content-Type')) 36 | 
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.bumpversion] 2 | current_version = "0.11.1" 3 | commit = true 4 | tag = true 5 | tag_name = "{new_version}" 6 | 7 | [[tool.bumpversion.files]] 8 | filename = 'CHANGES.rst' 9 | search = "\\(unreleased\\)$" 10 | replace = "({now:%Y-%m-%d})" 11 | regex = true 12 | 13 | [[tool.bumpversion.files]] 14 | filename = "setup.py" 15 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | doctest_optionflags = NORMALIZE_WHITESPACE ALLOW_UNICODE 3 | -------------------------------------------------------------------------------- /scrapy_splash/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | from .middleware import ( 5 | SplashMiddleware, 6 | SplashCookiesMiddleware, 7 | SplashDeduplicateArgsMiddleware, 8 | SlotPolicy, 9 | ) 10 | from .dupefilter import SplashAwareDupeFilter, splash_request_fingerprint 11 | from .cache import SplashAwareFSCacheStorage 12 | from .response import SplashResponse, SplashTextResponse, SplashJsonResponse 13 | from .request import SplashRequest, SplashFormRequest, SplashRequestFingerprinter 14 | -------------------------------------------------------------------------------- /scrapy_splash/cache.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To handle "splash" Request meta key correctly when HTTP cache is enabled 4 | Scrapy needs a custom caching backed. 5 | 6 | See https://github.com/scrapy/scrapy/issues/900 for more info. 7 | """ 8 | from __future__ import absolute_import 9 | import os 10 | from warnings import warn 11 | 12 | from scrapy.extensions.httpcache import FilesystemCacheStorage 13 | 14 | from .dupefilter import splash_request_fingerprint 15 | 16 | 17 | class SplashAwareFSCacheStorage(FilesystemCacheStorage): 18 | def __init__(self, settings): 19 | warn( 20 | ( 21 | "scrapy-splash.SplashAwareFSCacheStorage is deprecated. Set " 22 | "the REQUEST_FINGERPRINTER_CLASS Scrapy setting to " 23 | "\"scrapy_splash.SplashRequestFingerprinter\" instead." 24 | ), 25 | DeprecationWarning, 26 | stacklevel=2, 27 | ) 28 | super().__init__(settings) 29 | 30 | def _get_request_path(self, spider, request): 31 | key = splash_request_fingerprint(request) 32 | return os.path.join(self.cachedir, spider.name, key[0:2], key) 33 | -------------------------------------------------------------------------------- /scrapy_splash/cookies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Cookie-related utilities. 4 | """ 5 | from __future__ import absolute_import 6 | import time 7 | import calendar 8 | 9 | from six.moves.http_cookiejar import CookieJar, Cookie 10 | 11 | 12 | def jar_to_har(cookiejar): 13 | """ Convert CookieJar to HAR cookies format """ 14 | return [cookie_to_har(c) for c in cookiejar] 15 | 16 | 17 | def har_to_jar(cookiejar, har_cookies, request_cookies=None): 18 | """ Add HAR cookies to the cookiejar. 19 | If request_cookies is given, remove cookies absent from har_cookies 20 | but present in request_cookies (they were removed). 
""" 21 | har_cookie_keys = set() 22 | for c in har_cookies: 23 | cookie = har_to_cookie(c) 24 | cookiejar.set_cookie(cookie) 25 | har_cookie_keys.add(_cookie_key(cookie)) 26 | if request_cookies: 27 | for c in request_cookies: 28 | cookie = har_to_cookie(c) 29 | if _cookie_key(cookie) not in har_cookie_keys: 30 | # We sent it but it did not come back: remove it 31 | try: 32 | cookiejar.clear(cookie.domain, cookie.path, cookie.name) 33 | except KeyError: 34 | pass # It could have been already removed 35 | 36 | 37 | def _cookie_key(cookie): 38 | return (cookie.domain, cookie.path, cookie.name) 39 | 40 | 41 | def har_to_cookie(har_cookie): 42 | """ 43 | Convert a cookie dict in HAR format to a Cookie instance. 44 | 45 | >>> har_cookie = { 46 | ... "name": "TestCookie", 47 | ... "value": "Cookie Value", 48 | ... "path": "/foo", 49 | ... "domain": "www.janodvarko.cz", 50 | ... "expires": "2009-07-24T19:20:30Z", 51 | ... "httpOnly": True, 52 | ... "secure": True, 53 | ... "comment": "this is a test" 54 | ... } 55 | >>> cookie = har_to_cookie(har_cookie) 56 | >>> cookie.name 57 | 'TestCookie' 58 | >>> cookie.value 59 | 'Cookie Value' 60 | >>> cookie.port 61 | >>> cookie.domain 62 | 'www.janodvarko.cz' 63 | >>> cookie.path 64 | '/foo' 65 | >>> cookie.secure 66 | True 67 | >>> cookie.expires 68 | 1248463230 69 | >>> cookie.comment 70 | 'this is a test' 71 | >>> cookie.get_nonstandard_attr('HttpOnly') 72 | True 73 | """ 74 | 75 | expires_timestamp = None 76 | if har_cookie.get('expires'): 77 | expires = time.strptime(har_cookie['expires'], "%Y-%m-%dT%H:%M:%SZ") 78 | expires_timestamp = calendar.timegm(expires) 79 | 80 | kwargs = dict( 81 | version=har_cookie.get('version') or 0, 82 | name=har_cookie['name'], 83 | value=har_cookie['value'], 84 | port=None, 85 | domain=har_cookie.get('domain', ''), 86 | path=har_cookie.get('path', '/'), 87 | secure=har_cookie.get('secure', False), 88 | expires=expires_timestamp, 89 | discard=False, 90 | comment=har_cookie.get('comment'), 91 | comment_url=bool(har_cookie.get('comment')), 92 | rest={'HttpOnly': har_cookie.get('httpOnly')}, 93 | rfc2109=False, 94 | ) 95 | kwargs['port_specified'] = bool(kwargs['port']) 96 | kwargs['domain_specified'] = bool(kwargs['domain']) 97 | kwargs['domain_initial_dot'] = kwargs['domain'].startswith('.') 98 | kwargs['path_specified'] = bool(kwargs['path']) 99 | return Cookie(**kwargs) 100 | 101 | 102 | def cookie_to_har(cookie): 103 | """ 104 | Convert a Cookie instance to a dict in HAR cookie format. 105 | """ 106 | c = { 107 | 'name': cookie.name, 108 | 'value': cookie.value, 109 | 'secure': cookie.secure, 110 | } 111 | if cookie.path_specified: 112 | c['path'] = cookie.path 113 | 114 | if cookie.domain_specified: 115 | c['domain'] = cookie.domain 116 | 117 | if cookie.expires: 118 | tm = time.gmtime(cookie.expires) 119 | c['expires'] = time.strftime("%Y-%m-%dT%H:%M:%SZ", tm) 120 | 121 | http_only = cookie.get_nonstandard_attr('HttpOnly') 122 | if http_only is not None: 123 | c['httpOnly'] = bool(http_only) 124 | 125 | if cookie.comment: 126 | c['comment'] = cookie.comment 127 | 128 | return c 129 | -------------------------------------------------------------------------------- /scrapy_splash/dupefilter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To handle "splash" Request meta key properly a custom DupeFilter must be set. 4 | See https://github.com/scrapy/scrapy/issues/900 for more info. 
5 | """ 6 | from __future__ import absolute_import, annotations 7 | from copy import deepcopy 8 | import hashlib 9 | from weakref import WeakKeyDictionary 10 | from warnings import warn 11 | 12 | from scrapy.dupefilters import RFPDupeFilter 13 | 14 | from scrapy.utils.python import to_bytes 15 | from scrapy.utils.url import canonicalize_url 16 | from scrapy.utils.request import RequestFingerprinterProtocol 17 | 18 | from .utils import dict_hash 19 | 20 | 21 | _deprecated_fingerprint_cache = WeakKeyDictionary() 22 | 23 | 24 | def _serialize_headers( 25 | headers, request 26 | ): 27 | for header in headers: 28 | if header in request.headers: 29 | yield header 30 | for value in request.headers.getlist(header): 31 | yield value 32 | 33 | 34 | # From https://docs.scrapy.org/en/2.11/_modules/scrapy/utils/request.html 35 | # Needs to be added here since it was deleted in Scrapy 2.12 36 | def request_fingerprint( 37 | request, 38 | include_headers=None, 39 | keep_fragments=False, 40 | ): 41 | """ 42 | Return the request fingerprint as a hexadecimal string. 43 | 44 | The request fingerprint is a hash that uniquely identifies the resource the 45 | request points to. For example, take the following two urls: 46 | 47 | http://www.example.com/query?id=111&cat=222 48 | http://www.example.com/query?cat=222&id=111 49 | 50 | Even though those are two different URLs, both point to the same resource 51 | and are equivalent (i.e. they should return the same response). 52 | 53 | Another example is cookies used to store session ids. Suppose the 54 | following page is only accessible to authenticated users: 55 | 56 | http://www.example.com/members/offers.html 57 | 58 | Lots of sites use a cookie to store the session id, which adds a random 59 | component to the HTTP Request and thus should be ignored when calculating 60 | the fingerprint. 61 | 62 | For this reason, request headers are ignored by default when calculating 63 | the fingerprint. If you want to include specific headers use the 64 | include_headers argument, which is a list of Request headers to include. 65 | 66 | Also, servers usually ignore fragments in urls when handling requests, 67 | so they are also ignored by default when calculating the fingerprint. 68 | If you want to include them, set the keep_fragments argument to True 69 | (for instance when handling requests with a headless browser). 70 | """ 71 | processed_include_headers = None 72 | if include_headers: 73 | processed_include_headers = tuple( 74 | to_bytes(h.lower()) for h in sorted(include_headers) 75 | ) 76 | cache = _deprecated_fingerprint_cache.setdefault(request, {}) 77 | cache_key = (processed_include_headers, keep_fragments) 78 | if cache_key not in cache: 79 | fp = hashlib.sha1() 80 | fp.update(to_bytes(request.method)) 81 | fp.update( 82 | to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)) 83 | ) 84 | fp.update(request.body or b"") 85 | if processed_include_headers: 86 | for part in _serialize_headers(processed_include_headers, request): 87 | fp.update(part) 88 | cache[cache_key] = fp.hexdigest() 89 | return cache[cache_key] 90 | 91 | 92 | def splash_request_fingerprint(request, include_headers=None): 93 | """ Request fingerprint which takes 'splash' meta key into account """ 94 | warn( 95 | ( 96 | "scrapy_splash.splash_request_fingerprint is deprecated. Set " 97 | "the REQUEST_FINGERPRINTER_CLASS Scrapy setting to " 98 | "\"scrapy_splash.SplashRequestFingerprinter\" instead."
99 | ), 100 | DeprecationWarning, 101 | stacklevel=2, 102 | ) 103 | 104 | fp = request_fingerprint(request, include_headers=include_headers) 105 | if 'splash' not in request.meta: 106 | return fp 107 | 108 | splash_options = deepcopy(request.meta['splash']) 109 | args = splash_options.setdefault('args', {}) 110 | 111 | if 'url' in args: 112 | args['url'] = canonicalize_url(args['url'], keep_fragments=True) 113 | 114 | return dict_hash(splash_options, fp) 115 | 116 | 117 | class SplashAwareDupeFilter(RFPDupeFilter): 118 | """ 119 | DupeFilter that takes 'splash' meta key in account. 120 | It should be used with SplashMiddleware. 121 | """ 122 | 123 | def __init__( 124 | self, 125 | path: str | None = None, 126 | debug: bool = False, 127 | *, 128 | fingerprinter: RequestFingerprinterProtocol | None = None 129 | ): 130 | warn( 131 | ( 132 | "SplashAwareDupeFilter is deprecated. Set " 133 | "the REQUEST_FINGERPRINTER_CLASS Scrapy setting to " 134 | "\"scrapy_splash.SplashRequestFingerprinter\" instead." 135 | ), 136 | DeprecationWarning, 137 | stacklevel=2, 138 | ) 139 | super().__init__(path, debug, fingerprinter=fingerprinter) 140 | 141 | def request_fingerprint(self, request): 142 | return splash_request_fingerprint(request) 143 | -------------------------------------------------------------------------------- /scrapy_splash/middleware.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import copy 5 | import json 6 | import logging 7 | import warnings 8 | from collections import defaultdict 9 | 10 | from six.moves.urllib.parse import urljoin 11 | from six.moves.http_cookiejar import CookieJar 12 | 13 | from w3lib.http import basic_auth_header 14 | import scrapy 15 | from scrapy.exceptions import NotConfigured, IgnoreRequest 16 | from scrapy.http.headers import Headers 17 | from scrapy.http.response.text import TextResponse 18 | from scrapy import signals 19 | from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware 20 | 21 | from scrapy_splash.responsetypes import responsetypes 22 | from scrapy_splash.cookies import jar_to_har, har_to_jar 23 | from scrapy_splash.utils import ( 24 | scrapy_headers_to_unicode_dict, 25 | json_based_hash, 26 | parse_x_splash_saved_arguments_header, 27 | ) 28 | from scrapy_splash.response import get_splash_status, get_splash_headers 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | class SlotPolicy(object): 35 | PER_DOMAIN = 'per_domain' 36 | SINGLE_SLOT = 'single_slot' 37 | SCRAPY_DEFAULT = 'scrapy_default' 38 | 39 | _known = {PER_DOMAIN, SINGLE_SLOT, SCRAPY_DEFAULT} 40 | 41 | 42 | class SplashCookiesMiddleware(object): 43 | """ 44 | This downloader middleware maintains cookiejars for Splash requests. 45 | 46 | It gets cookies from 'cookies' field in Splash JSON responses 47 | and sends current cookies in 'cookies' JSON POST argument instead of 48 | sending them in http headers. 49 | 50 | It should process requests before SplashMiddleware, and process responses 51 | after SplashMiddleware. 
52 | """ 53 | def __init__(self, debug=False): 54 | self.jars = defaultdict(CookieJar) 55 | self.debug = debug 56 | 57 | @classmethod 58 | def from_crawler(cls, crawler): 59 | return cls(debug=crawler.settings.getbool('SPLASH_COOKIES_DEBUG')) 60 | 61 | def process_request(self, request, spider): 62 | """ 63 | For Splash requests add 'cookies' key with current 64 | cookies to ``request.meta['splash']['args']`` and remove cookie 65 | headers sent to Splash itself. 66 | """ 67 | if 'splash' not in request.meta: 68 | return 69 | 70 | if request.meta.get('_splash_processed'): 71 | request.headers.pop('Cookie', None) 72 | return 73 | 74 | splash_options = request.meta['splash'] 75 | 76 | splash_args = splash_options.setdefault('args', {}) 77 | if 'cookies' in splash_args: # cookies already set 78 | return 79 | 80 | if 'session_id' not in splash_options: 81 | return 82 | 83 | jar = self.jars[splash_options['session_id']] 84 | 85 | cookies = self._get_request_cookies(request) 86 | har_to_jar(jar, cookies) 87 | 88 | splash_args['cookies'] = jar_to_har(jar) 89 | self._debug_cookie(request, spider) 90 | 91 | def process_response(self, request, response, spider): 92 | """ 93 | For Splash JSON responses add all cookies from 94 | 'cookies' in a response to the cookiejar. 95 | """ 96 | from scrapy_splash import SplashJsonResponse 97 | if not isinstance(response, SplashJsonResponse): 98 | return response 99 | 100 | if 'cookies' not in response.data: 101 | return response 102 | 103 | if 'splash' not in request.meta: 104 | return response 105 | 106 | if not request.meta.get('_splash_processed'): 107 | warnings.warn("SplashCookiesMiddleware requires SplashMiddleware") 108 | return response 109 | 110 | splash_options = request.meta['splash'] 111 | session_id = splash_options.get('new_session_id', 112 | splash_options.get('session_id')) 113 | if session_id is None: 114 | return response 115 | 116 | jar = self.jars[session_id] 117 | request_cookies = splash_options['args'].get('cookies', []) 118 | har_to_jar(jar, response.data['cookies'], request_cookies) 119 | self._debug_set_cookie(response, spider) 120 | response.cookiejar = jar 121 | return response 122 | 123 | def _get_request_cookies(self, request): 124 | if isinstance(request.cookies, dict): 125 | return [ 126 | {'name': k, 'value': v} for k, v in request.cookies.items() 127 | ] 128 | return request.cookies or [] 129 | 130 | def _debug_cookie(self, request, spider): 131 | if self.debug: 132 | cl = request.meta['splash']['args']['cookies'] 133 | if cl: 134 | cookies = '\n'.join( 135 | 'Cookie: {}'.format(self._har_repr(c)) for c in cl) 136 | msg = 'Sending cookies to: {}\n{}'.format(request, cookies) 137 | logger.debug(msg, extra={'spider': spider}) 138 | 139 | def _debug_set_cookie(self, response, spider): 140 | if self.debug: 141 | cl = response.data['cookies'] 142 | if cl: 143 | cookies = '\n'.join( 144 | 'Set-Cookie: {}'.format(self._har_repr(c)) for c in cl) 145 | msg = 'Received cookies from: {}\n{}'.format(response, cookies) 146 | logger.debug(msg, extra={'spider': spider}) 147 | 148 | @staticmethod 149 | def _har_repr(har_cookie): 150 | return '{}={}'.format(har_cookie['name'], har_cookie['value']) 151 | 152 | 153 | class SplashDeduplicateArgsMiddleware(object): 154 | """ 155 | Spider middleware which allows not to store duplicate Splash argument 156 | values in request queue. It works together with SplashMiddleware downloader 157 | middleware. 
158 | """ 159 | local_values_key = '_splash_local_values' 160 | 161 | def process_spider_output(self, response, result, spider): 162 | for el in result: 163 | if isinstance(el, scrapy.Request): 164 | yield self._process_request(el, spider) 165 | else: 166 | yield el 167 | 168 | def process_start_requests(self, start_requests, spider): 169 | if not hasattr(spider, 'state'): 170 | spider.state = {} 171 | spider.state.setdefault(self.local_values_key, {}) # fingerprint => value dict 172 | 173 | for req in start_requests: 174 | yield self._process_request(req, spider) 175 | 176 | def _process_request(self, request, spider): 177 | """ 178 | Replace requested meta['splash']['args'] values with their fingerprints. 179 | This allows to store values only once in request queue, which helps 180 | with disk queue size. 181 | 182 | Downloader middleware should restore the values from fingerprints. 183 | """ 184 | if 'splash' not in request.meta: 185 | return request 186 | 187 | if '_replaced_args' in request.meta['splash']: 188 | # don't process re-scheduled requests 189 | # XXX: does it work as expected? 190 | warnings.warn("Unexpected request.meta['splash']['_replaced_args']") 191 | return request 192 | 193 | request.meta['splash']['_replaced_args'] = [] 194 | cache_args = request.meta['splash'].get('cache_args', []) 195 | args = request.meta['splash'].setdefault('args', {}) 196 | 197 | for name in cache_args: 198 | if name not in args: 199 | continue 200 | value = args[name] 201 | fp = 'LOCAL+' + json_based_hash(value) 202 | spider.state[self.local_values_key][fp] = value 203 | args[name] = fp 204 | request.meta['splash']['_replaced_args'].append(name) 205 | 206 | return request 207 | 208 | 209 | class SplashMiddleware(object): 210 | """ 211 | Scrapy downloader and spider middleware that passes requests 212 | through Splash when 'splash' Request.meta key is set. 213 | 214 | This middleware also works together with SplashDeduplicateArgsMiddleware 215 | spider middleware to allow not to store duplicate Splash argument values 216 | in request queue and not to send them multiple times to Splash 217 | (the latter requires Splash 2.1+). 
218 | """ 219 | default_splash_url = 'http://127.0.0.1:8050' 220 | default_endpoint = "render.json" 221 | splash_extra_timeout = 5.0 222 | default_policy = SlotPolicy.PER_DOMAIN 223 | rescheduling_priority_adjust = +100 224 | retry_498_priority_adjust = +50 225 | remote_keys_key = '_splash_remote_keys' 226 | 227 | def __init__(self, crawler, splash_base_url, slot_policy, log_400, auth): 228 | self.crawler = crawler 229 | self.splash_base_url = splash_base_url 230 | self.slot_policy = slot_policy 231 | self.log_400 = log_400 232 | self.crawler.signals.connect(self.spider_opened, signals.spider_opened) 233 | self.auth = auth 234 | 235 | @classmethod 236 | def from_crawler(cls, crawler): 237 | s = crawler.settings 238 | splash_base_url = s.get('SPLASH_URL', cls.default_splash_url) 239 | log_400 = s.getbool('SPLASH_LOG_400', True) 240 | slot_policy = s.get('SPLASH_SLOT_POLICY', cls.default_policy) 241 | if slot_policy not in SlotPolicy._known: 242 | raise NotConfigured("Incorrect slot policy: %r" % slot_policy) 243 | 244 | splash_user = s.get('SPLASH_USER', '') 245 | splash_pass = s.get('SPLASH_PASS', '') 246 | auth = None 247 | if splash_user or splash_pass: 248 | auth = basic_auth_header(splash_user, splash_pass) 249 | return cls(crawler, splash_base_url, slot_policy, log_400, auth) 250 | 251 | def spider_opened(self, spider): 252 | if _http_auth_enabled(spider): 253 | replace_downloader_middleware(self.crawler, RobotsTxtMiddleware, 254 | SafeRobotsTxtMiddleware) 255 | if not hasattr(spider, 'state'): 256 | spider.state = {} 257 | 258 | # local fingerprint => key returned by splash 259 | spider.state.setdefault(self.remote_keys_key, {}) 260 | 261 | @property 262 | def _argument_values(self): 263 | key = SplashDeduplicateArgsMiddleware.local_values_key 264 | return self.crawler.spider.state[key] 265 | 266 | @property 267 | def _remote_keys(self): 268 | return self.crawler.spider.state[self.remote_keys_key] 269 | 270 | def process_request(self, request, spider): 271 | if 'splash' not in request.meta: 272 | return 273 | splash_options = request.meta['splash'] 274 | 275 | if request.method not in {'GET', 'POST'}: 276 | logger.error( 277 | "Currently only GET and POST requests are supported by " 278 | "SplashMiddleware; %(request)s is dropped", 279 | {'request': request}, 280 | extra={'spider': spider} 281 | ) 282 | self.crawler.stats.inc_value('splash/dropped/method/{}'.format( 283 | request.method)) 284 | raise IgnoreRequest("SplashRequest doesn't support " 285 | "HTTP {} method".format(request.method)) 286 | 287 | if request.meta.get("_splash_processed"): 288 | # don't process the same request more than once 289 | return 290 | 291 | request.meta['_splash_processed'] = True 292 | 293 | slot_policy = splash_options.get('slot_policy', self.slot_policy) 294 | self._set_download_slot(request, request.meta, slot_policy) 295 | 296 | args = splash_options.setdefault('args', {}) 297 | 298 | if '_replaced_args' in splash_options: 299 | # restore arguments before sending request to the downloader 300 | load_args = {} 301 | save_args = [] 302 | local_arg_fingerprints = {} 303 | for name in splash_options['_replaced_args']: 304 | fp = args[name] 305 | # Use remote Splash argument cache: if Splash key 306 | # for a value is known then don't send the value to Splash; 307 | # if it is unknown then try to save the value on server using 308 | # ``save_args``. 
309 | if fp in self._remote_keys: 310 | load_args[name] = self._remote_keys[fp] 311 | del args[name] 312 | else: 313 | save_args.append(name) 314 | args[name] = self._argument_values[fp] 315 | 316 | local_arg_fingerprints[name] = fp 317 | 318 | if load_args: 319 | args['load_args'] = load_args 320 | if save_args: 321 | args['save_args'] = save_args 322 | splash_options['_local_arg_fingerprints'] = local_arg_fingerprints 323 | 324 | del splash_options['_replaced_args'] # ?? 325 | 326 | args.setdefault('url', request.url) 327 | if request.method == 'POST': 328 | args.setdefault('http_method', request.method) 329 | # XXX: non-UTF8 request bodies are not supported now 330 | args.setdefault('body', request.body.decode('utf8')) 331 | 332 | if not splash_options.get('dont_send_headers'): 333 | headers = scrapy_headers_to_unicode_dict(request.headers) 334 | if headers: 335 | # Headers set by HttpAuthMiddleware should be used for Splash, 336 | # not for the remote website (backwards compatibility). 337 | if _http_auth_enabled(spider): 338 | headers.pop('Authorization', None) 339 | args.setdefault('headers', headers) 340 | 341 | body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4) 342 | # print(body) 343 | 344 | if 'timeout' in args: 345 | # User requested a Splash timeout explicitly. 346 | # 347 | # We can't catch a case when user requested `download_timeout` 348 | # explicitly because a default value for `download_timeout` 349 | # is set by DownloadTimeoutMiddleware. 350 | # 351 | # As user requested Splash timeout explicitly, we shouldn't change 352 | # it. Another reason not to change the requested Splash timeout is 353 | # because it may cause a validation error on the remote end. 354 | # 355 | # But we can change Scrapy `download_timeout`: increase 356 | # it when it's too small. Decreasing `download_timeout` is not 357 | # safe. 
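            # For example, args['timeout'] == 90 gives timeout_expected == 95
            # (90 + splash_extra_timeout); download_timeout is only ever raised
            # to that value, never lowered.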
358 | 359 | timeout_requested = float(args['timeout']) 360 | timeout_expected = timeout_requested + self.splash_extra_timeout 361 | 362 | # no timeout means infinite timeout 363 | timeout_current = request.meta.get('download_timeout', 1e6) 364 | 365 | if timeout_expected > timeout_current: 366 | request.meta['download_timeout'] = timeout_expected 367 | 368 | endpoint = splash_options.setdefault('endpoint', self.default_endpoint) 369 | splash_base_url = splash_options.get('splash_url', self.splash_base_url) 370 | splash_url = urljoin(splash_base_url, endpoint) 371 | 372 | headers = Headers({'Content-Type': 'application/json'}) 373 | if self.auth is not None: 374 | headers['Authorization'] = self.auth 375 | headers.update(splash_options.get('splash_headers', {})) 376 | new_request = request.replace( 377 | url=splash_url, 378 | method='POST', 379 | body=body, 380 | headers=headers, 381 | priority=request.priority + self.rescheduling_priority_adjust 382 | ) 383 | new_request.meta['dont_obey_robotstxt'] = True 384 | self.crawler.stats.inc_value('splash/%s/request_count' % endpoint) 385 | return new_request 386 | 387 | def process_response(self, request, response, spider): 388 | if not request.meta.get("_splash_processed"): 389 | return response 390 | 391 | splash_options = request.meta['splash'] 392 | if not splash_options: 393 | return response 394 | 395 | # update stats 396 | endpoint = splash_options['endpoint'] 397 | self.crawler.stats.inc_value( 398 | 'splash/%s/response_count/%s' % (endpoint, response.status) 399 | ) 400 | 401 | # handle save_args/load_args 402 | self._process_x_splash_saved_arguments(request, response) 403 | if get_splash_status(response) == 498: 404 | logger.debug("Got HTTP 498 response for {}; " 405 | "sending arguments again.".format(request), 406 | extra={'spider': spider}) 407 | return self._498_retry_request(request, response) 408 | 409 | if splash_options.get('dont_process_response', False): 410 | return response 411 | 412 | response = self._change_response_class(request, response) 413 | 414 | if self.log_400 and get_splash_status(response) == 400: 415 | self._log_400(request, response, spider) 416 | 417 | return response 418 | 419 | def _change_response_class(self, request, response): 420 | from scrapy_splash import SplashResponse, SplashTextResponse 421 | if not isinstance(response, (SplashResponse, SplashTextResponse)): 422 | # create a custom Response subclass based on response Content-Type 423 | # XXX: usually request is assigned to response only when all 424 | # downloader middlewares are executed. Here it is set earlier. 425 | # Does it have any negative consequences? 426 | respcls = responsetypes.from_args(headers=response.headers) 427 | if isinstance(response, TextResponse) and respcls is SplashResponse: 428 | # Even if the headers say it's binary, it has already 429 | # been detected as a text response by scrapy (for example 430 | # because it was decoded successfully), so we should not 431 | # convert it to SplashResponse. 
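                # (SplashResponseTypes in responsetypes.py maps e.g. 'text/html'
                # to SplashTextResponse and 'application/json' to
                # SplashJsonResponse; anything unrecognised falls back to
                # SplashResponse.)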
432 | respcls = SplashTextResponse 433 | response = response.replace(cls=respcls, request=request) 434 | return response 435 | 436 | def _log_400(self, request, response, spider): 437 | from scrapy_splash import SplashJsonResponse 438 | if isinstance(response, SplashJsonResponse): 439 | logger.warning( 440 | "Bad request to Splash: %s" % response.data, 441 | {'request': request}, 442 | extra={'spider': spider} 443 | ) 444 | 445 | def _process_x_splash_saved_arguments(self, request, response): 446 | """ Keep track of arguments saved by Splash. """ 447 | saved_args = get_splash_headers(response).get(b'X-Splash-Saved-Arguments') 448 | if not saved_args: 449 | return 450 | saved_args = parse_x_splash_saved_arguments_header(saved_args) 451 | arg_fingerprints = request.meta['splash']['_local_arg_fingerprints'] 452 | for name, key in saved_args.items(): 453 | fp = arg_fingerprints[name] 454 | self._remote_keys[fp] = key 455 | 456 | def _498_retry_request(self, request, response): 457 | """ 458 | Return a retry request for HTTP 498 responses. HTTP 498 means 459 | load_args are not present on server; client should retry the request 460 | with full argument values instead of their hashes. 461 | """ 462 | meta = copy.deepcopy(request.meta) 463 | local_arg_fingerprints = meta['splash']['_local_arg_fingerprints'] 464 | args = meta['splash']['args'] 465 | args.pop('load_args', None) 466 | args['save_args'] = list(local_arg_fingerprints.keys()) 467 | 468 | for name, fp in local_arg_fingerprints.items(): 469 | args[name] = self._argument_values[fp] 470 | # print('remote_keys before:', self._remote_keys) 471 | self._remote_keys.pop(fp, None) 472 | # print('remote_keys after:', self._remote_keys) 473 | 474 | body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4) 475 | # print(body) 476 | request = request.replace( 477 | meta=meta, 478 | body=body, 479 | priority=request.priority+self.retry_498_priority_adjust 480 | ) 481 | return request 482 | 483 | def _set_download_slot(self, request, meta, slot_policy): 484 | if slot_policy == SlotPolicy.PER_DOMAIN: 485 | # Use the same download slot to (sort of) respect download 486 | # delays and concurrency options. 487 | meta['download_slot'] = self._get_slot_key(request) 488 | 489 | elif slot_policy == SlotPolicy.SINGLE_SLOT: 490 | # Use a single slot for all Splash requests 491 | meta['download_slot'] = '__splash__' 492 | 493 | elif slot_policy == SlotPolicy.SCRAPY_DEFAULT: 494 | # Use standard Scrapy concurrency setup 495 | pass 496 | 497 | def _get_slot_key(self, request_or_response): 498 | return self.crawler.engine.downloader._get_slot_key( 499 | request_or_response, None 500 | ) 501 | 502 | 503 | class SafeRobotsTxtMiddleware(RobotsTxtMiddleware): 504 | def process_request(self, request, spider): 505 | # disable robots.txt for Splash requests 506 | if _http_auth_enabled(spider) and 'splash' in request.meta: 507 | return 508 | return super(SafeRobotsTxtMiddleware, self).process_request( 509 | request, spider) 510 | 511 | 512 | def _http_auth_enabled(spider): 513 | # FIXME: this function should always return False if HttpAuthMiddleware is 514 | # not in a middleware list. 
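    # E.g. a spider that sets http_user = 'user' and http_pass = 'userpass'
    # (see ScrapyAuthSpider in tests/test_integration.py) makes this return
    # a truthy value.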
515 | return getattr(spider, 'http_user', '') or getattr(spider, 'http_pass', '') 516 | 517 | 518 | def replace_downloader_middleware(crawler, old_cls, new_cls): 519 | """ Replace downloader middleware with another one """ 520 | try: 521 | new_mw = new_cls.from_crawler(crawler) 522 | except NotConfigured: 523 | return 524 | 525 | mw_manager = crawler.engine.downloader.middleware 526 | mw_manager.middlewares = tuple([ 527 | mw if mw.__class__ is not old_cls else new_mw 528 | for mw in mw_manager.middlewares 529 | ]) 530 | for method_name, callbacks in mw_manager.methods.items(): 531 | for idx, meth in enumerate(callbacks): 532 | method_cls = meth.__self__.__class__ 533 | if method_cls is old_cls: 534 | new_meth = getattr(new_mw, method_name) 535 | # logger.debug("{} is replaced with {}".format(meth, new_meth)) 536 | callbacks[idx] = new_meth 537 | -------------------------------------------------------------------------------- /scrapy_splash/request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import copy 4 | import scrapy 5 | from scrapy.http import FormRequest 6 | from scrapy.utils.url import canonicalize_url 7 | 8 | from scrapy_splash import SlotPolicy 9 | from scrapy_splash.utils import to_unicode, dict_hash 10 | from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS 11 | from scrapy.utils.misc import load_object 12 | 13 | try: 14 | from scrapy.utils.misc import build_from_crawler 15 | except ImportError: # Scrapy < 2.12 16 | from scrapy.utils.misc import create_instance 17 | 18 | def build_from_crawler(objcls, crawler, /, *args, **kwargs): 19 | return create_instance(objcls, None, crawler, *args, **kwargs) 20 | 21 | # XXX: we can't implement SplashRequest without middleware support 22 | # because there is no way to set Splash URL based on settings 23 | # from inside SplashRequest. 24 | 25 | 26 | class SplashRequest(scrapy.Request): 27 | """ 28 | scrapy.Request subclass which instructs Scrapy to render 29 | the page using Splash. 30 | 31 | It requires SplashMiddleware to work. 
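    A minimal usage sketch (mirroring the spiders in tests/test_integration.py;
    the Lua script and callback names are placeholders):

        yield SplashRequest(url, self.parse_result, endpoint='execute',
                            args={'lua_source': script, 'wait': 0.5})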
32 | """ 33 | def __init__(self, 34 | url=None, 35 | callback=None, 36 | method='GET', 37 | endpoint='render.html', 38 | args=None, 39 | splash_url=None, 40 | slot_policy=SlotPolicy.PER_DOMAIN, 41 | splash_headers=None, 42 | dont_process_response=False, 43 | dont_send_headers=False, 44 | magic_response=True, 45 | session_id='default', 46 | http_status_from_error_code=True, 47 | cache_args=None, 48 | meta=None, 49 | **kwargs): 50 | 51 | if url is None: 52 | url = 'about:blank' 53 | url = to_unicode(url) 54 | 55 | meta = copy.deepcopy(meta) or {} 56 | splash_meta = meta.setdefault('splash', {}) 57 | splash_meta.setdefault('endpoint', endpoint) 58 | splash_meta.setdefault('slot_policy', slot_policy) 59 | if splash_url is not None: 60 | splash_meta['splash_url'] = splash_url 61 | if splash_headers is not None: 62 | splash_meta['splash_headers'] = splash_headers 63 | if dont_process_response: 64 | splash_meta['dont_process_response'] = True 65 | else: 66 | splash_meta.setdefault('magic_response', magic_response) 67 | if dont_send_headers: 68 | splash_meta['dont_send_headers'] = True 69 | if http_status_from_error_code: 70 | splash_meta['http_status_from_error_code'] = True 71 | if cache_args is not None: 72 | splash_meta['cache_args'] = cache_args 73 | 74 | if session_id is not None: 75 | if splash_meta['endpoint'].strip('/') == 'execute': 76 | splash_meta.setdefault('session_id', session_id) 77 | 78 | _args = {'url': url} # put URL to args in order to preserve #fragment 79 | _args.update(args or {}) 80 | _args.update(splash_meta.get('args', {})) 81 | splash_meta['args'] = _args 82 | 83 | # This is not strictly required, but it strengthens Splash 84 | # requests against AjaxCrawlMiddleware 85 | meta['ajax_crawlable'] = True 86 | 87 | super(SplashRequest, self).__init__(url, callback, method, meta=meta, 88 | **kwargs) 89 | 90 | @property 91 | def _processed(self): 92 | return self.meta.get('_splash_processed') 93 | 94 | @property 95 | def _splash_args(self): 96 | return self.meta.get('splash', {}).get('args', {}) 97 | 98 | @property 99 | def _original_url(self): 100 | return self._splash_args.get('url') 101 | 102 | @property 103 | def _original_method(self): 104 | return self._splash_args.get('http_method', 'GET') 105 | 106 | def __repr__(self): 107 | if not self._processed: 108 | return super().__repr__() 109 | return "<%s %s via %s>" % (self._original_method, self._original_url, self.url) 110 | 111 | 112 | class SplashFormRequest(SplashRequest, FormRequest): 113 | """ 114 | Use SplashFormRequest if you want to make a FormRequest via splash. 115 | Accepts the same arguments as SplashRequest, and also formdata, 116 | like FormRequest. First, FormRequest is initialized, and then it's 117 | url, method and body are passed to SplashRequest. 118 | Note that FormRequest calls escape_ajax on url (via Request._set_url). 
119 | """ 120 | def __init__(self, url=None, callback=None, method=None, formdata=None, 121 | body=None, **kwargs): 122 | # First init FormRequest to get url, body and method 123 | if formdata: 124 | FormRequest.__init__( 125 | self, url=url, method=method, formdata=formdata) 126 | url, method, body = self.url, self.method, self.body 127 | # Then pass all other kwargs to SplashRequest 128 | SplashRequest.__init__( 129 | self, url=url, callback=callback, method=method, body=body, 130 | **kwargs) 131 | 132 | 133 | class SplashRequestFingerprinter: 134 | @classmethod 135 | def from_crawler(cls, crawler): 136 | return cls(crawler) 137 | 138 | def __init__(self, crawler): 139 | self._base_request_fingerprinter = build_from_crawler( 140 | load_object( 141 | crawler.settings.get( 142 | "SCRAPY_SPLASH_REQUEST_FINGERPRINTER_BASE_CLASS", 143 | REQUEST_FINGERPRINTER_CLASS, 144 | ) 145 | ), 146 | crawler, 147 | ) 148 | 149 | def fingerprint(self, request): 150 | """ Request fingerprint which takes 'splash' meta key into account """ 151 | 152 | fp = self._base_request_fingerprinter.fingerprint(request) 153 | if 'splash' not in request.meta: 154 | return fp 155 | 156 | splash_options = copy.deepcopy(request.meta['splash']) 157 | args = splash_options.setdefault('args', {}) 158 | 159 | if 'url' in args: 160 | args['url'] = canonicalize_url(args['url'], keep_fragments=True) 161 | 162 | return dict_hash(splash_options, fp).encode() 163 | -------------------------------------------------------------------------------- /scrapy_splash/response.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import json 5 | import base64 6 | import re 7 | from warnings import warn 8 | 9 | from scrapy.http import Response, TextResponse 10 | from scrapy import Selector 11 | 12 | from scrapy_splash.utils import headers_to_scrapy 13 | 14 | 15 | def get_splash_status(resp): 16 | return getattr(resp, 'splash_response_status', resp.status) 17 | 18 | 19 | def get_splash_headers(resp): 20 | return getattr(resp, 'splash_response_headers', resp.headers) 21 | 22 | 23 | class _SplashResponseMixin(object): 24 | """ 25 | This mixin fixes response.url and adds response.real_url 26 | """ 27 | def __init__(self, url, *args, **kwargs): 28 | real_url = kwargs.pop('real_url', None) 29 | if real_url is not None: 30 | self.real_url = real_url 31 | else: 32 | self.real_url = None 33 | # FIXME: create a .request @property with a setter? 34 | # Scrapy doesn't pass request to Response constructor; 35 | # it is worked around in SplashMiddleware. 36 | request = kwargs['request'] 37 | splash_args = self._splash_args(request) 38 | _url = splash_args.get('url') 39 | if _url is not None: 40 | self.real_url = url 41 | url = _url 42 | self.splash_response_status = kwargs.pop('splash_response_status', 43 | None) 44 | self.splash_response_headers = kwargs.pop('splash_response_headers', 45 | None) 46 | super(_SplashResponseMixin, self).__init__(url, *args, **kwargs) 47 | if self.splash_response_status is None: 48 | self.splash_response_status = self.status 49 | if self.splash_response_headers is None: 50 | self.splash_response_headers = self.headers.copy() 51 | 52 | def replace(self, *args, **kwargs): 53 | """Create a new Response with the same attributes except for those 54 | given new values. 
55 | """ 56 | for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 57 | 'real_url', 'splash_response_status', 58 | 'splash_response_headers']: 59 | kwargs.setdefault(x, getattr(self, x)) 60 | cls = kwargs.pop('cls', self.__class__) 61 | return cls(*args, **kwargs) 62 | 63 | def _splash_options(self, request=None): 64 | if request is None: 65 | request = self.request 66 | return request.meta.get("splash", {}) 67 | 68 | def _splash_args(self, request=None): 69 | return self._splash_options(request).get('args', {}) 70 | 71 | 72 | class SplashResponse(_SplashResponseMixin, Response): 73 | """ 74 | This Response subclass sets response.url to the URL of a remote website 75 | instead of an URL of Splash server. "Real" response URL is still available 76 | as ``response.real_url``. 77 | """ 78 | 79 | 80 | class SplashTextResponse(_SplashResponseMixin, TextResponse): 81 | """ 82 | This TextResponse subclass sets response.url to the URL of a remote website 83 | instead of an URL of Splash server. "Real" response URL is still available 84 | as ``response.real_url``. 85 | """ 86 | def replace(self, *args, **kwargs): 87 | kwargs.setdefault('encoding', self.encoding) 88 | return _SplashResponseMixin.replace(self, *args, **kwargs) 89 | 90 | 91 | class SplashJsonResponse(SplashResponse): 92 | """ 93 | Splash Response with JSON data. It provides a convenient way to access 94 | parsed JSON response using ``response.data`` attribute and exposes 95 | current Splash cookiejar when it is available. 96 | 97 | If Scrapy-Splash response magic is enabled in request 98 | (['splash']['magic_response'] is not False), several other response 99 | attributes (headers, body, url, status code) are set automatically: 100 | 101 | * response.url is set to the value of 'url' key, original url is 102 | available as ``responce.real_url``; 103 | * response.headers are filled from 'headers' keys; original headers are 104 | available as ``response.splash_response_headers``; 105 | * response.status is set from the value of 'http_status' key; original 106 | status is available as ``response.splash_response_status``; 107 | * response.body is set to the value of 'html' key, 108 | or to base64-decoded value of 'body' key; 109 | """ 110 | def __init__(self, *args, **kwargs): 111 | self.cookiejar = None 112 | self._cached_ubody = None 113 | self._cached_data = None 114 | self._cached_selector = None 115 | kwargs.pop('encoding', None) # encoding is always utf-8 116 | super(SplashJsonResponse, self).__init__(*args, **kwargs) 117 | 118 | # FIXME: it assumes self.request is set 119 | if self._splash_options().get('magic_response', True): 120 | self._load_from_json() 121 | 122 | @property 123 | def data(self): 124 | if self._cached_data is None: 125 | self._cached_data = json.loads(self._ubody) 126 | return self._cached_data 127 | 128 | @property 129 | def text(self): 130 | return self._ubody 131 | 132 | def body_as_unicode(self): 133 | warn( 134 | ( 135 | "The body_as_unicode() method is deprecated, use the text " 136 | "property instead." 
137 | ), 138 | DeprecationWarning, 139 | stacklevel=2, 140 | ) 141 | return self._ubody 142 | 143 | @property 144 | def _ubody(self): 145 | if self._cached_ubody is None: 146 | self._cached_ubody = self.body.decode(self.encoding) 147 | return self._cached_ubody 148 | 149 | @property 150 | def encoding(self): 151 | return 'utf8' 152 | 153 | @property 154 | def selector(self): 155 | if self._cached_selector is None: 156 | self._cached_selector = Selector(text=self.text, type='html') 157 | return self._cached_selector 158 | 159 | def xpath(self, query): 160 | return self.selector.xpath(query) 161 | 162 | def css(self, query): 163 | return self.selector.css(query) 164 | 165 | def _load_from_json(self): 166 | """ Fill response attributes from JSON results """ 167 | 168 | # response.status 169 | if 'http_status' in self.data: 170 | self.status = int(self.data['http_status']) 171 | elif self._splash_options().get('http_status_from_error_code', False): 172 | if 'error' in self.data: 173 | try: 174 | error = self.data['info']['error'] 175 | except KeyError: 176 | error = '' 177 | http_code_m = re.match(r'http(\d{3})', error) 178 | if http_code_m: 179 | self.status = int(http_code_m.group(1)) 180 | 181 | # response.url 182 | if 'url' in self.data: 183 | self._url = self.data['url'] 184 | 185 | # response.body 186 | if 'body' in self.data: 187 | self._body = base64.b64decode(self.data['body']) 188 | self._cached_ubody = self._body.decode(self.encoding) 189 | elif 'html' in self.data: 190 | self._cached_ubody = self.data['html'] 191 | self._body = self._cached_ubody.encode(self.encoding) 192 | self.headers[b"Content-Type"] = b"text/html; charset=utf-8" 193 | 194 | # response.headers 195 | if 'headers' in self.data: 196 | self.headers = headers_to_scrapy(self.data['headers']) 197 | -------------------------------------------------------------------------------- /scrapy_splash/responsetypes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | from scrapy.http import Response 5 | from scrapy.responsetypes import ResponseTypes 6 | 7 | import scrapy_splash 8 | 9 | 10 | class SplashResponseTypes(ResponseTypes): 11 | CLASSES = { 12 | 'text/html': 'scrapy_splash.response.SplashTextResponse', 13 | 'application/atom+xml': 'scrapy_splash.response.SplashTextResponse', 14 | 'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse', 15 | 'application/rss+xml': 'scrapy_splash.response.SplashTextResponse', 16 | 'application/xhtml+xml': 'scrapy_splash.response.SplashTextResponse', 17 | 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashTextResponse', 18 | 'application/xml': 'scrapy_splash.response.SplashTextResponse', 19 | 'application/json': 'scrapy_splash.response.SplashJsonResponse', 20 | 'application/x-json': 'scrapy_splash.response.SplashJsonResponse', 21 | 'application/javascript': 'scrapy_splash.response.SplashTextResponse', 22 | 'application/x-javascript': 'scrapy_splash.response.SplashTextResponse', 23 | 'text/xml': 'scrapy_splash.response.SplashTextResponse', 24 | 'text/*': 'scrapy_splash.response.SplashTextResponse', 25 | } 26 | 27 | def from_args(self, headers=None, url=None, filename=None, body=None): 28 | """Guess the most appropriate Response class based on 29 | the given arguments.""" 30 | cls = super(SplashResponseTypes, self).from_args( 31 | headers=headers, 32 | url=url, 33 | filename=filename, 34 | body=body 35 | ) 36 | if cls is Response: 37 | cls = 
scrapy_splash.SplashResponse 38 | return cls 39 | 40 | 41 | responsetypes = SplashResponseTypes() 42 | -------------------------------------------------------------------------------- /scrapy_splash/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import json 4 | import hashlib 5 | import six 6 | 7 | from scrapy.http import Headers 8 | from scrapy.utils.python import to_unicode, to_bytes 9 | 10 | 11 | def dict_hash(obj, start=''): 12 | """ Return a hash for a dict, based on its contents """ 13 | h = hashlib.sha1(to_bytes(start)) 14 | h.update(to_bytes(obj.__class__.__name__)) 15 | if isinstance(obj, dict): 16 | for key, value in sorted(obj.items()): 17 | h.update(to_bytes(key)) 18 | h.update(to_bytes(dict_hash(value))) 19 | elif isinstance(obj, (list, tuple)): 20 | for el in obj: 21 | h.update(to_bytes(dict_hash(el))) 22 | else: 23 | # basic types 24 | if isinstance(obj, bool): 25 | value = str(int(obj)) 26 | elif isinstance(obj, (six.integer_types, float)): 27 | value = str(obj) 28 | elif isinstance(obj, (six.text_type, bytes)): 29 | value = obj 30 | elif obj is None: 31 | value = b'' 32 | else: 33 | raise ValueError("Unsupported value type: %s" % obj.__class__) 34 | h.update(to_bytes(value)) 35 | return h.hexdigest() 36 | 37 | 38 | def _process(value, sha=False): 39 | if isinstance(value, (six.text_type, bytes)): 40 | if sha: 41 | return hashlib.sha1(to_bytes(value)).hexdigest() 42 | return 'h', hash(value) 43 | if isinstance(value, dict): 44 | return {_process(k, sha=True): _process(v, sha) for k, v in value.items()} 45 | if isinstance(value, (list, tuple)): 46 | return [_process(v, sha) for v in value] 47 | return value 48 | 49 | 50 | def _fast_hash(value): 51 | """ 52 | Return a hash for any JSON-serializable value. 53 | Hash is not guaranteed to be the same in different Python processes, 54 | but it is very fast to compute for data structures with large string 55 | values. 56 | """ 57 | return _json_based_hash(_process(value)) 58 | 59 | 60 | _hash_cache = {} # fast hash => hash 61 | def json_based_hash(value): 62 | """ 63 | Return a hash for any JSON-serializable value. 64 | 65 | >>> json_based_hash({"foo": "bar", "baz": [1, 2]}) 66 | '0570066939bea46c610bfdc35b20f37ef09d05ed' 67 | """ 68 | fp = _fast_hash(value) 69 | if fp not in _hash_cache: 70 | _hash_cache[fp] = _json_based_hash(_process(value, sha=True)) 71 | return _hash_cache[fp] 72 | 73 | 74 | def _json_based_hash(value): 75 | v = json.dumps(value, sort_keys=True, ensure_ascii=False).encode('utf8') 76 | return hashlib.sha1(v).hexdigest() 77 | 78 | 79 | def headers_to_scrapy(headers): 80 | """ 81 | Return scrapy.http.Headers instance from headers data. 82 | 3 data formats are supported: 83 | 84 | * {name: value, ...} dict; 85 | * [(name, value), ...] list; 86 | * [{'name': name, 'value': value'}, ...] list (HAR headers format). 87 | """ 88 | if isinstance(headers or {}, dict): 89 | return Headers(headers or {}) 90 | 91 | if isinstance(headers[0], dict): 92 | return Headers([ 93 | (d['name'], d.get('value', '')) 94 | for d in headers 95 | ]) 96 | 97 | return Headers(headers) 98 | 99 | 100 | def scrapy_headers_to_unicode_dict(headers): 101 | """ 102 | Convert scrapy.http.Headers instance to a dictionary 103 | suitable for JSON encoding. 
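    Multiple values for the same header are joined with a comma.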
104 | """ 105 | return { 106 | to_unicode(key): to_unicode(b','.join(value)) 107 | for key, value in headers.items() 108 | } 109 | 110 | 111 | def parse_x_splash_saved_arguments_header(value): 112 | """ 113 | Parse X-Splash-Saved-Arguments header value. 114 | 115 | >>> value = u"name1=9a6747fc6259aa374ab4e1bb03074b6ec672cf99;name2=ba001160ef96fe2a3f938fea9e6762e204a562b3" 116 | >>> dct = parse_x_splash_saved_arguments_header(value) 117 | >>> sorted(list(dct.keys())) 118 | ['name1', 'name2'] 119 | >>> dct['name1'] 120 | '9a6747fc6259aa374ab4e1bb03074b6ec672cf99' 121 | >>> dct['name2'] 122 | 'ba001160ef96fe2a3f938fea9e6762e204a562b3' 123 | 124 | Binary header values are also supported: 125 | >>> dct2 = parse_x_splash_saved_arguments_header(value.encode('utf8')) 126 | >>> dct2 == dct 127 | True 128 | """ 129 | value = to_unicode(value) 130 | return dict(kv.split('=', 1) for kv in value.split(";")) 131 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup 3 | 4 | setup( 5 | name='scrapy-splash', 6 | version='0.11.1', 7 | url='https://github.com/scrapy-plugins/scrapy-splash', 8 | description='JavaScript support for Scrapy using Splash', 9 | long_description=open('README.rst').read() + "\n\n" + open("CHANGES.rst").read(), 10 | author='Scrapy developers', 11 | maintainer='Mikhail Korobov', 12 | maintainer_email='kmike84@gmail.com', 13 | license='BSD', 14 | packages=['scrapy_splash'], 15 | zip_safe=False, 16 | classifiers=[ 17 | 'Development Status :: 4 - Beta', 18 | 'License :: OSI Approved :: BSD License', 19 | 'Programming Language :: Python', 20 | 'Programming Language :: Python :: 2', 21 | 'Programming Language :: Python :: 2.7', 22 | 'Programming Language :: Python :: 3', 23 | 'Programming Language :: Python :: 3.4', 24 | 'Programming Language :: Python :: 3.5', 25 | 'Programming Language :: Python :: 3.6', 26 | 'Framework :: Scrapy', 27 | 'Intended Audience :: Developers', 28 | 'Operating System :: OS Independent', 29 | 'Topic :: Internet :: WWW/HTTP', 30 | 'Topic :: Software Development :: Libraries :: Application Frameworks', 31 | 'Topic :: Software Development :: Libraries :: Python Modules', 32 | ], 33 | install_requires=['scrapy>=2.4', 'six'], 34 | ) 35 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from .mockserver import MockServer 5 | from .resources import SplashProtected 6 | 7 | 8 | @pytest.fixture() 9 | def settings(): 10 | """ Default scrapy-splash settings """ 11 | s = dict( 12 | # collect scraped items to .collected_items attribute 13 | ITEM_PIPELINES={ 14 | 'tests.utils.CollectorPipeline': 100, 15 | }, 16 | 17 | # scrapy-splash settings 18 | SPLASH_URL=os.environ.get('SPLASH_URL'), 19 | DOWNLOADER_MIDDLEWARES={ 20 | # Engine side 21 | 
'scrapy_splash.SplashCookiesMiddleware': 723, 22 | 'scrapy_splash.SplashMiddleware': 725, 23 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 24 | # Downloader side 25 | }, 26 | SPIDER_MIDDLEWARES={ 27 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 28 | }, 29 | REQUEST_FINGERPRINTER_CLASS='scrapy_splash.SplashRequestFingerprinter', 30 | ) 31 | return s 32 | 33 | 34 | @pytest.fixture() 35 | def settings_auth(settings): 36 | with MockServer(SplashProtected) as s: 37 | print("splash url:", s.root_url) 38 | settings['SPLASH_URL'] = s.root_url 39 | yield settings 40 | -------------------------------------------------------------------------------- /tests/mockserver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse, socket, sys, time 3 | from subprocess import Popen, PIPE 4 | from importlib import import_module 5 | 6 | from twisted.internet import reactor 7 | from twisted.web.server import Site 8 | 9 | 10 | def get_ephemeral_port(): 11 | s = socket.socket() 12 | s.bind(("", 0)) 13 | return s.getsockname()[1] 14 | 15 | 16 | class MockServer(): 17 | def __init__(self, resource, port=None): 18 | self.resource = '{}.{}'.format(resource.__module__, resource.__name__) 19 | self.proc = None 20 | host = socket.gethostbyname(socket.gethostname()) 21 | self.port = port or get_ephemeral_port() 22 | self.root_url = 'http://%s:%d' % (host, self.port) 23 | 24 | def __enter__(self): 25 | self.proc = Popen( 26 | [sys.executable, '-u', '-m', 'tests.mockserver', 27 | self.resource, '--port', str(self.port)], 28 | stdout=PIPE) 29 | self.proc.stdout.readline() 30 | return self 31 | 32 | def __exit__(self, exc_type, exc_value, traceback): 33 | self.proc.kill() 34 | self.proc.wait() 35 | time.sleep(0.2) 36 | 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('resource') 41 | parser.add_argument('--port', type=int) 42 | args = parser.parse_args() 43 | module_name, name = args.resource.rsplit('.', 1) 44 | sys.path.append('.') 45 | resource = getattr(import_module(module_name), name)() 46 | http_port = reactor.listenTCP(args.port, Site(resource)) 47 | def print_listening(): 48 | host = http_port.getHost() 49 | print('Mock server {} running at http://{}:{}'.format( 50 | resource, host.host, host.port)) 51 | reactor.callWhenRunning(print_listening) 52 | reactor.run() 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /tests/resources.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | from six.moves.urllib.parse import urlparse 4 | 5 | from twisted.web.resource import Resource 6 | from zope.interface import implementer 7 | from twisted.web import resource, guard, proxy 8 | from twisted.cred.portal import IRealm, Portal 9 | from twisted.cred.checkers import InMemoryUsernamePasswordDatabaseDontUse 10 | 11 | from scrapy_splash.utils import to_bytes 12 | 13 | 14 | class HtmlResource(Resource): 15 | isLeaf = True 16 | content_type = 'text/html' 17 | html = '' 18 | extra_headers = {} 19 | status_code = 200 20 | 21 | def render_GET(self, request): 22 | request.setHeader(b'content-type', to_bytes(self.content_type)) 23 | for name, value in self.extra_headers.items(): 24 | request.setHeader(to_bytes(name), to_bytes(value)) 25 | request.setResponseCode(self.status_code) 26 | return to_bytes(self.html) 27 | 28 | 
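# The resources below subclass HtmlResource and only override `html`,
# `extra_headers` or `status_code`; render_GET above does the actual serving.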
29 | class HelloWorld(HtmlResource): 30 | html = """ 31 | 32 | """ 33 | extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'} 34 | 35 | 36 | class HelloWorldDisallowByRobots(HelloWorld): 37 | """ Disallow itself via robots.txt """ 38 | isLeaf = False 39 | 40 | def getChild(self, name, request): 41 | if name == b"robots.txt": 42 | return self.RobotsTxt() 43 | return self 44 | 45 | class RobotsTxt(Resource): 46 | isLeaf = True 47 | def render_GET(self, request): 48 | return b'User-Agent: *\nDisallow: /\n' 49 | 50 | 51 | class HelloWorldDisallowAuth(HelloWorldDisallowByRobots): 52 | """ Disallow itself via robots.txt if a request to robots.txt 53 | contains basic auth header. """ 54 | class RobotsTxt(HelloWorldDisallowByRobots.RobotsTxt): 55 | def render_GET(self, request): 56 | if request.requestHeaders.hasHeader('Authorization'): 57 | return super(HelloWorldDisallowAuth.RobotsTxt, self).render_GET(request) 58 | request.setResponseCode(404) 59 | return b'' 60 | 61 | 62 | class Http400Resource(HtmlResource): 63 | status_code = 400 64 | html = "Website returns HTTP 400 error" 65 | 66 | 67 | class ManyCookies(Resource, object): 68 | class SetMyCookie(HtmlResource): 69 | html = "hello!" 70 | extra_headers = {'Set-Cookie': 'login=1'} 71 | 72 | def __init__(self): 73 | super(ManyCookies, self).__init__() 74 | self.putChild(b'', HelloWorld()) 75 | self.putChild(b'login', self.SetMyCookie()) 76 | 77 | 78 | def splash_proxy(): 79 | splash_url = os.environ.get('SPLASH_URL') 80 | p = urlparse(splash_url) 81 | return lambda: proxy.ReverseProxyResource(p.hostname, int(p.port), b'') 82 | 83 | 84 | def password_protected(resource_cls, username, password): 85 | # Sorry, but this is nuts. A zillion of classes, arbitrary 86 | # unicode / bytes requirements at random places. Is there a simpler 87 | # way to get HTTP Basic Auth working in Twisted? 
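    # Usage (see the bottom of this module): password_protected(HelloWorld,
    # 'user', b'userpass') returns a zero-argument factory that builds the
    # guarded resource.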
88 | @implementer(IRealm) 89 | class SimpleRealm(object): 90 | def requestAvatar(self, avatarId, mind, *interfaces): 91 | if resource.IResource in interfaces: 92 | return resource.IResource, resource_cls(), lambda: None 93 | raise NotImplementedError() 94 | 95 | creds = {username: password} 96 | checkers = [InMemoryUsernamePasswordDatabaseDontUse(**creds)] 97 | return lambda: guard.HTTPAuthSessionWrapper( 98 | Portal(SimpleRealm(), checkers), 99 | [guard.BasicCredentialFactory(b'example.com')]) 100 | 101 | 102 | HelloWorldProtected = password_protected(HelloWorld, 'user', b'userpass') 103 | HelloWorldProtected.__name__ = 'HelloWorldProtected' 104 | HelloWorldProtected.__module__ = __name__ 105 | 106 | SplashProtected = password_protected(splash_proxy(), 'user', b'userpass') 107 | SplashProtected.__name__ = 'SplashProtected' 108 | SplashProtected.__module__ = __name__ 109 | -------------------------------------------------------------------------------- /tests/test_cookies.py: -------------------------------------------------------------------------------- 1 | from scrapy_splash.cookies import har_to_cookie, cookie_to_har 2 | 3 | 4 | # See also doctests in scrapy_splash.cookies module 5 | 6 | 7 | def test_cookie_to_har(): 8 | har_cookie = { 9 | "name": "TestCookie", 10 | "value": "Cookie Value", 11 | "path": "/foo", 12 | "domain": "www.janodvarko.cz", 13 | "expires": "2009-07-24T19:20:30Z", 14 | "httpOnly": True, 15 | "secure": True, 16 | "comment": "this is a test" 17 | } 18 | assert cookie_to_har(har_to_cookie(har_cookie)) == har_cookie 19 | cookie = har_to_cookie(har_cookie) 20 | assert vars(cookie) == vars(har_to_cookie(cookie_to_har(cookie))) 21 | -------------------------------------------------------------------------------- /tests/test_fingerprints.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from copy import deepcopy 4 | 5 | import pytest 6 | import scrapy 7 | 8 | from scrapy_splash import SplashRequest 9 | from scrapy_splash.dupefilter import request_fingerprint, splash_request_fingerprint 10 | from scrapy_splash.utils import dict_hash 11 | 12 | from .test_middleware import _get_mw 13 | from .utils import make_crawler 14 | from scrapy_splash.request import SplashRequestFingerprinter 15 | 16 | 17 | def test_dict_hash(): 18 | h1 = dict_hash({"foo": "bar", "bar": "baz"}) 19 | h2 = dict_hash({"foo": "bar", "bar": "baz"}) 20 | assert h1 == h2 21 | 22 | h3 = dict_hash({"egg": "spam"}) 23 | assert h3 != h2 24 | 25 | 26 | def test_dict_hash_nested(): 27 | h1 = dict_hash({"foo": "bar", "bar": {"baz": "spam"}}) 28 | h2 = dict_hash({"foo": "bar", "bar": {"baz": "spam"}}) 29 | assert h1 == h2 30 | 31 | h3 = dict_hash({"foo": "bar", "bar": {"baz": "egg"}}) 32 | h4 = dict_hash({"foo": "bar", "bar": {"bam": "spam"}}) 33 | assert h3 != h2 34 | assert h4 != h2 35 | 36 | 37 | def test_dict_hash_non_strings(): 38 | h1 = dict_hash({"foo": "bar", "float": 1.1, "int": 2, "bool": False, 39 | "seq": ["x", "y", (2, 3.7, {"x": 5, "y": [6, 7]})]}) 40 | h2 = dict_hash({"foo": "bar", "float": 1.2, "int": 2, "bool": False}) 41 | assert h1 != h2 42 | 43 | 44 | def test_dict_hash_invalid(): 45 | with pytest.raises(ValueError): 46 | dict_hash({"foo": scrapy}) 47 | 48 | 49 | def test_request_fingerprint_nosplash(): 50 | r1 = scrapy.Request("http://example.com") 51 | r2 = scrapy.Request("http://example.com", meta={"foo": "bar"}) 52 | assert request_fingerprint(r1) == splash_request_fingerprint(r1) 53 | 
assert request_fingerprint(r1) == request_fingerprint(r2) 54 | assert request_fingerprint(r1) == splash_request_fingerprint(r2) 55 | 56 | 57 | def assert_fingerprints_match(r1, r2): 58 | assert splash_request_fingerprint(r1) == splash_request_fingerprint(r2) 59 | 60 | 61 | def assert_fingerprints_dont_match(r1, r2): 62 | assert splash_request_fingerprint(r1) != splash_request_fingerprint(r2) 63 | 64 | 65 | def test_request_fingerprint_splash(): 66 | r1 = scrapy.Request("http://example.com") 67 | r2 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1}}}) 68 | r3 = scrapy.Request("http://example.com", meta={"splash": {"args": {"png": 1}}}) 69 | r4 = scrapy.Request("http://example.com", meta={"foo": "bar", "splash": {"args": {"html": 1}}}) 70 | r5 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1, "wait": 1.0}}}) 71 | 72 | assert request_fingerprint(r1) == request_fingerprint(r2) 73 | assert_fingerprints_dont_match(r1, r2) 74 | assert_fingerprints_dont_match(r1, r3) 75 | assert_fingerprints_dont_match(r1, r4) 76 | assert_fingerprints_dont_match(r1, r5) 77 | assert_fingerprints_dont_match(r2, r3) 78 | 79 | # only "splash" contents is taken into account 80 | assert_fingerprints_match(r2, r4) 81 | 82 | 83 | def assert_fingerprints_match_fingerprinter(fingerprinter, r1, r2): 84 | assert fingerprinter.fingerprint(r1) == fingerprinter.fingerprint(r2) 85 | 86 | 87 | def assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r2): 88 | assert fingerprinter.fingerprint(r1) != fingerprinter.fingerprint(r2) 89 | 90 | 91 | class TestSpider(scrapy.Spider): 92 | name = 'test_spider' 93 | 94 | 95 | def test_splash_request_fingerprinter(): 96 | crawler = make_crawler(TestSpider, {}) 97 | fingerprinter = SplashRequestFingerprinter(crawler) 98 | 99 | r1 = scrapy.Request("http://example.com") 100 | r2 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1}}}) 101 | r3 = scrapy.Request("http://example.com", meta={"splash": {"args": {"png": 1}}}) 102 | r4 = scrapy.Request("http://example.com", meta={"foo": "bar", "splash": {"args": {"html": 1}}}) 103 | r5 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1, "wait": 1.0}}}) 104 | 105 | assert request_fingerprint(r1) == request_fingerprint(r2) 106 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r2) 107 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r3) 108 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r4) 109 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r5) 110 | assert_fingerprints_dont_match_fingerprinter(fingerprinter, r2, r3) 111 | 112 | # only "splash" contents is taken into account 113 | assert_fingerprints_match_fingerprinter(fingerprinter, r2, r4) 114 | 115 | 116 | @pytest.fixture() 117 | def splash_middleware(): 118 | return _get_mw() 119 | 120 | 121 | @pytest.fixture 122 | def splash_mw_process(splash_middleware): 123 | def _process(r): 124 | r_copy = r.replace(meta=deepcopy(r.meta)) 125 | return splash_middleware.process_request(r_copy, None) or r 126 | return _process 127 | 128 | 129 | @pytest.fixture() 130 | def requests(): 131 | url1 = "http://example.com/foo?x=1&y=2" 132 | url2 = "http://example.com/foo?y=2&x=1" 133 | url3 = "http://example.com/foo?x=1&y=2&z=3" 134 | url4 = "http://example.com/foo?x=1&y=2#id2" 135 | url5 = "http://example.com/foo?x=1&y=2#!id2" 136 | request_kwargs = [ 137 | dict(url=url1), # 0 138 | dict(url=url1, method='POST'), # 1 139 | dict(url=url1, 
endpoint='render.har'), # 2 140 | dict(url=url2), # 3 141 | dict(url=url1, args={'wait': 0.5}), # 4 142 | dict(url=url2, args={'wait': 0.5}), # 5 143 | dict(url=url3), # 6 144 | dict(url=url2, method='POST'), # 7 145 | dict(args={'wait': 0.5}), # 8 146 | dict(args={'wait': 0.5}), # 9 147 | dict(args={'wait': 0.7}), # 10 148 | dict(url=url4), # 11 149 | ] 150 | splash_requests = [SplashRequest(**kwargs) for kwargs in request_kwargs] 151 | scrapy_requests = [ 152 | scrapy.Request(url=url1), # 12 153 | scrapy.Request(url=url2), # 13 154 | scrapy.Request(url=url4), # 14 155 | scrapy.Request(url=url5), # 15 156 | ] 157 | return splash_requests + scrapy_requests 158 | 159 | 160 | @pytest.mark.parametrize(["i", "dupe_indices"], [ 161 | (0, {3}), 162 | (1, {7}), 163 | (2, set()), 164 | (3, {0}), 165 | (4, {5}), 166 | (5, {4}), 167 | (6, set()), 168 | (7, {1}), 169 | (8, {9}), 170 | (9, {8}), 171 | (10, set()), 172 | (11, set()), 173 | (12, {13, 14}), 174 | (13, {12, 14}), 175 | (14, {13, 12}), 176 | (15, set()), 177 | ]) 178 | def test_duplicates(i, dupe_indices, requests, splash_mw_process): 179 | def assert_not_filtered(r1, r2): 180 | assert_fingerprints_dont_match(r1, r2) 181 | assert_fingerprints_dont_match( 182 | splash_mw_process(r1), 183 | splash_mw_process(r2), 184 | ) 185 | 186 | def assert_filtered(r1, r2): 187 | # request is filtered if it is filtered either 188 | # before rescheduling or after 189 | fp1 = splash_request_fingerprint(r1) 190 | fp2 = splash_request_fingerprint(r2) 191 | if fp1 != fp2: 192 | assert_fingerprints_match( 193 | splash_mw_process(r1), 194 | splash_mw_process(r2), 195 | ) 196 | 197 | dupe_indices = set(dupe_indices) 198 | dupe_indices.add(i) 199 | non_dupe_indices = set(range(len(requests))) - dupe_indices 200 | 201 | for j in dupe_indices: 202 | assert_filtered(requests[i], requests[j]) 203 | for j in non_dupe_indices: 204 | assert_not_filtered(requests[i], requests[j]) 205 | -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | import scrapy 4 | from pkg_resources import parse_version 5 | from pytest_twisted import inlineCallbacks 6 | from w3lib.url import canonicalize_url 7 | from w3lib.http import basic_auth_header 8 | 9 | from scrapy_splash import SplashRequest 10 | from .utils import crawl_items, requires_splash 11 | from .resources import ( 12 | HelloWorld, 13 | Http400Resource, 14 | ManyCookies, 15 | HelloWorldProtected, 16 | HelloWorldDisallowByRobots, 17 | HelloWorldDisallowAuth, 18 | ) 19 | 20 | 21 | DEFAULT_SCRIPT = """ 22 | function main(splash) 23 | splash:init_cookies(splash.args.cookies) 24 | splash:go{ 25 | splash.args.url, 26 | headers=splash.args.headers, 27 | http_method=splash.args.http_method, 28 | body=splash.args.body, 29 | } 30 | local wait = 0.01 31 | if splash.args.wait ~= nil then 32 | wait = splash.args.wait 33 | end 34 | assert(splash:wait(wait)) 35 | 36 | local entries = splash:history() 37 | local last_response = entries[#entries].response 38 | return { 39 | url = splash:url(), 40 | headers = last_response.headers, 41 | http_status = last_response.status, 42 | cookies = splash:get_cookies(), 43 | html = splash:html(), 44 | args = splash.args, 45 | jsvalue = splash:evaljs("1+2"), 46 | } 47 | end 48 | """ 49 | 50 | 51 | class ResponseSpider(scrapy.Spider): 52 | """ Make a request to URL, return Scrapy response """ 53 | custom_settings = { 54 | 
'HTTPERROR_ALLOW_ALL': True, 55 | 'ROBOTSTXT_OBEY': True, 56 | } 57 | url = None 58 | 59 | def start_requests(self): 60 | yield SplashRequest(self.url) 61 | 62 | def parse(self, response): 63 | yield {'response': response} 64 | 65 | 66 | class LuaSpider(ResponseSpider): 67 | """ Make a request to URL using default Lua script """ 68 | headers = None 69 | splash_headers = None 70 | 71 | def start_requests(self): 72 | yield SplashRequest(self.url, 73 | endpoint='execute', 74 | args={'lua_source': DEFAULT_SCRIPT}, 75 | headers=self.headers, 76 | splash_headers=self.splash_headers) 77 | 78 | 79 | class ScrapyAuthSpider(LuaSpider): 80 | """ Spider with incorrect (old, insecure) auth method """ 81 | http_user = 'user' 82 | http_pass = 'userpass' 83 | http_auth_domain = None 84 | 85 | 86 | class NonSplashSpider(ResponseSpider): 87 | """ Spider which uses HTTP auth and doesn't use Splash """ 88 | http_user = 'user' 89 | http_pass = 'userpass' 90 | http_auth_domain = None 91 | 92 | def start_requests(self): 93 | yield scrapy.Request(self.url) 94 | 95 | 96 | def assert_single_response(items): 97 | assert len(items) == 1 98 | return items[0]['response'] 99 | 100 | 101 | @requires_splash 102 | @inlineCallbacks 103 | def test_basic(settings): 104 | items, url, crawler = yield crawl_items(ResponseSpider, HelloWorld, 105 | settings) 106 | resp = assert_single_response(items) 107 | assert resp.url == url 108 | assert resp.css('body::text').extract_first().strip() == "hello world!" 109 | 110 | 111 | @requires_splash 112 | @inlineCallbacks 113 | def test_reload(settings): 114 | 115 | class ReloadSpider(ResponseSpider): 116 | """ Make two requests to URL, store both responses. 117 | This spider activates both start_requests and parse methods, 118 | and checks that dupefilter takes fragment into account. """ 119 | 120 | def parse(self, response): 121 | yield {'response': response} 122 | yield SplashRequest(self.url + '#foo') 123 | 124 | items, url, crawler = yield crawl_items(ReloadSpider, HelloWorld, settings) 125 | assert len(items) == 2 126 | assert crawler.stats.get_value('dupefilter/filtered') == 1 127 | resp = items[0]['response'] 128 | assert resp.url == url 129 | assert resp.css('body::text').extract_first().strip() == "hello world!" 130 | assert resp.status == resp.splash_response_status == 200 131 | assert resp.headers == resp.splash_response_headers 132 | assert resp.splash_response_headers['Content-Type'] == b"text/html; charset=utf-8" 133 | 134 | resp2 = items[1]['response'] 135 | assert resp2.body == resp.body 136 | assert resp2 is not resp 137 | assert resp2.url == resp.url + "#foo" 138 | 139 | 140 | @requires_splash 141 | @inlineCallbacks 142 | def test_basic_lua(settings): 143 | 144 | class LuaScriptSpider(ResponseSpider): 145 | """ Make a request using a Lua script similar to the one from README 146 | """ 147 | def start_requests(self): 148 | yield SplashRequest(self.url + "#foo", endpoint='execute', 149 | args={'lua_source': DEFAULT_SCRIPT, 'foo': 'bar'}) 150 | 151 | 152 | items, url, crawler = yield crawl_items(LuaScriptSpider, HelloWorld, 153 | settings) 154 | resp = assert_single_response(items) 155 | assert resp.url == url + "/#foo" 156 | assert resp.status == resp.splash_response_status == 200 157 | assert resp.css('body::text').extract_first().strip() == "hello world!" 
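    # resp is a SplashJsonResponse: .data exposes the table returned by
    # DEFAULT_SCRIPT, where jsvalue comes from splash:evaljs("1+2")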
158 | assert resp.data['jsvalue'] == 3 159 | assert resp.headers['X-MyHeader'] == b'my value' 160 | assert resp.headers['Content-Type'] == b'text/html' 161 | assert resp.splash_response_headers['Content-Type'] == b'application/json' 162 | assert resp.data['args']['foo'] == 'bar' 163 | 164 | 165 | @requires_splash 166 | @inlineCallbacks 167 | def test_bad_request(settings): 168 | class BadRequestSpider(ResponseSpider): 169 | def start_requests(self): 170 | yield SplashRequest(self.url, endpoint='execute', 171 | args={'lua_source': DEFAULT_SCRIPT, 'wait': 'bar'}) 172 | 173 | items, url, crawler = yield crawl_items(BadRequestSpider, HelloWorld, 174 | settings) 175 | resp = assert_single_response(items) 176 | assert resp.status == 400 177 | assert resp.splash_response_status == 400 178 | 179 | items, url, crawler = yield crawl_items(LuaSpider, Http400Resource, 180 | settings) 181 | resp = assert_single_response(items) 182 | assert resp.status == 400 183 | assert resp.splash_response_status == 200 184 | 185 | 186 | @requires_splash 187 | @inlineCallbacks 188 | def test_cache_args(settings): 189 | 190 | class CacheArgsSpider(ResponseSpider): 191 | def _request(self, url): 192 | return SplashRequest(url, endpoint='execute', 193 | args={'lua_source': DEFAULT_SCRIPT, 'x': 'yy'}, 194 | cache_args=['lua_source']) 195 | 196 | def start_requests(self): 197 | yield self._request(self.url) 198 | 199 | def parse(self, response): 200 | yield {'response': response} 201 | yield self._request(self.url + "#foo") 202 | 203 | 204 | items, url, crawler = yield crawl_items(CacheArgsSpider, HelloWorld, 205 | settings) 206 | assert len(items) == 2 207 | resp = items[0]['response'] 208 | assert b"function main(splash)" in resp.request.body 209 | assert b"yy" in resp.request.body 210 | print(resp.body, resp.request.body) 211 | 212 | resp = items[1]['response'] 213 | assert b"function main(splash)" not in resp.request.body 214 | assert b"yy" in resp.request.body 215 | print(resp.body, resp.request.body) 216 | 217 | 218 | @requires_splash 219 | @inlineCallbacks 220 | def test_cookies(settings): 221 | 222 | # 64K for headers is over Twisted limit, 223 | # so if these headers are sent to Splash request would fail. 224 | BOMB = 'x' * 64000 225 | 226 | class LuaScriptSpider(ResponseSpider): 227 | """ Cookies must be sent to website, not to Splash """ 228 | custom_settings = { 229 | 'SPLASH_COOKIES_DEBUG': True, 230 | 'COOKIES_DEBUG': True, 231 | } 232 | 233 | def start_requests(self): 234 | # cookies set without Splash should be still 235 | # sent to a remote website. FIXME: this is not the case. 236 | yield scrapy.Request(self.url + "/login", self.parse, 237 | cookies={'x-set-scrapy': '1'}) 238 | 239 | def parse(self, response): 240 | yield SplashRequest(self.url + "#egg", self.parse_1, 241 | endpoint='execute', 242 | args={'lua_source': DEFAULT_SCRIPT}, 243 | cookies={'x-set-splash': '1'}) 244 | 245 | def parse_1(self, response): 246 | yield {'response': response} 247 | yield SplashRequest(self.url + "#foo", self.parse_2, 248 | endpoint='execute', 249 | args={'lua_source': DEFAULT_SCRIPT}) 250 | 251 | def parse_2(self, response): 252 | yield {'response': response} 253 | yield scrapy.Request(self.url, self.parse_3) 254 | 255 | def parse_3(self, response): 256 | # Splash (Twisted) drops requests with huge http headers, 257 | # but this one should work, as cookies are not sent 258 | # to Splash itself. 
259 | yield {'response': response} 260 | yield SplashRequest(self.url + "#bar", self.parse_4, 261 | endpoint='execute', 262 | args={'lua_source': DEFAULT_SCRIPT}, 263 | cookies={'bomb': BOMB}) 264 | 265 | def parse_4(self, response): 266 | yield {'response': response} 267 | 268 | 269 | def _cookie_dict(har_cookies): 270 | return {c['name']: c['value'] for c in har_cookies} 271 | 272 | items, url, crawler = yield crawl_items(LuaScriptSpider, ManyCookies, 273 | settings) 274 | assert len(items) == 4 275 | 276 | # cookie should be sent to remote website, not to Splash 277 | resp = items[0]['response'] 278 | splash_request_headers = resp.request.headers 279 | cookies = resp.data['args']['cookies'] 280 | print(splash_request_headers) 281 | print(cookies) 282 | assert _cookie_dict(cookies) == { 283 | # 'login': '1', # FIXME 284 | 'x-set-splash': '1' 285 | } 286 | assert splash_request_headers.get(b'Cookie') is None 287 | 288 | # new cookie should be also sent to remote website, not to Splash 289 | resp2 = items[1]['response'] 290 | splash_request_headers = resp2.request.headers 291 | headers = resp2.data['args']['headers'] 292 | cookies = resp2.data['args']['cookies'] 293 | assert canonicalize_url(headers['Referer']) == canonicalize_url(url) 294 | assert _cookie_dict(cookies) == { 295 | # 'login': '1', 296 | 'x-set-splash': '1', 297 | 'sessionid': 'ABCD' 298 | } 299 | print(splash_request_headers) 300 | print(headers) 301 | print(cookies) 302 | assert splash_request_headers.get(b'Cookie') is None 303 | 304 | # TODO/FIXME: Cookies fetched when working with Splash should be picked up 305 | # by Scrapy 306 | resp3 = items[2]['response'] 307 | splash_request_headers = resp3.request.headers 308 | cookie_header = splash_request_headers.get(b'Cookie') 309 | assert b'x-set-scrapy=1' in cookie_header 310 | assert b'login=1' in cookie_header 311 | assert b'x-set-splash=1' in cookie_header 312 | # assert b'sessionid=ABCD' in cookie_header # FIXME 313 | 314 | # cookie bomb shouldn't cause problems 315 | resp4 = items[3]['response'] 316 | splash_request_headers = resp4.request.headers 317 | cookies = resp4.data['args']['cookies'] 318 | assert _cookie_dict(cookies) == { 319 | # 'login': '1', 320 | 'x-set-splash': '1', 321 | 'sessionid': 'ABCD', 322 | 'bomb': BOMB, 323 | } 324 | assert splash_request_headers.get(b'Cookie') is None 325 | 326 | 327 | @requires_splash 328 | @inlineCallbacks 329 | def test_access_http_auth(settings): 330 | # website is protected 331 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 332 | settings) 333 | response = assert_single_response(items) 334 | assert response.status == 401 335 | assert response.splash_response_status == 200 336 | 337 | # header can be used to access it 338 | AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')} 339 | kwargs = {'headers': AUTH_HEADERS} 340 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 341 | settings, kwargs) 342 | response = assert_single_response(items) 343 | assert 'hello' in response.text 344 | assert response.status == 200 345 | assert response.splash_response_status == 200 346 | 347 | 348 | @requires_splash 349 | @inlineCallbacks 350 | def test_protected_splash_no_auth(settings_auth): 351 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, 352 | settings_auth) 353 | response = assert_single_response(items) 354 | assert 'Unauthorized' in response.text 355 | assert 'hello' not in response.text 356 | assert response.status == 401 357 | assert 
response.splash_response_status == 401 358 | 359 | 360 | @requires_splash 361 | @inlineCallbacks 362 | def test_protected_splash_manual_headers_auth(settings_auth): 363 | AUTH_HEADERS = {'Authorization': basic_auth_header('user', 'userpass')} 364 | kwargs = {'splash_headers': AUTH_HEADERS} 365 | 366 | # auth via splash_headers should work 367 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, 368 | settings_auth, kwargs) 369 | response = assert_single_response(items) 370 | assert 'hello' in response.text 371 | assert response.status == 200 372 | assert response.splash_response_status == 200 373 | 374 | # but only for Splash, not for a remote website 375 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 376 | settings_auth, kwargs) 377 | response = assert_single_response(items) 378 | assert 'hello' not in response.text 379 | assert response.status == 401 380 | assert response.splash_response_status == 200 381 | 382 | 383 | @requires_splash 384 | @inlineCallbacks 385 | def test_protected_splash_settings_auth(settings_auth): 386 | settings_auth['SPLASH_USER'] = 'user' 387 | settings_auth['SPLASH_PASS'] = 'userpass' 388 | 389 | # settings works 390 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, 391 | settings_auth) 392 | response = assert_single_response(items) 393 | assert 'Unauthorized' not in response.text 394 | assert 'hello' in response.text 395 | assert response.status == 200 396 | assert response.splash_response_status == 200 397 | 398 | # they can be overridden via splash_headers 399 | bad_auth = {'splash_headers': {'Authorization': 'foo'}} 400 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorld, 401 | settings_auth, bad_auth) 402 | response = assert_single_response(items) 403 | assert response.status == 401 404 | assert response.splash_response_status == 401 405 | 406 | # auth error on remote website 407 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 408 | settings_auth) 409 | response = assert_single_response(items) 410 | assert response.status == 401 411 | assert response.splash_response_status == 200 412 | 413 | # auth both for Splash and for the remote website 414 | REMOTE_AUTH = {'Authorization': basic_auth_header('user', 'userpass')} 415 | remote_auth_kwargs = {'headers': REMOTE_AUTH} 416 | items, url, crawler = yield crawl_items(LuaSpider, HelloWorldProtected, 417 | settings_auth, remote_auth_kwargs) 418 | response = assert_single_response(items) 419 | assert response.status == 200 420 | assert response.splash_response_status == 200 421 | assert 'hello' in response.text 422 | 423 | # enable remote auth, but not splash auth - request should fail 424 | del settings_auth['SPLASH_USER'] 425 | del settings_auth['SPLASH_PASS'] 426 | items, url, crawler = yield crawl_items(LuaSpider, 427 | HelloWorldProtected, 428 | settings_auth, remote_auth_kwargs) 429 | response = assert_single_response(items) 430 | assert response.status == 401 431 | assert response.splash_response_status == 401 432 | 433 | 434 | @requires_splash 435 | @inlineCallbacks 436 | def test_protected_splash_httpauth_middleware(settings_auth): 437 | # httpauth middleware should enable auth for Splash, for backwards 438 | # compatibility reasons 439 | items, url, crawler = yield crawl_items(ScrapyAuthSpider, HelloWorld, 440 | settings_auth) 441 | response = assert_single_response(items) 442 | assert 'Unauthorized' not in response.text 443 | assert 'hello' in response.text 444 | assert response.status == 200 445 | assert 
response.splash_response_status == 200 446 | 447 | # but not for a remote website 448 | items, url, crawler = yield crawl_items(ScrapyAuthSpider, 449 | HelloWorldProtected, 450 | settings_auth) 451 | response = assert_single_response(items) 452 | assert 'hello' not in response.text 453 | assert response.status == 401 454 | assert response.splash_response_status == 200 455 | 456 | # headers shouldn't be sent to robots.txt file 457 | items, url, crawler = yield crawl_items(ScrapyAuthSpider, 458 | HelloWorldDisallowAuth, 459 | settings_auth) 460 | response = assert_single_response(items) 461 | assert 'hello' in response.text 462 | assert response.status == 200 463 | assert response.splash_response_status == 200 464 | 465 | # httpauth shouldn't be disabled for non-Splash requests 466 | items, url, crawler = yield crawl_items(NonSplashSpider, 467 | HelloWorldProtected, 468 | settings_auth) 469 | response = assert_single_response(items) 470 | assert 'hello' in response.text 471 | assert response.status == 200 472 | assert not hasattr(response, 'splash_response_status') 473 | 474 | 475 | @pytest.mark.xfail( 476 | parse_version(scrapy.__version__) < parse_version("1.1"), 477 | reason="https://github.com/scrapy/scrapy/issues/1471", 478 | strict=True, 479 | run=True, 480 | ) 481 | @requires_splash 482 | @inlineCallbacks 483 | def test_robotstxt_can_work(settings_auth): 484 | 485 | def assert_robots_disabled(items): 486 | response = assert_single_response(items) 487 | assert response.status == response.splash_response_status == 200 488 | assert b'hello' in response.body 489 | 490 | def assert_robots_enabled(items, crawler): 491 | assert len(items) == 0 492 | assert crawler.stats.get_value('downloader/exception_type_count/scrapy.exceptions.IgnoreRequest') == 1 493 | 494 | def _crawl_items(spider, resource): 495 | return crawl_items( 496 | spider, 497 | resource, 498 | settings_auth, 499 | url_path='/', # https://github.com/scrapy/protego/issues/17 500 | ) 501 | 502 | # when old auth method is used, robots.txt should be disabled 503 | items, url, crawler = yield _crawl_items(ScrapyAuthSpider, 504 | HelloWorldDisallowByRobots) 505 | assert_robots_disabled(items) 506 | 507 | # but robots.txt should still work for non-Splash requests 508 | items, url, crawler = yield _crawl_items(NonSplashSpider, 509 | HelloWorldDisallowByRobots) 510 | assert_robots_enabled(items, crawler) 511 | 512 | # robots.txt should work when a proper auth method is used 513 | settings_auth['SPLASH_USER'] = 'user' 514 | settings_auth['SPLASH_PASS'] = 'userpass' 515 | items, url, crawler = yield _crawl_items(LuaSpider, 516 | HelloWorldDisallowByRobots) 517 | assert_robots_enabled(items, crawler) 518 | 519 | # disable robotstxt middleware - robots middleware shouldn't work 520 | class DontObeyRobotsSpider(LuaSpider): 521 | custom_settings = { 522 | 'HTTPERROR_ALLOW_ALL': True, 523 | 'ROBOTSTXT_OBEY': False, 524 | } 525 | items, url, crawler = yield _crawl_items(DontObeyRobotsSpider, 526 | HelloWorldDisallowByRobots) 527 | assert_robots_disabled(items) 528 | 529 | # disable robotstxt middleware via request meta 530 | class MetaDontObeyRobotsSpider(ResponseSpider): 531 | def start_requests(self): 532 | yield SplashRequest(self.url, 533 | endpoint='execute', 534 | meta={'dont_obey_robotstxt': True}, 535 | args={'lua_source': DEFAULT_SCRIPT}) 536 | 537 | items, url, crawler = yield _crawl_items(MetaDontObeyRobotsSpider, 538 | HelloWorldDisallowByRobots) 539 | assert_robots_disabled(items) 540 | 
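# --- Editor's note (not part of the original test module) ---------------------
# The integration spiders above drive Splash through the ``execute`` endpoint,
# passing a Lua script as ``lua_source``.  Below is a minimal sketch of the same
# pattern in an ordinary project spider; the spider name, start URL and the
# simplified Lua script are illustrative assumptions, not code from this repo.

import scrapy
from scrapy_splash import SplashRequest

EXAMPLE_LUA = """
function main(splash)
    assert(splash:go(splash.args.url))
    assert(splash:wait(splash.args.wait))
    return {html = splash:html(), url = splash:url()}
end
"""

class ExampleLuaSpider(scrapy.Spider):
    name = "example_lua"                           # hypothetical name
    start_urls = ["http://quotes.toscrape.com/"]   # hypothetical target

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, endpoint="execute",
                                args={"lua_source": EXAMPLE_LUA, "wait": 0.5})

    def parse(self, response):
        # response is a SplashJsonResponse: ``response.data`` holds the Lua
        # return table and, with the default magic_response=True, the response
        # body/text are rebuilt from data["html"], so CSS selectors work.
        yield {"url": response.data["url"],
               "title": response.css("title::text").get()}
# ------------------------------------------------------------------------------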
-------------------------------------------------------------------------------- /tests/test_middleware.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import copy 4 | import json 5 | import base64 6 | 7 | import scrapy 8 | from scrapy.core.engine import ExecutionEngine 9 | from scrapy.utils.test import get_crawler 10 | from scrapy.http import Response, TextResponse, JsonResponse 11 | from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware 12 | 13 | import scrapy_splash 14 | from scrapy_splash.utils import to_unicode 15 | from scrapy_splash import ( 16 | SplashRequest, 17 | SplashMiddleware, 18 | SlotPolicy, 19 | SplashCookiesMiddleware, 20 | SplashDeduplicateArgsMiddleware, 21 | ) 22 | 23 | 24 | def _get_crawler(settings_dict): 25 | settings_dict = settings_dict.copy() 26 | settings_dict['DOWNLOAD_HANDLERS'] = {'s3': None} # for faster test running 27 | crawler = get_crawler(settings_dict=settings_dict) 28 | if not hasattr(crawler, 'logformatter'): 29 | crawler.logformatter = None 30 | crawler.engine = ExecutionEngine(crawler, lambda _: None) 31 | # spider = crawler._create_spider("foo") 32 | return crawler 33 | 34 | 35 | def _get_mw(settings_dict=None): 36 | crawler = _get_crawler(settings_dict or {}) 37 | return SplashMiddleware.from_crawler(crawler) 38 | 39 | 40 | def _get_cookie_mw(): 41 | return SplashCookiesMiddleware(debug=True) 42 | 43 | 44 | def test_nosplash(): 45 | mw = _get_mw() 46 | cookie_mw = _get_cookie_mw() 47 | req = scrapy.Request("http://example.com") 48 | old_meta = copy.deepcopy(req.meta) 49 | 50 | assert cookie_mw.process_request(req, None) is None 51 | assert mw.process_request(req, None) is None 52 | assert old_meta == req.meta 53 | 54 | # response is not changed 55 | response = Response("http://example.com", request=req) 56 | response2 = mw.process_response(req, response, None) 57 | response3 = cookie_mw.process_response(req, response, None) 58 | assert response2 is response 59 | assert response3 is response 60 | assert response3.url == "http://example.com" 61 | 62 | 63 | def test_splash_request(): 64 | mw = _get_mw() 65 | cookie_mw = _get_cookie_mw() 66 | 67 | req = SplashRequest("http://example.com?foo=bar&url=1&wait=100") 68 | assert repr(req) == "<GET http://example.com?foo=bar&url=1&wait=100>" 69 | 70 | # check request preprocessing 71 | req2 = cookie_mw.process_request(req, None) or req 72 | req2 = mw.process_request(req2, None) or req2 73 | 74 | assert req2 is not None 75 | assert req2 is not req 76 | assert req2.url == "http://127.0.0.1:8050/render.html" 77 | assert req2.headers == {b'Content-Type': [b'application/json']} 78 | assert req2.method == 'POST' 79 | assert isinstance(req2, SplashRequest) 80 | assert repr(req2) == "<POST http://127.0.0.1:8050/render.html>" 81 | 82 | expected_body = {'url': req.url} 83 | assert json.loads(to_unicode(req2.body)) == expected_body 84 | 85 | # check response post-processing 86 | response = TextResponse("http://127.0.0.1:8050/render.html", 87 | # Scrapy doesn't pass request to constructor 88 | # request=req2, 89 | headers={b'Content-Type': b'text/html'}, 90 | body=b"<html><body>Hello</body></html>") 91 | response2 = mw.process_response(req2, response, None) 92 | response2 = cookie_mw.process_response(req2, response2, None) 93 | assert isinstance(response2, scrapy_splash.SplashTextResponse) 94 | assert response2 is not response 95 | assert response2.real_url == req2.url 96 | assert response2.url == req.url 97 | assert response2.body == b"<html><body>Hello</body></html>" 98 | assert response2.css("body").extract_first() == "<body>Hello</body>"
99 | assert response2.headers == {b'Content-Type': [b'text/html']} 100 | 101 | # check .replace method 102 | response3 = response2.replace(status=404) 103 | assert response3.status == 404 104 | assert isinstance(response3, scrapy_splash.SplashTextResponse) 105 | for attr in ['url', 'real_url', 'headers', 'body']: 106 | assert getattr(response3, attr) == getattr(response2, attr) 107 | 108 | 109 | def test_dont_process_response(): 110 | mw = _get_mw() 111 | req = SplashRequest("http://example.com/", 112 | endpoint="render.html", 113 | dont_process_response=True, 114 | ) 115 | req2 = mw.process_request(req, None) 116 | resp = Response("http://example.com/") 117 | resp2 = mw.process_response(req2, resp, None) 118 | assert resp2.__class__ is Response 119 | assert resp2 is resp 120 | 121 | 122 | def test_splash_request_parameters(): 123 | mw = _get_mw() 124 | cookie_mw = _get_cookie_mw() 125 | 126 | def cb(): 127 | pass 128 | 129 | req = SplashRequest("http://example.com/#!start", cb, 'POST', 130 | body="foo=bar", 131 | splash_url="http://mysplash.example.com", 132 | slot_policy=SlotPolicy.SINGLE_SLOT, 133 | endpoint="execute", 134 | splash_headers={'X-My-Header': 'value'}, 135 | args={ 136 | "lua_source": "function main() end", 137 | "myarg": 3.0, 138 | }, 139 | magic_response=False, 140 | headers={'X-My-Header': 'value'} 141 | ) 142 | req2 = cookie_mw.process_request(req, None) or req 143 | req2 = mw.process_request(req2, None) or req2 144 | 145 | assert req2.meta['ajax_crawlable'] is True 146 | assert req2.meta['splash'] == { 147 | 'endpoint': 'execute', 148 | 'splash_url': "http://mysplash.example.com", 149 | 'slot_policy': SlotPolicy.SINGLE_SLOT, 150 | 'splash_headers': {'X-My-Header': 'value'}, 151 | 'magic_response': False, 152 | 'session_id': 'default', 153 | 'http_status_from_error_code': True, 154 | 'args': { 155 | 'url': "http://example.com/#!start", 156 | 'http_method': 'POST', 157 | 'body': 'foo=bar', 158 | 'cookies': [], 159 | 'lua_source': 'function main() end', 160 | 'myarg': 3.0, 161 | 'headers': { 162 | 'X-My-Header': 'value', 163 | } 164 | }, 165 | } 166 | assert req2.callback == cb 167 | assert req2.headers == { 168 | b'Content-Type': [b'application/json'], 169 | b'X-My-Header': [b'value'], 170 | } 171 | 172 | # check response post-processing 173 | res = { 174 | 'html': 'Hello', 175 | 'num_divs': 0.0, 176 | } 177 | res_body = json.dumps(res) 178 | response = TextResponse("http://mysplash.example.com/execute", 179 | # Scrapy doesn't pass request to constructor 180 | # request=req2, 181 | headers={b'Content-Type': b'application/json'}, 182 | body=res_body.encode('utf8')) 183 | response2 = mw.process_response(req2, response, None) 184 | response2 = cookie_mw.process_response(req2, response2, None) 185 | assert isinstance(response2, scrapy_splash.SplashJsonResponse) 186 | assert response2 is not response 187 | assert response2.real_url == req2.url 188 | assert response2.url == req.meta['splash']['args']['url'] 189 | assert response2.data == res 190 | assert response2.body == res_body.encode('utf8') 191 | assert response2.text == response2.text == res_body 192 | assert response2.encoding == 'utf8' 193 | assert response2.headers == {b'Content-Type': [b'application/json']} 194 | assert response2.splash_response_headers == response2.headers 195 | assert response2.status == response2.splash_response_status == 200 196 | 197 | 198 | def test_magic_response(): 199 | mw = _get_mw() 200 | cookie_mw = _get_cookie_mw() 201 | 202 | req = SplashRequest('http://example.com/', 203 | 
endpoint='execute', 204 | args={'lua_source': 'function main() end'}, 205 | magic_response=True, 206 | cookies=[{'name': 'foo', 'value': 'bar'}]) 207 | req = cookie_mw.process_request(req, None) or req 208 | req = mw.process_request(req, None) or req 209 | 210 | resp_data = { 211 | 'url': "http://exmaple.com/#id42", 212 | 'html': 'Hello 404', 213 | 'http_status': 404, 214 | 'headers': [ 215 | {'name': 'Content-Type', 'value': "text/html"}, 216 | {'name': 'X-My-Header', 'value': "foo"}, 217 | {'name': 'Set-Cookie', 'value': "bar=baz"}, 218 | ], 219 | 'cookies': [ 220 | {'name': 'foo', 'value': 'bar'}, 221 | {'name': 'bar', 'value': 'baz', 'domain': '.example.com'}, 222 | {'name': 'session', 'value': '12345', 'path': '/', 223 | 'expires': '2055-07-24T19:20:30Z'}, 224 | ], 225 | } 226 | resp = TextResponse("http://mysplash.example.com/execute", 227 | headers={b'Content-Type': b'application/json'}, 228 | body=json.dumps(resp_data).encode('utf8')) 229 | resp2 = mw.process_response(req, resp, None) 230 | resp2 = cookie_mw.process_response(req, resp2, None) 231 | assert isinstance(resp2, scrapy_splash.SplashJsonResponse) 232 | assert resp2.data == resp_data 233 | assert resp2.body == b'Hello 404' 234 | assert resp2.text == 'Hello 404' 235 | assert resp2.headers == { 236 | b'Content-Type': [b'text/html'], 237 | b'X-My-Header': [b'foo'], 238 | b'Set-Cookie': [b'bar=baz'], 239 | } 240 | assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']} 241 | assert resp2.status == 404 242 | assert resp2.splash_response_status == 200 243 | assert resp2.url == "http://exmaple.com/#id42" 244 | assert len(resp2.cookiejar) == 3 245 | cookies = [c for c in resp2.cookiejar] 246 | assert {(c.name, c.value) for c in cookies} == { 247 | ('bar', 'baz'), 248 | ('foo', 'bar'), 249 | ('session', '12345') 250 | } 251 | 252 | # send second request using the same session and check the resulting cookies 253 | req = SplashRequest('http://example.com/foo', 254 | endpoint='execute', 255 | args={'lua_source': 'function main() end'}, 256 | magic_response=True, 257 | cookies={'spam': 'ham'}) 258 | req = cookie_mw.process_request(req, None) or req 259 | req = mw.process_request(req, None) or req 260 | 261 | resp_data = { 262 | 'html': 'Hello', 263 | 'headers': [ 264 | {'name': 'Content-Type', 'value': "text/html"}, 265 | {'name': 'X-My-Header', 'value': "foo"}, 266 | {'name': 'Set-Cookie', 'value': "bar=baz"}, 267 | ], 268 | 'cookies': [ 269 | {'name': 'spam', 'value': 'ham'}, 270 | {'name': 'egg', 'value': 'spam'}, 271 | {'name': 'bar', 'value': 'baz', 'domain': '.example.com'}, 272 | #{'name': 'foo', 'value': ''}, -- this won't be in response 273 | {'name': 'session', 'value': '12345', 'path': '/', 274 | 'expires': '2056-07-24T19:20:30Z'}, 275 | ], 276 | } 277 | resp = TextResponse("http://mysplash.example.com/execute", 278 | headers={b'Content-Type': b'application/json'}, 279 | body=json.dumps(resp_data).encode('utf8')) 280 | resp2 = mw.process_response(req, resp, None) 281 | resp2 = cookie_mw.process_response(req, resp2, None) 282 | assert isinstance(resp2, scrapy_splash.SplashJsonResponse) 283 | assert resp2.data == resp_data 284 | cookies = [c for c in resp2.cookiejar] 285 | assert {c.name for c in cookies} == {'session', 'egg', 'bar', 'spam'} 286 | for c in cookies: 287 | if c.name == 'session': 288 | assert c.expires == 2731692030 289 | if c.name == 'spam': 290 | assert c.value == 'ham' 291 | 292 | 293 | def test_cookies(): 294 | mw = _get_mw() 295 | cookie_mw = _get_cookie_mw() 296 | 297 | def 
request_with_cookies(cookies): 298 | req = SplashRequest( 299 | 'http://example.com/foo', 300 | endpoint='execute', 301 | args={'lua_source': 'function main() end'}, 302 | magic_response=True, 303 | cookies=cookies) 304 | req = cookie_mw.process_request(req, None) or req 305 | req = mw.process_request(req, None) or req 306 | return req 307 | 308 | def response_with_cookies(req, cookies): 309 | resp_data = { 310 | 'html': 'Hello', 311 | 'headers': [], 312 | 'cookies': cookies, 313 | } 314 | resp = TextResponse( 315 | 'http://mysplash.example.com/execute', 316 | headers={b'Content-Type': b'application/json'}, 317 | body=json.dumps(resp_data).encode('utf8')) 318 | resp = mw.process_response(req, resp, None) 319 | resp = cookie_mw.process_response(req, resp, None) 320 | return resp 321 | 322 | # Concurent requests 323 | req1 = request_with_cookies({'spam': 'ham'}) 324 | req2 = request_with_cookies({'bom': 'bam'}) 325 | resp1 = response_with_cookies(req1, [ 326 | {'name': 'spam', 'value': 'ham'}, 327 | {'name': 'spam_x', 'value': 'ham_x'}, 328 | ]) 329 | resp2 = response_with_cookies(req2, [ 330 | {'name': 'spam', 'value': 'ham'}, # because req2 was made after req1 331 | {'name': 'bom_x', 'value': 'bam_x'}, 332 | ]) 333 | assert resp1.cookiejar is resp2.cookiejar 334 | cookies = {c.name: c.value for c in resp1.cookiejar} 335 | assert cookies == {'spam': 'ham', 'spam_x': 'ham_x', 'bom_x': 'bam_x'} 336 | 337 | # Removing already removed 338 | req1 = request_with_cookies({'spam': 'ham'}) 339 | req2 = request_with_cookies({'spam': 'ham', 'pom': 'pam'}) 340 | resp2 = response_with_cookies(req2, [ 341 | {'name': 'pom', 'value': 'pam'}, 342 | ]) 343 | resp1 = response_with_cookies(req1, []) 344 | assert resp1.cookiejar is resp2.cookiejar 345 | cookies = {c.name: c.value for c in resp1.cookiejar} 346 | assert cookies == {'pom': 'pam'} 347 | 348 | 349 | def test_magic_response2(): 350 | # check 'body' handling and another 'headers' format 351 | mw = _get_mw() 352 | req = SplashRequest('http://example.com/', magic_response=True, 353 | headers={'foo': 'bar'}, dont_send_headers=True) 354 | req = mw.process_request(req, None) or req 355 | assert 'headers' not in req.meta['splash']['args'] 356 | 357 | resp_data = { 358 | 'body': base64.b64encode(b"binary data").decode('ascii'), 359 | 'headers': {'Content-Type': 'text/plain'}, 360 | } 361 | resp = TextResponse("http://mysplash.example.com/execute", 362 | headers={b'Content-Type': b'application/json'}, 363 | body=json.dumps(resp_data).encode('utf8')) 364 | resp2 = mw.process_response(req, resp, None) 365 | assert resp2.data == resp_data 366 | assert resp2.body == b'binary data' 367 | assert resp2.headers == {b'Content-Type': [b'text/plain']} 368 | assert resp2.splash_response_headers == {b'Content-Type': [b'application/json']} 369 | assert resp2.status == resp2.splash_response_status == 200 370 | assert resp2.url == "http://example.com/" 371 | 372 | 373 | def test_unicode_url(): 374 | mw = _get_mw() 375 | req = SplashRequest( 376 | # note unicode URL 377 | u"http://example.com/", endpoint='execute') 378 | req2 = mw.process_request(req, None) or req 379 | res = {'html': 'Hello'} 380 | res_body = json.dumps(res) 381 | response = TextResponse("http://mysplash.example.com/execute", 382 | # Scrapy doesn't pass request to constructor 383 | # request=req2, 384 | headers={b'Content-Type': b'application/json'}, 385 | body=res_body.encode('utf8')) 386 | response2 = mw.process_response(req2, response, None) 387 | assert response2.url == "http://example.com/" 388 | 389 | 
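# --- Editor's note (not part of the original test module) ---------------------
# The tests above verify how SplashMiddleware turns a SplashRequest into a JSON
# POST to the Splash HTTP API.  Before the middleware runs, everything given to
# SplashRequest is stored under request.meta['splash'], which several asserts
# above rely on.  A tiny standalone sketch (URL and Lua snippet are made up):

from scrapy_splash import SplashRequest as _SketchSplashRequest

_sketch_req = _SketchSplashRequest(
    "http://example.com/page",
    endpoint="execute",
    args={"lua_source": "function main(splash) return splash:html() end",
          "wait": 0.5},
)
# Endpoint and args live in meta until the downloader middleware rewrites the
# request into a POST to <SPLASH_URL>/execute with these args as JSON body.
assert _sketch_req.meta["splash"]["endpoint"] == "execute"
assert _sketch_req.meta["splash"]["args"]["wait"] == 0.5
assert _sketch_req.meta["splash"]["args"]["url"] == "http://example.com/page"
# ------------------------------------------------------------------------------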
390 | def test_magic_response_http_error(): 391 | mw = _get_mw() 392 | req = SplashRequest('http://example.com/foo') 393 | req = mw.process_request(req, None) or req 394 | 395 | resp_data = { 396 | "info": { 397 | "error": "http404", 398 | "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404", 399 | "line_number": 3, 400 | "type": "LUA_ERROR", 401 | "source": "[string \"function main(splash)\r...\"]" 402 | }, 403 | "description": "Error happened while executing Lua script", 404 | "error": 400, 405 | "type": "ScriptError" 406 | } 407 | resp = TextResponse("http://mysplash.example.com/execute", status=400, 408 | headers={b'Content-Type': b'application/json'}, 409 | body=json.dumps(resp_data).encode('utf8')) 410 | resp = mw.process_response(req, resp, None) 411 | assert resp.data == resp_data 412 | assert resp.status == 404 413 | assert resp.splash_response_status == 400 414 | assert resp.url == "http://example.com/foo" 415 | 416 | 417 | def test_change_response_class_to_text(): 418 | mw = _get_mw() 419 | req = SplashRequest('http://example.com/', magic_response=True) 420 | req = mw.process_request(req, None) or req 421 | # Such response can come when downloading a file, 422 | # or returning splash:html(): the headers say it's binary, 423 | # but it can be decoded so it becomes a TextResponse. 424 | resp = TextResponse('http://mysplash.example.com/execute', 425 | headers={b'Content-Type': b'application/pdf'}, 426 | body=b'ascii binary data', 427 | encoding='utf-8') 428 | resp2 = mw.process_response(req, resp, None) 429 | assert isinstance(resp2, TextResponse) 430 | assert resp2.url == 'http://example.com/' 431 | assert resp2.headers == {b'Content-Type': [b'application/pdf']} 432 | assert resp2.body == b'ascii binary data' 433 | 434 | 435 | def test_change_response_class_to_json_binary(): 436 | mw = _get_mw() 437 | # We set magic_response to False, because it's not a kind of data we would 438 | # expect from splash: we just return binary data. 439 | # If we set magic_response to True, the middleware will fail, 440 | # but this is ok because magic_response presumes we are expecting 441 | # a valid splash json response. 442 | req = SplashRequest('http://example.com/', magic_response=False) 443 | req = mw.process_request(req, None) or req 444 | resp = Response('http://mysplash.example.com/execute', 445 | headers={b'Content-Type': b'application/json'}, 446 | body=b'non-decodable data: \x98\x11\xe7\x17\x8f', 447 | ) 448 | resp2 = mw.process_response(req, resp, None) 449 | assert isinstance(resp2, Response) 450 | assert resp2.url == 'http://example.com/' 451 | assert resp2.headers == {b'Content-Type': [b'application/json']} 452 | assert resp2.body == b'non-decodable data: \x98\x11\xe7\x17\x8f' 453 | 454 | 455 | def test_magic_response_caching(tmpdir): 456 | # prepare middlewares 457 | spider = scrapy.Spider(name='foo') 458 | crawler = _get_crawler({ 459 | 'HTTPCACHE_DIR': str(tmpdir.join('cache')), 460 | 'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage', 461 | 'HTTPCACHE_ENABLED': True 462 | }) 463 | cache_mw = HttpCacheMiddleware.from_crawler(crawler) 464 | mw = _get_mw() 465 | cookie_mw = _get_cookie_mw() 466 | 467 | def _get_req(): 468 | return SplashRequest( 469 | url="http://example.com", 470 | endpoint='execute', 471 | magic_response=True, 472 | args={'lua_source': 'function main(splash) end'}, 473 | ) 474 | 475 | # Emulate Scrapy middleware chain. 
476 | 477 | # first call 478 | req = _get_req() 479 | req = cookie_mw.process_request(req, spider) or req 480 | req = mw.process_request(req, spider) or req 481 | req = cache_mw.process_request(req, spider) or req 482 | assert isinstance(req, scrapy.Request) # first call; the cache is empty 483 | 484 | resp_data = { 485 | 'html': "<html><body>Hello</body></html>", 486 | 'render_time': 0.5, 487 | } 488 | resp_body = json.dumps(resp_data).encode('utf8') 489 | resp = TextResponse("http://example.com", 490 | headers={b'Content-Type': b'application/json'}, 491 | body=resp_body) 492 | 493 | resp2 = cache_mw.process_response(req, resp, spider) 494 | resp3 = mw.process_response(req, resp2, spider) 495 | resp3 = cookie_mw.process_response(req, resp3, spider) 496 | 497 | assert resp3.text == "<html><body>Hello</body></html>" 498 | assert resp3.css("body").extract_first() == "<body>Hello</body>" 499 | assert resp3.data['render_time'] == 0.5 500 | 501 | # second call 502 | req = _get_req() 503 | req = cookie_mw.process_request(req, spider) or req 504 | req = mw.process_request(req, spider) or req 505 | cached_resp = cache_mw.process_request(req, spider) or req 506 | 507 | # response should be from cache: 508 | assert cached_resp.__class__ is JsonResponse 509 | assert cached_resp.body == resp_body 510 | resp2_1 = cache_mw.process_response(req, cached_resp, spider) 511 | resp3_1 = mw.process_response(req, resp2_1, spider) 512 | resp3_1 = cookie_mw.process_response(req, resp3_1, spider) 513 | 514 | assert isinstance(resp3_1, scrapy_splash.SplashJsonResponse) 515 | assert resp3_1.body == b"<html><body>Hello</body></html>" 516 | assert resp3_1.text == "<html><body>Hello</body></html>" 517 | assert resp3_1.css("body").extract_first() == "<body>Hello</body>" 518 | assert resp3_1.data['render_time'] == 0.5 519 | assert resp3_1.headers[b'Content-Type'] == b'text/html; charset=utf-8' 520 | 521 | 522 | def test_cache_args(): 523 | spider = scrapy.Spider(name='foo') 524 | mw = _get_mw() 525 | mw.crawler.spider = spider 526 | mw.spider_opened(spider) 527 | dedupe_mw = SplashDeduplicateArgsMiddleware() 528 | 529 | # ========= Send first request - it should use save_args: 530 | lua_source = 'function main(splash) end' 531 | req = SplashRequest('http://example.com/foo', 532 | endpoint='execute', 533 | args={'lua_source': lua_source}, 534 | cache_args=['lua_source']) 535 | 536 | assert req.meta['splash']['args']['lua_source'] == lua_source 537 | # <---- spider 538 | req, = list(dedupe_mw.process_start_requests([req], spider)) 539 | # ----> scheduler 540 | assert req.meta['splash']['args']['lua_source'] != lua_source 541 | assert list(mw._argument_values.values()) == [lua_source] 542 | assert list(mw._argument_values.keys()) == [req.meta['splash']['args']['lua_source']] 543 | # <---- scheduler 544 | # process request before sending it to the downloader 545 | req = mw.process_request(req, spider) or req 546 | # -----> downloader 547 | assert req.meta['splash']['args']['lua_source'] == lua_source 548 | assert req.meta['splash']['args']['save_args'] == ['lua_source'] 549 | assert 'load_args' not in req.meta['splash']['args'] 550 | assert req.meta['splash']['_local_arg_fingerprints'] == { 551 | 'lua_source': list(mw._argument_values.keys())[0] 552 | } 553 | # <---- downloader 554 | resp_body = b'{}' 555 | resp = TextResponse("http://example.com", 556 | headers={ 557 | b'Content-Type': b'application/json', 558 | b'X-Splash-Saved-Arguments': b'lua_source=ba001160ef96fe2a3f938fea9e6762e204a562b3' 559 | }, 560 | body=resp_body) 561 | resp = mw.process_response(req, resp, None) 562 | 563 | # ============ Send second request - it should use load_args 564 |
req2 = SplashRequest('http://example.com/bar', 565 | endpoint='execute', 566 | args={'lua_source': lua_source}, 567 | cache_args=['lua_source']) 568 | req2, item = list(dedupe_mw.process_spider_output(resp, [req2, {'key': 'value'}], spider)) 569 | assert item == {'key': 'value'} 570 | # ----> scheduler 571 | assert req2.meta['splash']['args']['lua_source'] != lua_source 572 | # <---- scheduler 573 | # process request before sending it to the downloader 574 | req2 = mw.process_request(req2, spider) or req2 575 | # -----> downloader 576 | assert req2.meta['splash']['args']['load_args'] == {"lua_source": "ba001160ef96fe2a3f938fea9e6762e204a562b3"} 577 | assert "lua_source" not in req2.meta['splash']['args'] 578 | assert "save_args" not in req2.meta['splash']['args'] 579 | assert json.loads(req2.body.decode('utf8')) == { 580 | 'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'}, 581 | 'url': 'http://example.com/bar' 582 | } 583 | # <---- downloader 584 | resp = TextResponse("http://example.com/bar", 585 | headers={b'Content-Type': b'application/json'}, 586 | body=b'{}') 587 | resp = mw.process_response(req, resp, spider) 588 | 589 | # =========== Third request is dispatched to another server where 590 | # =========== arguments are expired: 591 | req3 = SplashRequest('http://example.com/baz', 592 | endpoint='execute', 593 | args={'lua_source': lua_source}, 594 | cache_args=['lua_source']) 595 | req3, = list(dedupe_mw.process_spider_output(resp, [req3], spider)) 596 | # ----> scheduler 597 | assert req3.meta['splash']['args']['lua_source'] != lua_source 598 | # <---- scheduler 599 | req3 = mw.process_request(req3, spider) or req3 600 | # -----> downloader 601 | assert json.loads(req3.body.decode('utf8')) == { 602 | 'load_args': {'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'}, 603 | 'url': 'http://example.com/baz' 604 | } 605 | # <---- downloader 606 | 607 | resp_body = json.dumps({ 608 | "type": "ExpiredArguments", 609 | "description": "Arguments stored with ``save_args`` are expired", 610 | "info": {"expired": ["html"]}, 611 | "error": 498 612 | }) 613 | resp = TextResponse("127.0.0.1:8050", 614 | headers={b'Content-Type': b'application/json'}, 615 | status=498, 616 | body=resp_body.encode('utf8')) 617 | req4 = mw.process_response(req3, resp, spider) 618 | assert isinstance(req4, SplashRequest) 619 | 620 | # process this request again 621 | req4, = list(dedupe_mw.process_spider_output(resp, [req4], spider)) 622 | req4 = mw.process_request(req4, spider) or req4 623 | 624 | # it should become save_args request after all middlewares 625 | assert json.loads(req4.body.decode('utf8')) == { 626 | 'lua_source': 'function main(splash) end', 627 | 'save_args': ['lua_source'], 628 | 'url': 'http://example.com/baz' 629 | } 630 | assert mw._remote_keys == {} 631 | 632 | 633 | def test_splash_request_no_url(): 634 | mw = _get_mw() 635 | lua_source = "function main(splash) return {result='ok'} end" 636 | req1 = SplashRequest(meta={'splash': { 637 | 'args': {'lua_source': lua_source}, 638 | 'endpoint': 'execute', 639 | }}) 640 | req = mw.process_request(req1, None) 641 | assert req.url == 'http://127.0.0.1:8050/execute' 642 | assert json.loads(to_unicode(req.body)) == { 643 | 'url': 'about:blank', 644 | 'lua_source': lua_source 645 | } 646 | 647 | 648 | def test_post_request(): 649 | mw = _get_mw() 650 | for body in [b'', b'foo=bar']: 651 | req1 = scrapy.Request("http://example.com", 652 | method="POST", 653 | body=body, 654 | meta={'splash': {'endpoint': 'render.html'}}) 655 | 
req = mw.process_request(req1, None) 656 | assert json.loads(to_unicode(req.body)) == { 657 | 'url': 'http://example.com', 658 | 'http_method': 'POST', 659 | 'body': to_unicode(body), 660 | } 661 | 662 | 663 | def test_override_splash_url(): 664 | mw = _get_mw() 665 | req1 = scrapy.Request("http://example.com", meta={ 666 | 'splash': { 667 | 'endpoint': 'render.png', 668 | 'splash_url': 'http://splash.example.com' 669 | } 670 | }) 671 | req = mw.process_request(req1, None) 672 | req = mw.process_request(req, None) or req 673 | assert req.url == 'http://splash.example.com/render.png' 674 | assert json.loads(to_unicode(req.body)) == {'url': req1.url} 675 | 676 | 677 | def test_url_with_fragment(): 678 | mw = _get_mw() 679 | url = "http://example.com#id1" 680 | req = scrapy.Request("http://example.com", meta={ 681 | 'splash': {'args': {'url': url}} 682 | }) 683 | req = mw.process_request(req, None) or req 684 | assert json.loads(to_unicode(req.body)) == {'url': url} 685 | 686 | 687 | def test_splash_request_url_with_fragment(): 688 | mw = _get_mw() 689 | url = "http://example.com#id1" 690 | req = SplashRequest(url) 691 | req = mw.process_request(req, None) or req 692 | assert json.loads(to_unicode(req.body)) == {'url': url} 693 | 694 | 695 | def test_float_wait_arg(): 696 | mw = _get_mw() 697 | req1 = scrapy.Request("http://example.com", meta={ 698 | 'splash': { 699 | 'endpoint': 'render.html', 700 | 'args': {'wait': 0.5} 701 | } 702 | }) 703 | req = mw.process_request(req1, None) 704 | assert json.loads(to_unicode(req.body)) == {'url': req1.url, 'wait': 0.5} 705 | 706 | 707 | def test_slot_policy_single_slot(): 708 | mw = _get_mw() 709 | meta = {'splash': { 710 | 'slot_policy': scrapy_splash.SlotPolicy.SINGLE_SLOT 711 | }} 712 | 713 | req1 = scrapy.Request("http://example.com/path?key=value", meta=meta) 714 | req1 = mw.process_request(req1, None) 715 | 716 | req2 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta) 717 | req2 = mw.process_request(req2, None) 718 | 719 | assert req1.meta.get('download_slot') 720 | assert req1.meta['download_slot'] == req2.meta['download_slot'] 721 | 722 | 723 | def test_slot_policy_per_domain(): 724 | mw = _get_mw() 725 | meta = {'splash': { 726 | 'slot_policy': scrapy_splash.SlotPolicy.PER_DOMAIN 727 | }} 728 | 729 | req1 = scrapy.Request("http://example.com/path?key=value", meta=meta) 730 | req1 = mw.process_request(req1, None) 731 | 732 | req2 = scrapy.Request("http://example.com/path2", meta=meta) 733 | req2 = mw.process_request(req2, None) 734 | 735 | req3 = scrapy.Request("http://fooexample.com/path?key=value", meta=meta) 736 | req3 = mw.process_request(req3, None) 737 | 738 | assert req1.meta.get('download_slot') 739 | assert req3.meta.get('download_slot') 740 | 741 | assert req1.meta['download_slot'] == req2.meta['download_slot'] 742 | assert req1.meta['download_slot'] != req3.meta['download_slot'] 743 | 744 | 745 | def test_slot_policy_scrapy_default(): 746 | mw = _get_mw() 747 | req = scrapy.Request("http://example.com", meta={'splash': { 748 | 'slot_policy': scrapy_splash.SlotPolicy.SCRAPY_DEFAULT 749 | }}) 750 | req = mw.process_request(req, None) 751 | assert 'download_slot' not in req.meta 752 | 753 | 754 | def test_adjust_timeout(): 755 | mw = _get_mw() 756 | req1 = scrapy.Request("http://example.com", meta={ 757 | 'splash': {'args': {'timeout': 60, 'html': 1}}, 758 | 759 | # download_timeout is always present, 760 | # it is set by DownloadTimeoutMiddleware 761 | 'download_timeout': 30, 762 | }) 763 | req1 = 
mw.process_request(req1, None) 764 | assert req1.meta['download_timeout'] > 60 765 | 766 | req2 = scrapy.Request("http://example.com", meta={ 767 | 'splash': {'args': {'html': 1}}, 768 | 'download_timeout': 30, 769 | }) 770 | req2 = mw.process_request(req2, None) 771 | assert req2.meta['download_timeout'] == 30 772 | 773 | 774 | def test_auth(): 775 | def assert_auth_header(user, pwd, header): 776 | mw = _get_mw({'SPLASH_USER': user, 'SPLASH_PASS': pwd}) 777 | req = mw.process_request(SplashRequest("http://example.com"), None) 778 | assert 'Authorization' in req.headers 779 | assert req.headers['Authorization'] == header 780 | 781 | def assert_no_auth_header(user, pwd): 782 | if user is not None or pwd is not None: 783 | mw = _get_mw({'SPLASH_USER': user, 'SPLASH_PASS': pwd}) 784 | else: 785 | mw = _get_mw() 786 | req = mw.process_request(SplashRequest("http://example.com"), None) 787 | assert 'Authorization' not in req.headers 788 | 789 | assert_auth_header('root', '', b'Basic cm9vdDo=') 790 | assert_auth_header('root', 'pwd', b'Basic cm9vdDpwd2Q=') 791 | assert_auth_header('', 'pwd', b'Basic OnB3ZA==') 792 | 793 | assert_no_auth_header('', '') 794 | assert_no_auth_header(None, None) -------------------------------------------------------------------------------- /tests/test_request.py: -------------------------------------------------------------------------------- 1 | try: 2 | from urllib.parse import parse_qs 3 | except ImportError: 4 | from urlparse import parse_qs 5 | 6 | from scrapy.http import HtmlResponse 7 | from scrapy_splash import SplashRequest, SplashFormRequest 8 | 9 | 10 | def test_meta_None(): 11 | req1 = SplashRequest('http://example.com') 12 | req2 = SplashRequest('http://example.com', meta=None) 13 | assert req1.meta == req2.meta 14 | 15 | 16 | def test_splash_form_request(): 17 | req = SplashFormRequest( 18 | 'http://example.com', formdata={'foo': 'bar'}) 19 | assert req.method == 'POST' 20 | assert req.body == b'foo=bar' 21 | assert req.meta['splash']['args']['url'] == 'http://example.com' 22 | 23 | req = SplashFormRequest( 24 | 'http://example.com', method='GET', formdata={'foo': 'bar'}, 25 | endpoint='execute') 26 | assert req.method == 'GET' 27 | assert req.body == b'' 28 | assert req.url == req.meta['splash']['args']['url'] ==\ 29 | 'http://example.com?foo=bar' 30 | assert req.meta['splash']['endpoint'] == 'execute' 31 | 32 | 33 | def test_form_request_from_response(): 34 | # Copied from scrapy tests (test_from_response_submit_not_first_clickable) 35 | def _buildresponse(body, **kwargs): 36 | kwargs.setdefault('body', body) 37 | kwargs.setdefault('url', 'http://example.com') 38 | kwargs.setdefault('encoding', 'utf-8') 39 | return HtmlResponse(**kwargs) 40 | response = _buildresponse( 41 | """
<form action="get.php" method="GET"> 42 | <input type="submit" name="clickable1" value="clicked1"> 43 | <input type="hidden" name="one" value="1"> 44 | <input type="hidden" name="two" value="3"> 45 | <input type="submit" name="clickable2" value="clicked2"> 46 | </form>
""") 47 | req = SplashFormRequest.from_response( 48 | response, formdata={'two': '2'}, clickdata={'name': 'clickable2'}) 49 | assert req.method == 'GET' 50 | assert req.meta['splash']['args']['url'] == req.url 51 | fs = parse_qs(req.url.partition('?')[2], True) 52 | assert fs['clickable2'] == ['clicked2'] 53 | assert 'clickable1' not in fs 54 | assert fs['one'] == ['1'] 55 | assert fs['two'] == ['2'] 56 | 57 | 58 | def test_splash_request_meta(): 59 | meta = {'foo': 'bar'} 60 | req = SplashRequest('http://example.com', meta=meta) 61 | assert 'splash' in req.meta 62 | assert req.meta['foo'] == 'bar' 63 | assert meta == {'foo': 'bar'} 64 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | import json 4 | 5 | from hypothesis import given, assume 6 | from hypothesis import strategies as st 7 | from scrapy.http import Headers 8 | from scrapy_splash.utils import ( 9 | headers_to_scrapy, 10 | _fast_hash, 11 | json_based_hash, 12 | dict_hash 13 | ) 14 | 15 | 16 | def test_headers_to_scrapy(): 17 | assert headers_to_scrapy(None) == Headers() 18 | assert headers_to_scrapy({}) == Headers() 19 | assert headers_to_scrapy([]) == Headers() 20 | 21 | html_headers = Headers({'Content-Type': 'text/html'}) 22 | 23 | assert headers_to_scrapy({'Content-Type': 'text/html'}) == html_headers 24 | assert headers_to_scrapy([('Content-Type', 'text/html')]) == html_headers 25 | assert headers_to_scrapy([{'name': 'Content-Type', 'value': 'text/html'}]) == html_headers 26 | 27 | 28 | _primitive = ( 29 | st.floats(allow_infinity=False, allow_nan=False) | 30 | st.booleans() | 31 | st.text() | 32 | st.none() | 33 | st.integers() 34 | ) 35 | _data = st.recursive(_primitive, 36 | lambda children: ( 37 | children | 38 | st.lists(children) | 39 | st.tuples(children) | 40 | st.dictionaries(st.text(), children) | 41 | st.tuples(st.just('h'), children) 42 | ), 43 | max_leaves=5, 44 | ) 45 | _data_notuples = st.recursive(_primitive, 46 | lambda children: ( 47 | children | 48 | st.lists(children) | 49 | st.dictionaries(st.text(), children) 50 | ), 51 | max_leaves=5, 52 | ) 53 | 54 | 55 | @given(_data, _data) 56 | def test_fast_hash(val1, val2): 57 | def _dump(v): 58 | return json.dumps(v, sort_keys=True) 59 | assume(_dump(val1) != _dump(val2)) 60 | assert _fast_hash(val1) == _fast_hash(val1) 61 | assert _fast_hash(val1) != _fast_hash(val2) 62 | 63 | 64 | @given(_data, _data) 65 | def test_dict_hash(val1, val2): 66 | assume(val1 != val2) 67 | assert dict_hash(val1) == dict_hash(val1) 68 | assert dict_hash(val1) != dict_hash(val2) 69 | 70 | 71 | @given(_data_notuples, _data_notuples) 72 | def test_json_based_hash(val1, val2): 73 | assume(val1 != val2) 74 | assert json_based_hash(val1) == json_based_hash(val1) 75 | assert json_based_hash(val1) != json_based_hash(val2) 76 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import pytest 4 | from pytest_twisted import inlineCallbacks 5 | from twisted.internet.defer import returnValue 6 | from scrapy.crawler import Crawler 7 | 8 | from .mockserver import MockServer 9 | 10 | 11 | requires_splash = pytest.mark.skipif( 12 | not os.environ.get('SPLASH_URL', ''), 13 | reason="set SPLASH_URL environment variable to 
run integrational tests" 14 | ) 15 | 16 | 17 | @inlineCallbacks 18 | def crawl_items( 19 | spider_cls, 20 | resource_cls, 21 | settings, 22 | spider_kwargs=None, 23 | url_path="", 24 | ): 25 | """ Use spider_cls to crawl resource_cls. URL of the resource is passed 26 | to the spider as ``url`` argument. 27 | Return ``(items, resource_url, crawler)`` tuple. 28 | """ 29 | spider_kwargs = {} if spider_kwargs is None else spider_kwargs 30 | crawler = make_crawler(spider_cls, settings) 31 | with MockServer(resource_cls) as s: 32 | print("mock server", s.root_url) 33 | root_url = s.root_url + url_path 34 | yield crawler.crawl(url=root_url, **spider_kwargs) 35 | items = getattr(crawler.spider, 'collected_items', []) 36 | result = items, root_url, crawler 37 | returnValue(result) 38 | 39 | 40 | def make_crawler(spider_cls, settings): 41 | if not getattr(spider_cls, 'name', None): 42 | class Spider(spider_cls): 43 | name = 'test_spider' 44 | Spider.__name__ = spider_cls.__name__ 45 | Spider.__module__ = spider_cls.__module__ 46 | spider_cls = Spider 47 | return Crawler(spider_cls, settings) 48 | 49 | 50 | class CollectorPipeline: 51 | def process_item(self, item, spider): 52 | if not hasattr(spider, 'collected_items'): 53 | spider.collected_items = [] 54 | spider.collected_items.append(item) 55 | return item 56 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py37,py38,py39,py310,py311 8 | 9 | [common] 10 | deps = 11 | pytest >= 3.3.2 12 | pytest-cov >= 2.5.1 13 | pytest-twisted >= 1.6 14 | pytest-xdist >= 1.22 15 | hypothesis >= 3.44.14 16 | hypothesis-pytest 17 | service_identity 18 | 19 | [testenv] 20 | passenv = SPLASH_URL 21 | deps = 22 | {[common]deps} 23 | scrapy 24 | commands = 25 | pip install -e . 26 | py.test --doctest-modules --doctest-glob '*.py,*.rst' --cov=scrapy_splash --cov-report=xml {posargs:README.rst scrapy_splash tests} 27 | --------------------------------------------------------------------------------
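As a closing illustration of what the property-based tests in tests/test_utils.py assert about the hashing helpers: they check determinism and sensitivity to content, the two properties that make these helpers usable for request fingerprinting. A concrete, hedged sketch of the same two properties (the argument values below are made up):

    from scrapy_splash.utils import dict_hash, json_based_hash

    args = {"lua_source": "function main(splash) end", "wait": 0.5}

    # Deterministic: hashing the same value twice gives the same digest.
    assert dict_hash(args) == dict_hash(args)
    assert json_based_hash(args) == json_based_hash(args)

    # Content-sensitive: a different value gives a different digest.
    assert dict_hash(args) != dict_hash({"lua_source": "function main(splash) end",
                                         "wait": 0.7})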