├── .coveragerc ├── .gitignore ├── .travis.yml ├── .whitesource ├── AUTHORS ├── INSTALL ├── LICENSE ├── MANIFEST.in ├── Pipfile ├── README.rst ├── codecov.yml ├── docker ├── MongoDB │ └── docker-compose.yml └── Redis │ └── docker-compose.yml ├── docs ├── Makefile ├── _ext │ └── scrapydocs.py ├── _static │ └── selectors-sample1.html ├── _templates │ └── layout.html ├── conf.py ├── conf.py.bak ├── index.rst ├── intro │ ├── examples.rst │ ├── installation.rst │ ├── overview.rst │ └── tutorial.rst ├── make.bat ├── requirements.txt └── topics │ ├── cookiesmiddleware.rst │ ├── settings.rst │ └── storage.rst ├── pytest.ini ├── renovate.json ├── requirements.txt ├── scrapy_cookies ├── VERSION ├── __init__.py ├── downloadermiddlewares │ ├── __init__.py │ └── cookies.py ├── settings │ ├── __init__.py │ └── default_settings.py ├── signals.py └── storage │ ├── __init__.py │ ├── in_memory.py │ ├── mongo.py │ ├── redis_.py │ └── sqlite.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── requirements.txt ├── test_downloadermiddleware_cookies.py └── test_storages │ ├── __init__.py │ ├── confest.py │ ├── docker-compose.yml │ ├── test_storage_in_memory.py │ ├── test_storage_mongo.py │ ├── test_storage_redis.py │ └── test_storage_sqlite.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = true 3 | include = scrapy_cookies/* 4 | omit = 5 | tests/* 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # .gitignore contents are coming from the project gitignore: 2 | # https://github.com/github/gitignore 3 | 4 | 5 | # ----------------------------------------------------------------------------- 6 | # Python 7 | # ----------------------------------------------------------------------------- 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .env 94 | .venv 95 | env/ 96 | venv/ 97 | ENV/ 98 | env.bak/ 99 | venv.bak/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | 114 | 115 | # ----------------------------------------------------------------------------- 116 | # JetBrains 117 | # ----------------------------------------------------------------------------- 118 | 119 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, 120 | # Android Studio and WebStorm 121 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 122 | 123 | # User-specific stuff 124 | .idea/**/workspace.xml 125 | .idea/**/tasks.xml 126 | .idea/**/usage.statistics.xml 127 | .idea/**/dictionaries 128 | .idea/**/shelf 129 | 130 | # Sensitive or high-churn files 131 | .idea/**/dataSources/ 132 | .idea/**/dataSources.ids 133 | .idea/**/dataSources.local.xml 134 | .idea/**/sqlDataSources.xml 135 | .idea/**/dynamic.xml 136 | .idea/**/uiDesigner.xml 137 | .idea/**/dbnavigator.xml 138 | 139 | # Gradle 140 | .idea/**/gradle.xml 141 | .idea/**/libraries 142 | 143 | # Gradle and Maven with auto-import 144 | # When using Gradle or Maven with auto-import, you should exclude module files, 145 | # since they will be recreated, and may cause churn. Uncomment if using 146 | # auto-import. 
147 | # .idea/modules.xml 148 | # .idea/*.iml 149 | # .idea/modules 150 | 151 | # CMake 152 | cmake-build-*/ 153 | 154 | # Mongo Explorer plugin 155 | .idea/**/mongoSettings.xml 156 | 157 | # File-based project format 158 | *.iws 159 | 160 | # IntelliJ 161 | out/ 162 | 163 | # mpeltonen/sbt-idea plugin 164 | .idea_modules/ 165 | 166 | # JIRA plugin 167 | atlassian-ide-plugin.xml 168 | 169 | # Cursive Clojure plugin 170 | .idea/replstate.xml 171 | 172 | # Crashlytics plugin (for Android Studio and IntelliJ) 173 | com_crashlytics_export_strings.xml 174 | crashlytics.properties 175 | crashlytics-build.properties 176 | fabric.properties 177 | 178 | # Editor-based Rest Client 179 | .idea/httpRequests 180 | 181 | 182 | # ----------------------------------------------------------------------------- 183 | # Linux 184 | # ----------------------------------------------------------------------------- 185 | 186 | *~ 187 | 188 | # temporary files which can be created if a process still has a handle open of 189 | # a deleted file 190 | .fuse_hidden* 191 | 192 | # KDE directory preferences 193 | .directory 194 | 195 | # Linux trash folder which might appear on any partition or disk 196 | .Trash-* 197 | 198 | # .nfs files are created when an open file is removed but is still being 199 | # accessed 200 | .nfs* 201 | 202 | 203 | # ----------------------------------------------------------------------------- 204 | # Vim 205 | # ----------------------------------------------------------------------------- 206 | 207 | # Swap 208 | [._]*.s[a-v][a-z] 209 | [._]*.sw[a-p] 210 | [._]s[a-rt-v][a-z] 211 | [._]ss[a-gi-z] 212 | [._]sw[a-p] 213 | 214 | # Session 215 | Session.vim 216 | 217 | # Temporary 218 | .netrwhist 219 | *~ 220 | # Auto-generated tag files 221 | tags 222 | # Persistent undo 223 | [._]*.un~ 224 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | services: 3 | - mongodb 4 | - redis-server 5 | sudo: false 6 | branches: 7 | only: 8 | - master 9 | - /^\d\.\d+$/ 10 | - /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/ 11 | matrix: 12 | include: 13 | - python: 3.6 14 | env: TOXENV=py36 15 | - python: 3.7 16 | env: TOXENV=py37 17 | dist: xenial 18 | - python: 3.6 19 | env: TOXENV=docs 20 | install: 21 | - pip install -U tox twine wheel codecov 22 | script: tox 23 | after_success: 24 | - codecov 25 | notifications: 26 | slack: 27 | rooms: 28 | - secure: zsDJgHzhPoAIs8OsOiv5wmNsck++hjZljeYAfKh25UwW8X97Rqvq5r9LMlQzIHf2a638AbsubDNeSbrxxu6cFDeeIFngG1EO5mOSWzKr18LM5pFb0GVlImKzZpKgLqKmaD5ATYXnvUaHjEHgO45TzjIsbwo9P4vRU5C/lGYwfdv/J82hP0OUo02HqWGkwpG0aeuzs1bJZKjS/RdHROt0SQpRfVB8hi4HHrQILgliuVcpvIk46FgRB49VmzpAGuQfJtB06gj8o6tL/1JlXQ9/ElrHwEJyGjiyeP/nP8qit+i9TTlHGT9k0s9oYuXWM8OlgKfKE13Mo8fVRaAhVv9DRcwtNpX5M0RtC5bEjCPQIL14ky4ymeSlGchmy37jTKJCNHm4St4CtodCrF5J77h8Gkjx9tkZOhf4Rd8veMMgv/gj8pyt3asJ8PMDvREjF4n4mRPy5SB53anEhrFXE801KOpqb4Ffsjv2DBJmuAId+OmHLs69jHeiwxkBDaeKDr6rpiiKQaZNbDw5KxjEafEtclVmSdprq57Og2SPaCR1TrUu3SVyUjVoWNj6olKS9ALoiDAVLBprbyBsSS9gYwfTlSBNxCsMApQksjmo0/S6n/FwyCvn4AZZVziLVtVxPBY0sUpRBNySkTTrQzpCiEPNmv7tU8d0ZcVI508/WALzIQ0= 29 | cache: 30 | directories: 31 | - $HOME/.cache/pip 32 | #deploy: 33 | # provider: pypi 34 | # distributions: sdist bdist_wheel 35 | # user: grammy.jiang 36 | # password: 37 | # secure: 
nUWjH3+9D9I+Xrsz7isjVKpzXwxlJuWFi2OrWCMSilxUNaMrV/4fA0TShmS40TCxTGSasmApjZgZz+Qu93Z9KlHnP2nmBsEXnqtrrCMIhI52wLFdnMcTCNCutzOiKzVSMK/SvEvRP6+fcWRbsE0n0hVwUZc/Lwz4083OXoMQIuMs3NbVD0rAPcHTBthTwabQjSp8WwYv9wZj/pZQ7qYw+QOe+b8XhQIIA10Oy9rAcyaOGASMsbBithKap91ayj2yRqmM6kb+nwi4aEJo/+XwQuncJWleTOy88Rt+YtoYkDxoHopjwBR2RAoevfq0Y1Mjl9e1mssunzQ053qmXfKAFB77Xn99iR0bmwSwwCtyBgoY/Ed5+wywwNdE6tfNB8/pYXg3z6mTmIwqXCQhn2+ORdD5RFn9RZAV4IoR1z8WRKU6clsVF2Msc9QUsj5wUA7LXkBg4HlVJurZurbFh58ViVTO2aNo6c+7fBiBwbm2aupeB+RlL9kCz14pbJcd89H6ViWByE6O9pFlyNcEt28FaKLIuyWAsAsYOPnj74oYuoV2hZ7y0259ncGX0UsDVzPwaJ/NlQsi4yh2d300mRvOSbiELhBZdABbkN+pgGmE1mlqUkY9GHb070JsOavzedzsuEgBLAaWgTAxeDd9LqFfIE7iFLj+U9v9d73ZtKy4VeE= 38 | # on: 39 | # branch: master 40 | ## tags: true 41 | # repo: grammy-jiang/scrapy-cookies 42 | # condition: "$TOXENV == py36" 43 | -------------------------------------------------------------------------------- /.whitesource: -------------------------------------------------------------------------------- 1 | { 2 | "generalSettings": { 3 | "shouldScanRepo": true 4 | }, 5 | "checkRunSettings": { 6 | "vulnerableCheckRunConclusionLevel": "failure" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Scrapy-Cookies is a part of Scrapy Enhancement, which intends to explore more 2 | possibility of Scrapy. Once the code is proved useful and stable, it will be 3 | merged back to Scrapy or contributed back to the Scrapy Plugins. 4 | 5 | Here is the list of the primary authors & contributors: 6 | 7 | * Grammy Jiang 8 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | For information about installing Scrapy-Cookies see: 2 | 3 | * docs/intro/installation.rst (local file) 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapy Enhancement developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions, and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions, and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Scrapy nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include AUTHORS 3 | include INSTALL 4 | include LICENSE 5 | include MANIFEST.in 6 | include scrapy_cookies/VERSION 7 | recursive-include docs * 8 | prune docs/build 9 | recursive-include tests * 10 | global-exclude __pycache__ *.py[cod] 11 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | bandit = "*" 8 | black = "*" 9 | flake8 = "*" 10 | flake8-bugbear = "*" 11 | ipython = "*" 12 | isort = "*" 13 | mitmproxy = "*" 14 | mypy = "*" 15 | pre-commit = "*" 16 | prospector = "*" 17 | pylint = "*" 18 | pytest = "*" 19 | pytest-benchmark = "*" 20 | pytest-black = "*" 21 | pytest-cov = "*" 22 | pytest-docker-compose = "*" 23 | pytest-env = "*" 24 | pytest-instafail = "*" 25 | pytest-mypy = "*" 26 | pytest-pycharm = "*" 27 | pytest-pylint = "*" 28 | pytest-sugar = "*" 29 | pytest-twisted = "*" 30 | pytest-watch = "*" 31 | pytest-xdist = "*" 32 | radon = "*" 33 | tox = "*" 34 | testfixtures = "*" 35 | 36 | [packages] 37 | hiredis = "*" 38 | pymongo = "*" 39 | redis = "*" 40 | six = "*" 41 | ujson = "*" 42 | Scrapy = "*" 43 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Scrapy Cookies 3 | ============== 4 | 5 | .. image:: https://img.shields.io/pypi/v/scrapy-cookies.svg 6 | :target: https://pypi.python.org/pypi/scrapy-cookies 7 | :alt: PyPI 8 | 9 | .. image:: https://img.shields.io/pypi/pyversions/scrapy-cookies.svg 10 | :target: https://pypi.python.org/pypi/scrapy-cookies 11 | :alt: PyPI - Python Version 12 | 13 | .. image:: https://img.shields.io/travis/scrapedia/scrapy-cookies/master.svg 14 | :target: http://travis-ci.org/scrapedia/scrapy-cookies 15 | :alt: Travis branch 16 | 17 | .. image:: https://img.shields.io/pypi/wheel/scrapy-cookies.svg 18 | :target: https://pypi.python.org/pypi/scrapy-cookies 19 | :alt: PyPI - Wheel 20 | 21 | .. image:: https://img.shields.io/codecov/c/github/scrapedia/scrapy-cookies/master.svg 22 | :target: http://codecov.io/github/scrapedia/scrapy-cookies?branch=master 23 | :alt: Codecov branch 24 | 25 | Overview 26 | ======== 27 | 28 | This middleware enable Scrapy manage, save and restore cookies in various ways. 29 | With this middleware Scrapy can easily re-use cookies which saved before or 30 | in multiple spiders, and share cookies between spiders, even in spider-cluster. 
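A minimal configuration (the same snippet walked through in ``docs/intro/overview.rst``) replaces
the built-in cookies middleware with the one from this package in your project's ``settings.py``::

    DOWNLOADER_MIDDLEWARES.update({
        'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
        'scrapy_cookies.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    })

    COOKIES_ENABLED = True
    COOKIES_PERSISTENCE = True
    COOKIES_PERSISTENCE_DIR = 'cookies'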
31 | 32 | Requirements 33 | ============ 34 | 35 | * Python 2.7 or Python 3.4+ 36 | * Works on Linux, Windows, Mac OSX, BSD 37 | 38 | Installation 39 | ============ 40 | 41 | The quick way: 42 | 43 | pip install scrapy-cookies 44 | 45 | For more details see the installation section in the documentation: 46 | https://scrapy-cookies.readthedocs.io/en/latest/intro/installation.html 47 | 48 | Documentation 49 | ============= 50 | 51 | Documentation is available online at 52 | https://scrapy-cookies.readthedocs.io/en/latest/ and in the ``docs`` directory. 53 | 54 | Releases 55 | ======== 56 | 57 | You can find release notes at 58 | https://scrapy-cookies.readthedocs.io/en/latest/news.html 59 | 60 | Community (blog, twitter, mail list, IRC) 61 | ========================================= 62 | 63 | *Keeping this section same as Scrapy is intending to benefit back to Scrapy.* 64 | 65 | See https://scrapy.org/community/ 66 | 67 | Contributing 68 | ============ 69 | 70 | *Keeping this section same as Scrapy is intending to be easier when this repo 71 | merge back to Scrapy.* 72 | 73 | See https://doc.scrapy.org/en/master/contributing.html 74 | 75 | Code of Conduct 76 | --------------- 77 | 78 | Please note that this project is released with a Contributor Code of Conduct 79 | (see https://github.com/scrapy/scrapy/blob/master/CODE_OF_CONDUCT.md). 80 | 81 | By participating in this project you agree to abide by its terms. 82 | Please report unacceptable behavior to opensource@scrapinghub.com. 83 | 84 | 85 | Companies using Scrapy 86 | ====================== 87 | 88 | *Keeping this section same as Scrapy is intending to benefit back to Scrapy.* 89 | 90 | See https://scrapy.org/companies/ 91 | 92 | Commercial Support 93 | ================== 94 | 95 | *Keeping this section same as Scrapy is intending to benefit back to Scrapy.* 96 | 97 | See https://scrapy.org/support/ 98 | 99 | TODO 100 | ==== 101 | 102 | * [X] Remove the support lower than python 3.6 103 | * [ ] Use JSON1 extension in sqlite backend 104 | * [ ] Update backend arguments calling way 105 | * [ ] Replace pymongo with txmongo in MongoDB backend 106 | * [ ] Replace redis sync driver with async driver in Redis backend 107 | * [ ] Change LICENSE to GPLv3 108 | * [ ] Use versioneer for version management 109 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: 2 | layout: "header, diff, tree" 3 | 4 | coverage: 5 | status: 6 | project: false 7 | notify: 8 | slack: 9 | default: 10 | url: "secret:KQc0qNe30SGOA3baphzz48aXGWPJlE6qDlk4qZGGdW8fAEEJG8lHubU9301vJCECqEhv5E+JNHXfWKd+bcKjhIc5nhgt2w2BaZyEXEawhaTx0MJZ8xjX/unaul2wA5rL3ZkV4loVbN34sOq7vFgEzSS" 11 | branches: null 12 | flags: null 13 | only_pulls: false 14 | paths: null 15 | threshold: 1% 16 | -------------------------------------------------------------------------------- /docker/MongoDB/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | # https://hub.docker.com/_/mongo/ 4 | mongo: 5 | container_name: dc-mongo 6 | image: mongo:latest 7 | networks: 8 | - mongo 9 | ports: 10 | - "127.0.0.1:27017:27017" 11 | restart: always 12 | tty: true 13 | # https://hub.docker.com/_/mongo-express/ 14 | mongo-express: 15 | container_name: dc-mongodb-express 16 | depends_on: 17 | - mongo 18 | environment: 19 | ME_CONFIG_MONGODB_PORT: 27017 20 | ME_CONFIG_MONGODB_SERVER: mongo 21 | image: 
mongo-express:latest 22 | links: 23 | - mongo 24 | networks: 25 | - mongo 26 | ports: 27 | - "127.0.0.1:8081:8081" 28 | restart: always 29 | tty: true 30 | 31 | networks: 32 | mongo: 33 | driver: bridge 34 | -------------------------------------------------------------------------------- /docker/Redis/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | redis: 4 | container_name: dc-redis 5 | image: redis:latest 6 | networks: 7 | - redis 8 | ports: 9 | - "127.0.0.1:6379:6379" 10 | restart: always 11 | tty: true 12 | redis-commander: 13 | command: --redis-host redis 14 | container_name: dc-redis-commander 15 | depends_on: 16 | - redis 17 | image: tenstartups/redis-commander:latest 18 | links: 19 | - redis 20 | networks: 21 | - redis 22 | ports: 23 | - "127.0.0.1:8181:8081" 24 | restart: always 25 | tty: true 26 | 27 | networks: 28 | redis: 29 | driver: bridge 30 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Scrapy-Cookies 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_ext/scrapydocs.py: -------------------------------------------------------------------------------- 1 | from docutils.parsers.rst.roles import set_classes 2 | from docutils import nodes 3 | from docutils.parsers.rst import Directive 4 | from sphinx.util.nodes import make_refnode 5 | from operator import itemgetter 6 | 7 | 8 | class settingslist_node(nodes.General, nodes.Element): 9 | pass 10 | 11 | 12 | class SettingsListDirective(Directive): 13 | def run(self): 14 | return [settingslist_node('')] 15 | 16 | 17 | def is_setting_index(node): 18 | if node.tagname == 'index': 19 | # index entries for setting directives look like: 20 | # [(u'pair', u'SETTING_NAME; setting', u'std:setting-SETTING_NAME', '')] 21 | entry_type, info, refid = node['entries'][0][:3] 22 | return entry_type == 'pair' and info.endswith('; setting') 23 | return False 24 | 25 | 26 | def get_setting_target(node): 27 | # target nodes are placed next to the node in the doc tree 28 | return node.parent[node.parent.index(node) + 1] 29 | 30 | 31 | def get_setting_name_and_refid(node): 32 | """Extract setting name from directive index node""" 33 | entry_type, info, refid = node['entries'][0][:3] 34 | return info.replace('; setting', ''), refid 35 | 36 | 37 | def collect_scrapy_settings_refs(app, doctree): 38 | env = app.builder.env 39 | 40 | if not hasattr(env, 'scrapy_all_settings'): 41 | env.scrapy_all_settings = [] 42 | 43 | for node in doctree.traverse(is_setting_index): 44 | targetnode = get_setting_target(node) 45 | assert isinstance(targetnode, nodes.target), "Next node is not a target" 46 | 47 | setting_name, refid = get_setting_name_and_refid(node) 
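        # Remember each documented setting (its name, anchor id and source
        # document) so that replace_settingslist_nodes() can later expand the
        # ``settingslist`` directive into a cross-referenced list.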
48 | 49 | env.scrapy_all_settings.append({ 50 | 'docname': env.docname, 51 | 'setting_name': setting_name, 52 | 'refid': refid, 53 | }) 54 | 55 | 56 | def make_setting_element(setting_data, app, fromdocname): 57 | refnode = make_refnode(app.builder, fromdocname, 58 | todocname=setting_data['docname'], 59 | targetid=setting_data['refid'], 60 | child=nodes.Text(setting_data['setting_name'])) 61 | p = nodes.paragraph() 62 | p += refnode 63 | 64 | item = nodes.list_item() 65 | item += p 66 | return item 67 | 68 | 69 | def replace_settingslist_nodes(app, doctree, fromdocname): 70 | env = app.builder.env 71 | 72 | for node in doctree.traverse(settingslist_node): 73 | settings_list = nodes.bullet_list() 74 | settings_list.extend([make_setting_element(d, app, fromdocname) 75 | for d in sorted(env.scrapy_all_settings, 76 | key=itemgetter('setting_name')) 77 | if fromdocname != d['docname']]) 78 | node.replace_self(settings_list) 79 | 80 | 81 | def setup(app): 82 | app.add_crossref_type( 83 | directivename = "setting", 84 | rolename = "setting", 85 | indextemplate = "pair: %s; setting", 86 | ) 87 | app.add_crossref_type( 88 | directivename = "signal", 89 | rolename = "signal", 90 | indextemplate = "pair: %s; signal", 91 | ) 92 | app.add_crossref_type( 93 | directivename = "command", 94 | rolename = "command", 95 | indextemplate = "pair: %s; command", 96 | ) 97 | app.add_crossref_type( 98 | directivename = "reqmeta", 99 | rolename = "reqmeta", 100 | indextemplate = "pair: %s; reqmeta", 101 | ) 102 | app.add_role('source', source_role) 103 | app.add_role('commit', commit_role) 104 | app.add_role('issue', issue_role) 105 | app.add_role('rev', rev_role) 106 | 107 | app.add_node(settingslist_node) 108 | app.add_directive('settingslist', SettingsListDirective) 109 | 110 | app.connect('doctree-read', collect_scrapy_settings_refs) 111 | app.connect('doctree-resolved', replace_settingslist_nodes) 112 | 113 | 114 | def source_role(name, rawtext, text, lineno, inliner, options={}, content=[]): 115 | ref = 'https://github.com/scrapy/scrapy/blob/master/' + text 116 | set_classes(options) 117 | node = nodes.reference(rawtext, text, refuri=ref, **options) 118 | return [node], [] 119 | 120 | 121 | def issue_role(name, rawtext, text, lineno, inliner, options={}, content=[]): 122 | ref = 'https://github.com/scrapy/scrapy/issues/' + text 123 | set_classes(options) 124 | node = nodes.reference(rawtext, 'issue ' + text, refuri=ref, **options) 125 | return [node], [] 126 | 127 | 128 | def commit_role(name, rawtext, text, lineno, inliner, options={}, content=[]): 129 | ref = 'https://github.com/scrapy/scrapy/commit/' + text 130 | set_classes(options) 131 | node = nodes.reference(rawtext, 'commit ' + text, refuri=ref, **options) 132 | return [node], [] 133 | 134 | 135 | def rev_role(name, rawtext, text, lineno, inliner, options={}, content=[]): 136 | ref = 'http://hg.scrapy.org/scrapy/changeset/' + text 137 | set_classes(options) 138 | node = nodes.reference(rawtext, 'r' + text, refuri=ref, **options) 139 | return [node], [] 140 | -------------------------------------------------------------------------------- /docs/_static/selectors-sample1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example website 5 | 6 | 7 |
8 |    <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
9 |    <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
10 |    <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
11 |    <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
12 |    <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
13 |   </div>
14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {% block footer %} 4 | {{ super() }} 5 | 16 | {% endblock %} 17 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Scrapy-Cookies documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Nov 24 12:02:52 2008. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # The contents of this file are pickled, so don't put values in the namespace 9 | # that aren't pickleable (module imports are okay, they're removed 10 | # automatically). 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | from os import path 17 | 18 | # If your extensions are in another directory, add it here. If the directory 19 | # is relative to the documentation root, use os.path.abspath to make it 20 | # absolute, like shown here. 21 | sys.path.append(path.join(path.dirname(__file__), "_ext")) 22 | sys.path.insert(0, path.dirname(path.dirname(__file__))) 23 | 24 | 25 | # General configuration 26 | # --------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be extensions 29 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 30 | extensions = [ 31 | 'scrapydocs', 32 | 'sphinx.ext.autodoc' 33 | ] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # The suffix of source filenames. 39 | source_suffix = '.rst' 40 | 41 | # The encoding of source files. 42 | #source_encoding = 'utf-8' 43 | 44 | # The master toctree document. 45 | master_doc = 'index' 46 | 47 | # General information about the project. 48 | project = 'Scrapy-Cookies' 49 | copyright = '2018, Scrapy Enhancement developers' 50 | 51 | # The version info for the project you're documenting, acts as replacement for 52 | # |version| and |release|, also used in various other places throughout the 53 | # built documents. 54 | # 55 | # The short X.Y version. 56 | try: 57 | import scrapy_cookies 58 | version = '.'.join(map(str, scrapy_cookies.version_info[:2])) 59 | release = scrapy_cookies.__version__ 60 | except ImportError: 61 | version = '' 62 | release = '' 63 | 64 | # The language for content autogenerated by Sphinx. Refer to documentation 65 | # for a list of supported languages. 66 | language = 'en' 67 | 68 | # There are two options for replacing |today|: either, you set today to some 69 | # non-false value, then it is used: 70 | #today = '' 71 | # Else, today_fmt is used as the format for a strftime call. 72 | #today_fmt = '%B %d, %Y' 73 | 74 | # List of documents that shouldn't be included in the build. 75 | #unused_docs = [] 76 | 77 | # List of directories, relative to source directory, that shouldn't be searched 78 | # for source files. 79 | exclude_trees = ['.build'] 80 | 81 | # The reST default role (used for this markup: `text`) to use for all documents. 82 | #default_role = None 83 | 84 | # If true, '()' will be appended to :func: etc. cross-reference text. 
85 | #add_function_parentheses = True 86 | 87 | # If true, the current module name will be prepended to all description 88 | # unit titles (such as .. function::). 89 | #add_module_names = True 90 | 91 | # If true, sectionauthor and moduleauthor directives will be shown in the 92 | # output. They are ignored by default. 93 | #show_authors = False 94 | 95 | # The name of the Pygments (syntax highlighting) style to use. 96 | pygments_style = 'sphinx' 97 | 98 | 99 | # Options for HTML output 100 | # ----------------------- 101 | 102 | # The theme to use for HTML and HTML Help pages. See the documentation for 103 | # a list of builtin themes. 104 | html_theme = 'sphinx_rtd_theme' 105 | 106 | # Theme options are theme-specific and customize the look and feel of a theme 107 | # further. For a list of options available for each theme, see the 108 | # documentation. 109 | #html_theme_options = {} 110 | 111 | # Add any paths that contain custom themes here, relative to this directory. 112 | # Add path to the RTD explicitly to robustify builds (otherwise might 113 | # fail in a clean Debian build env) 114 | import sphinx_rtd_theme 115 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 116 | 117 | 118 | # The style sheet to use for HTML and HTML Help pages. A file of that name 119 | # must exist either in Sphinx' static/ path, or in one of the custom paths 120 | # given in html_static_path. 121 | # html_style = 'scrapydoc.css' 122 | 123 | # The name for this set of Sphinx documents. If None, it defaults to 124 | # " v documentation". 125 | #html_title = None 126 | 127 | # A shorter title for the navigation bar. Default is the same as html_title. 128 | #html_short_title = None 129 | 130 | # The name of an image file (relative to this directory) to place at the top 131 | # of the sidebar. 132 | #html_logo = None 133 | 134 | # The name of an image file (within the static path) to use as favicon of the 135 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 136 | # pixels large. 137 | #html_favicon = None 138 | 139 | # Add any paths that contain custom static files (such as style sheets) here, 140 | # relative to this directory. They are copied after the builtin static files, 141 | # so a file named "default.css" will overwrite the builtin "default.css". 142 | html_static_path = ['_static'] 143 | 144 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 145 | # using the given strftime format. 146 | html_last_updated_fmt = '%b %d, %Y' 147 | 148 | # Custom sidebar templates, maps document names to template names. 149 | #html_sidebars = {} 150 | 151 | # Additional templates that should be rendered to pages, maps page names to 152 | # template names. 153 | #html_additional_pages = {} 154 | 155 | # If false, no module index is generated. 156 | #html_use_modindex = True 157 | 158 | # If false, no index is generated. 159 | #html_use_index = True 160 | 161 | # If true, the index is split into individual pages for each letter. 162 | #html_split_index = False 163 | 164 | # If true, the reST sources are included in the HTML build as _sources/. 165 | html_copy_source = True 166 | 167 | # If true, an OpenSearch description file will be output, and all pages will 168 | # contain a tag referring to it. The value of this option must be the 169 | # base URL from which the finished HTML is served. 170 | #html_use_opensearch = '' 171 | 172 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 
173 | #html_file_suffix = '' 174 | 175 | # Output file base name for HTML help builder. 176 | htmlhelp_basename = 'Scrapydoc' 177 | 178 | 179 | # Options for LaTeX output 180 | # ------------------------ 181 | 182 | # The paper size ('letter' or 'a4'). 183 | #latex_paper_size = 'letter' 184 | 185 | # The font size ('10pt', '11pt' or '12pt'). 186 | #latex_font_size = '10pt' 187 | 188 | # Grouping the document tree into LaTeX files. List of tuples 189 | # (source start file, target name, title, author, document class [howto/manual]). 190 | latex_documents = [ 191 | ('index', 'Scrapy.tex', u'Scrapy Documentation', 192 | u'Scrapy developers', 'manual'), 193 | ] 194 | 195 | # The name of an image file (relative to this directory) to place at the top of 196 | # the title page. 197 | #latex_logo = None 198 | 199 | # For "manual" documents, if this is true, then toplevel headings are parts, 200 | # not chapters. 201 | #latex_use_parts = False 202 | 203 | # Additional stuff for the LaTeX preamble. 204 | #latex_preamble = '' 205 | 206 | # Documents to append as an appendix to all manuals. 207 | #latex_appendices = [] 208 | 209 | # If false, no module index is generated. 210 | #latex_use_modindex = True 211 | 212 | 213 | # Options for the linkcheck builder 214 | # --------------------------------- 215 | 216 | # A list of regular expressions that match URIs that should not be checked when 217 | # doing a linkcheck build. 218 | linkcheck_ignore = [ 219 | 'http://localhost:\d+', 'http://hg.scrapy.org', 220 | 'http://directory.google.com/' 221 | ] 222 | -------------------------------------------------------------------------------- /docs/conf.py.bak: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | sys.path.append(path.join(path.dirname(__file__), "_ext")) 20 | sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) 21 | 22 | # -- Project information ----------------------------------------------------- 23 | 24 | project = 'Scrapy-Cookies' 25 | copyright = '2018, Grammy Jiang' 26 | author = 'Grammy Jiang' 27 | 28 | try: 29 | import scrapy 30 | version = '.'.join(map(str, scrapy.version_info[:2])) 31 | release = scrapy.__version__ 32 | except ImportError: 33 | version = '' 34 | release = '' 35 | 36 | # The short X.Y version 37 | # version = '' 38 | # The full version, including alpha/beta/rc tags 39 | # release = '0.0.1' 40 | 41 | 42 | # -- General configuration --------------------------------------------------- 43 | 44 | # If your documentation needs a minimal Sphinx version, state it here. 45 | # 46 | # needs_sphinx = '1.0' 47 | 48 | # Add any Sphinx extension module names here, as strings. They can be 49 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 50 | # ones. 
51 | extensions = [ 52 | 'scrapydocs', 53 | 'sphinx.ext.autodoc', 54 | 'sphinx.ext.doctest', 55 | 'sphinx.ext.intersphinx', 56 | 'sphinx.ext.todo', 57 | 'sphinx.ext.coverage', 58 | 'sphinx.ext.mathjax', 59 | 'sphinx.ext.ifconfig', 60 | 'sphinx.ext.viewcode', 61 | 'sphinx.ext.githubpages', 62 | ] 63 | 64 | # Add any paths that contain templates here, relative to this directory. 65 | templates_path = ['_templates'] 66 | 67 | # The suffix(es) of source filenames. 68 | # You can specify multiple suffix as a list of string: 69 | # 70 | # source_suffix = ['.rst', '.md'] 71 | source_suffix = '.rst' 72 | 73 | # The master toctree document. 74 | master_doc = 'index' 75 | 76 | # The language for content autogenerated by Sphinx. Refer to documentation 77 | # for a list of supported languages. 78 | # 79 | # This is also used if you do content translation via gettext catalogs. 80 | # Usually you set "language" from the command line for these cases. 81 | language = None 82 | 83 | # List of patterns, relative to source directory, that match files and 84 | # directories to ignore when looking for source files. 85 | # This pattern also affects html_static_path and html_extra_path . 86 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 87 | 88 | # The name of the Pygments (syntax highlighting) style to use. 89 | pygments_style = 'sphinx' 90 | 91 | 92 | # -- Options for HTML output ------------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | # 97 | html_theme = 'alabaster' 98 | 99 | # Theme options are theme-specific and customize the look and feel of a theme 100 | # further. For a list of options available for each theme, see the 101 | # documentation. 102 | # 103 | # html_theme_options = {} 104 | 105 | # Add any paths that contain custom static files (such as style sheets) here, 106 | # relative to this directory. They are copied after the builtin static files, 107 | # so a file named "default.css" will overwrite the builtin "default.css". 108 | html_static_path = ['_static'] 109 | 110 | # Custom sidebar templates, must be a dictionary that maps document names 111 | # to template names. 112 | # 113 | # The default sidebars (for documents that don't match any pattern) are 114 | # defined by theme itself. Builtin themes are using these templates by 115 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 116 | # 'searchbox.html']``. 117 | # 118 | # html_sidebars = {} 119 | 120 | 121 | # -- Options for HTMLHelp output --------------------------------------------- 122 | 123 | # Output file base name for HTML help builder. 124 | htmlhelp_basename = 'Scrapy-Cookiesdoc' 125 | 126 | 127 | # -- Options for LaTeX output ------------------------------------------------ 128 | 129 | latex_elements = { 130 | # The paper size ('letterpaper' or 'a4paper'). 131 | # 132 | # 'papersize': 'letterpaper', 133 | 134 | # The font size ('10pt', '11pt' or '12pt'). 135 | # 136 | # 'pointsize': '10pt', 137 | 138 | # Additional stuff for the LaTeX preamble. 139 | # 140 | # 'preamble': '', 141 | 142 | # Latex figure (float) alignment 143 | # 144 | # 'figure_align': 'htbp', 145 | } 146 | 147 | # Grouping the document tree into LaTeX files. List of tuples 148 | # (source start file, target name, title, 149 | # author, documentclass [howto, manual, or own class]). 
150 | latex_documents = [ 151 | (master_doc, 'Scrapy-Cookies.tex', 'Scrapy-Cookies Documentation', 152 | 'Grammy Jiang', 'manual'), 153 | ] 154 | 155 | 156 | # -- Options for manual page output ------------------------------------------ 157 | 158 | # One entry per manual page. List of tuples 159 | # (source start file, name, description, authors, manual section). 160 | man_pages = [ 161 | (master_doc, 'scrapy-cookies', 'Scrapy-Cookies Documentation', 162 | [author], 1) 163 | ] 164 | 165 | 166 | # -- Options for Texinfo output ---------------------------------------------- 167 | 168 | # Grouping the document tree into Texinfo files. List of tuples 169 | # (source start file, target name, title, author, 170 | # dir menu entry, description, category) 171 | texinfo_documents = [ 172 | (master_doc, 'Scrapy-Cookies', 'Scrapy-Cookies Documentation', 173 | author, 'Scrapy-Cookies', 'One line description of project.', 174 | 'Miscellaneous'), 175 | ] 176 | 177 | 178 | # -- Options for Epub output ------------------------------------------------- 179 | 180 | # Bibliographic Dublin Core info. 181 | epub_title = project 182 | epub_author = author 183 | epub_publisher = author 184 | epub_copyright = copyright 185 | 186 | # The unique identifier of the text. This can be a ISBN number 187 | # or the project homepage. 188 | # 189 | # epub_identifier = '' 190 | 191 | # A unique identification for the text. 192 | # 193 | # epub_uid = '' 194 | 195 | # A list of files that should not be packed into the epub file. 196 | epub_exclude_files = ['search.html'] 197 | 198 | 199 | # -- Extension configuration ------------------------------------------------- 200 | 201 | # -- Options for intersphinx extension --------------------------------------- 202 | 203 | # Example configuration for intersphinx: refer to the Python standard library. 204 | intersphinx_mapping = {'https://docs.python.org/': None} 205 | 206 | # -- Options for todo extension ---------------------------------------------- 207 | 208 | # If true, `todo` and `todoList` produce output, else they produce nothing. 209 | todo_include_todos = True 210 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. _topics-index: 2 | 3 | ====================================== 4 | Scrapy-Cookies |version| documentation 5 | ====================================== 6 | 7 | This documentation contains everything you need to know about Scrapy-Cookies. 8 | 9 | First steps 10 | =========== 11 | 12 | .. toctree:: 13 | :caption: First steps 14 | :hidden: 15 | 16 | intro/overview 17 | intro/installation 18 | intro/tutorial 19 | intro/examples 20 | 21 | :doc:`intro/overview` 22 | Understand what Scrapy-Cookies is and how it can help you. 23 | 24 | :doc:`intro/installation` 25 | Get Scrapy-Cookies installed on your computer. 26 | 27 | :doc:`intro/tutorial` 28 | Write your first project with Scrapy-Cookies. 29 | 30 | :doc:`intro/examples` 31 | Learn more by playing with a pre-made project with Scrapy-Cookies. 32 | 33 | .. _section-basics: 34 | 35 | Basic concepts 36 | ============== 37 | 38 | .. toctree:: 39 | :caption: Basic concepts 40 | :hidden: 41 | 42 | topics/cookiesmiddleware 43 | topics/storage 44 | topics/settings 45 | 46 | 47 | :doc:`topics/cookiesmiddleware` 48 | Extract cookies from response and Restore cookies to request. 49 | 50 | :doc:`topics/storage` 51 | Save ,restore and share the cookies. 
52 | 53 | :doc:`topics/settings` 54 | Learn how to configure Scrapy-Cookies and see all available settings. 55 | 56 | 57 | .. _extending-scrapy: 58 | 59 | Extending Scrapy-Cookies 60 | ======================== 61 | 62 | .. toctree:: 63 | :caption: Extending Scrapy-Cookies 64 | :hidden: 65 | 66 | topics/storage 67 | 68 | 69 | :doc:`topics/storage` 70 | Customize how the storage save, restore and share the cookies 71 | -------------------------------------------------------------------------------- /docs/intro/examples.rst: -------------------------------------------------------------------------------- 1 | .. _intro-examples: 2 | 3 | ======== 4 | Examples 5 | ======== 6 | 7 | The best way to learn is with examples, and Scrapy-Cookies is no exception. For 8 | this reason, there is an example project with Scrapy-Cookies named grouponbot_, 9 | that you can use to play and learn more about Scrapy-Cookies. It contains one 10 | spiders for https://www.groupon.com.au, only crawl the first page and save the 11 | cookies. 12 | 13 | The grouponbot_ project is available at: 14 | https://github.com/grammy-jiang/scrapy-enhancement-examples. You can find more 15 | information about it in the project's README. 16 | 17 | If you're familiar with git, you can checkout the code. Otherwise you can 18 | download the project as a zip file by clicking 19 | `here `_. 20 | 21 | .. _grouponbot: https://github.com/grammy-jiang/scrapy-enhancement-examples 22 | -------------------------------------------------------------------------------- /docs/intro/installation.rst: -------------------------------------------------------------------------------- 1 | .. _intro-installation: 2 | 3 | ================== 4 | Installation guide 5 | ================== 6 | 7 | Installing Scrapy 8 | ================= 9 | 10 | Scrapy-Cookies runs on Python 2.7 and Python 3.4 or above under CPython (default 11 | Python implementation) and PyPy (starting with PyPy 5.9). 12 | 13 | You can install Scrapy-Cookies and its dependencies from PyPI with:: 14 | 15 | pip install Scrapy-Cookies 16 | 17 | We strongly recommend that you install Scrapy and Scrapy-Cookies in 18 | :ref:`a dedicated virtualenv `, to avoid conflicting 19 | with your system packages. 20 | 21 | For more detailed and platform specifics instructions, read on. 22 | 23 | 24 | Things that are good to know 25 | ---------------------------- 26 | 27 | Scrapy-Cookies is written in pure Python and depends on a few key Python 28 | packages (among others): 29 | 30 | * `Scrapy`_, of course 31 | * `PyMongo`_ 32 | * `redis-py`_ 33 | * `ujson`_ 34 | 35 | The minimal versions which Scrapy-Cookies is tested against are: 36 | 37 | * Scrapy 1.5.0 38 | 39 | Scrapy-Cookies may work with older versions of these packages but it is not 40 | guaranteed it will continue working because it’s not being tested against them. 41 | 42 | .. _Scrapy: https://scrapy.org/ 43 | .. _PyMongo: http://api.mongodb.com/python/current/ 44 | .. _redis-py: https://redis-py.readthedocs.io/en/latest/ 45 | .. _ujson: https://github.com/esnme/ultrajson 46 | 47 | 48 | .. _intro-using-virtualenv: 49 | 50 | Using a virtual environment (recommended) 51 | ----------------------------------------- 52 | 53 | TL;DR: We recommend installing Scrapy-Cookies inside a virtual environment on 54 | all platforms. 55 | 56 | Python packages can be installed either globally (a.k.a system wide), or in 57 | user-space. We do not recommend installing Scrapy and Scrapy-Cookies 58 | system wide. 
59 | 60 | Instead, we recommend that you install Scrapy and Scrapy-Cookies within a 61 | so-called "virtual environment" (`virtualenv`_). Virtualenvs allow you to not 62 | conflict with already-installed Python system packages (which could break some 63 | of your system tools and scripts), and still install packages normally with 64 | ``pip`` (without ``sudo`` and the likes). 65 | 66 | To get started with virtual environments, see 67 | `virtualenv installation instructions`_. To install it globally (having it 68 | globally installed actually helps here), it should be a matter of running:: 69 | 70 | $ [sudo] pip install virtualenv 71 | 72 | Check this `user guide`_ on how to create your virtualenv. 73 | 74 | .. note:: 75 | If you use Linux or OS X, `virtualenvwrapper`_ is a handy tool to create 76 | virtualenvs. 77 | 78 | Once you have created a virtualenv, you can install Scrapy-Cookies inside it 79 | with ``pip``, just like any other Python package. 80 | (See :ref:`platform-specific guides ` 81 | below for non-Python dependencies that you may need to install beforehand). 82 | 83 | Python virtualenvs can be created to use Python 2 by default, or Python 3 by 84 | default. 85 | 86 | * If you want to install Scrapy-Cookies with Python 3, install Scrapy-Cookies 87 | within a Python 3 virtualenv. 88 | * And if you want to install Scrapy-Cookies with Python 2, install 89 | Scrapy-Cookies within a Python 2 virtualenv. 90 | 91 | .. _virtualenv: https://virtualenv.pypa.io 92 | .. _virtualenv installation instructions: https://virtualenv.pypa.io/en/stable/installation/ 93 | .. _virtualenvwrapper: https://virtualenvwrapper.readthedocs.io/en/latest/install.html 94 | .. _user guide: https://virtualenv.pypa.io/en/stable/userguide/ 95 | 96 | 97 | .. _intro-install-platform-notes: 98 | 99 | Platform specific installation notes 100 | ==================================== 101 | 102 | .. _intro-install-windows: 103 | 104 | Windows 105 | ------- 106 | 107 | Same as Scrapy. 108 | 109 | 110 | .. _intro-install-ubuntu: 111 | 112 | Ubuntu 14.04 or above 113 | --------------------- 114 | 115 | Same as Scrapy. 116 | 117 | 118 | .. _intro-install-macos: 119 | 120 | Mac OS X 121 | -------- 122 | 123 | Same as Scrapy. 124 | 125 | 126 | PyPy 127 | ---- 128 | 129 | Same as Scrapy. 130 | -------------------------------------------------------------------------------- /docs/intro/overview.rst: -------------------------------------------------------------------------------- 1 | .. _intro-overview: 2 | 3 | ========================== 4 | Scrapy-Cookies at a glance 5 | ========================== 6 | 7 | Scrapy-Cookies is a downloader middleware for Scrapy. 8 | 9 | Even though Scrapy-Cookies was originally designed for cookies save and restore 10 | (manage the login session), it can also be used to share cookies between various 11 | spider nodes. 12 | 13 | 14 | Walk-through of an example spider 15 | ================================= 16 | 17 | In order to show you what Scrapy-Cookies brings to the table, we'll walk you 18 | through an example of a Scrapy project's settings with Scrapy-Cookies using the 19 | simplest way to save and restore the cookies. 
20 | 21 | Here's the code for settings that uses in memory as storage:: 22 | 23 | DOWNLOADER_MIDDLEWARES.update({ 24 | 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None, 25 | 'scrapy_cookies.downloadermiddlewares.cookies.CookiesMiddleware': 700, 26 | }) 27 | 28 | COOKIES_ENABLED = True 29 | 30 | COOKIES_PERSISTENCE = True 31 | COOKIES_PERSISTENCE_DIR = 'cookies' 32 | 33 | # ------------------------------------------------------------------------------ 34 | # IN MEMORY STORAGE 35 | # ------------------------------------------------------------------------------ 36 | 37 | COOKIES_STORAGE = 'scrapy_cookies.storage.in_memory.InMemoryStorage' 38 | 39 | Put this in your project's settings, and run your spider. 40 | 41 | When this finishes you will have a ``cookies`` file in the folder ``.scrapy`` 42 | under your project folder. The file ``cookies`` is the pickled object contained 43 | cookies from your spider. 44 | 45 | 46 | What just happened? 47 | ------------------- 48 | 49 | When you run your spider, this middleware initializes all objects related to 50 | maintaining cookies. 51 | 52 | The crawl starts to send requests and receive responses, at the same time this 53 | middleware extracts and sets the cookies from and to requests and responses. 54 | 55 | When the spider stopped, this middleware will save the cookies to the path 56 | defined in ``COOKIES_PERSISTENCE_DIR``. 57 | 58 | 59 | .. _topics-whatelse: 60 | 61 | What else? 62 | ========== 63 | 64 | You've seen how to save and store cookies with Scrapy-Cookies. And this 65 | middleware provides an interface to let you customize your own cookies storage 66 | ways, such as: 67 | 68 | 69 | * In-memory storage, with ultra-fast speed to process 70 | 71 | * SQLite storage, with ultra-fast speed when uses memory database, and easy to 72 | read and sharing with other process on disk databases 73 | 74 | * Other database like MongoDB, MySQL, even HBase to integrate with other 75 | programmes across your 76 | 77 | 78 | What's next? 79 | ============ 80 | 81 | The next steps for you are to 82 | :ref:`install Scrapy-Cookies `, 83 | :ref:`follow through the tutorial ` to learn how to create 84 | a project with Scrapy-Cookies and `join the community`_. Thanks for your 85 | interest! 86 | 87 | .. _join the community: https://scrapy.org/community/ 88 | -------------------------------------------------------------------------------- /docs/intro/tutorial.rst: -------------------------------------------------------------------------------- 1 | .. _intro-tutorial: 2 | 3 | ======================= 4 | Scrapy-Cookies Tutorial 5 | ======================= 6 | 7 | In this tutorial, we'll assume that Scrapy-Cookies is already installed on your 8 | system. If that's not the case, see :ref:`intro-installation`. 9 | 10 | This tutorial will walk you through these tasks: 11 | 12 | 1. Use various storage classes in this middleware 13 | 2. Save cookies on disk 14 | 15 | 16 | Use various storage classes in this middleware 17 | ============================================== 18 | 19 | Before you start scraping, just put the following code into your settings.py:: 20 | 21 | DOWNLOADER_MIDDLEWARES.update({ 22 | 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None, 23 | 'scrapy_cookies.downloadermiddlewares.cookies.CookiesMiddleware': 700, 24 | }) 25 | 26 | With the default settings of this middleware, a in-memory storage will be used. 27 | 28 | There is a storage named SQLiteStorage. 
If you want to use it instead of the 29 | in-memory one, simple put the following code below the previous one:: 30 | 31 | COOKIES_STORAGE = 'scrapy_cookies.storage.sqlite.SQLiteStorage' 32 | COOKIES_SQLITE_DATABASE = ':memory:' 33 | 34 | There are other storage classes provided with this middleware, please refer to 35 | :ref:`topics-storage`. 36 | 37 | When you implement your own storage, you can set ``COOKIES_STORAGE`` to your own 38 | one. 39 | 40 | 41 | Save cookies and restore in your next run 42 | ========================================= 43 | 44 | By default this middleware would not save the cookies. When you need to keep 45 | the cookies for further usage, for example a login cookie, you wish to save the 46 | cookies on disk for next run. 47 | 48 | This middleware provides this ability with one setting:: 49 | 50 | COOKIES_PERSISTENCE = True 51 | 52 | Most of time the file saved cookies is named ``cookies`` under the folder 53 | ``.scrapy``. If you want to change it, use this setting:: 54 | 55 | COOKIES_PERSISTENCE_DIR = 'your-cookies-path' 56 | 57 | After these settings, this middleware would load the previous saved cookies in 58 | the next run. 59 | 60 | .. note:: Please keep the storage is the same class when you want save the 61 | cookies and restore them. The cookies persistence file is not compatible 62 | between different storage classes. 63 | 64 | .. note:: This feature depends on the storage class used. 65 | 66 | Next steps 67 | ========== 68 | 69 | This tutorial covered only the basics of Scrapy-Cookies, but there's a lot of 70 | other features not mentioned here. Check the :ref:`topics-whatelse` section in 71 | :ref:`intro-overview` chapter for a quick overview of the most important ones. 72 | 73 | You can continue from the section :ref:`section-basics` to know more about this 74 | middleware, storage and other things this tutorial hasn't covered. If you prefer 75 | to play with an example project, check the :ref:`intro-examples` section. 76 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=Scrapy-Cookies 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx==3.2.1 2 | sphinx_rtd_theme 3 | -------------------------------------------------------------------------------- /docs/topics/cookiesmiddleware.rst: -------------------------------------------------------------------------------- 1 | .. 
_topics-cookiesmiddleware: 2 | 3 | ================= 4 | CookiesMiddleware 5 | ================= 6 | 7 | This is the downloader middleware to inject cookies into requests and extract 8 | cookies from responses. 9 | 10 | This middleware mostly inherits the one from Scrapy, which implements the 11 | interface of `downloader middleware`_. With minimum changes, now 12 | it supports the storage class which implements a certain interface (actually 13 | MutableMapping_). 14 | 15 | .. _downloader middleware: https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 16 | .. _MutableMapping: https://docs.python.org/3/library/collections.abc.html#collections.abc.MutableMapping 17 | -------------------------------------------------------------------------------- /docs/topics/settings.rst: -------------------------------------------------------------------------------- 1 | .. _topic-settings: 2 | 3 | ======== 4 | Settings 5 | ======== 6 | 7 | The default settings of this middleware keeps the same behaviour as the one in 8 | Scrapy. 9 | 10 | As an enhancement, there are some settings added in this middleware: 11 | 12 | .. setting:: COOKIES_PERSISTENCE 13 | 14 | COOKIES_PERSISTENCE 15 | ~~~~~~~~~~~~~~~~~~~ 16 | 17 | Default: ``False`` 18 | 19 | Whether to enable this cookies middleware save the cookies on disk. If disabled, 20 | no cookies will be saved on disk. 21 | 22 | Notice that this setting only affects when the storage uses memory as cookies 23 | container. 24 | 25 | .. setting:: COOKIES_DEBUG 26 | 27 | COOKIES_PERSISTENCE_DIR 28 | ~~~~~~~~~~~~~~~~~~~~~~~ 29 | 30 | Default: ``cookies`` 31 | 32 | When ``COOKIES_PERSISTENCE`` is True, the storage which use memory as cookies 33 | container will save the cookies in the file ``cookies`` under the folder 34 | ``.scrapy`` in your project, while if the storage does not use memory as cookies 35 | container will not affect by this setting. 36 | 37 | .. setting:: COOKIES_STORAGE 38 | 39 | COOKIES_STORAGE 40 | ~~~~~~~~~~~~~~~ 41 | 42 | Default: ``scrapy_cookies.storage.in_memory.InMemoryStorage`` 43 | 44 | With this setting, the storage can be specified. There are some storage classes 45 | provided with this middleware by default: 46 | 47 | * :ref:`scrapy_cookies.storage.in_memory.InMemoryStorage` 48 | * :ref:`scrapy_cookies.storage.sqlite.SQLiteStorage` 49 | * :ref:`scrapy_cookies.storage.mongo.MongoStorage` 50 | 51 | .. setting:: COOKIES_MONGO_MONGOCLIENT_HOST 52 | 53 | COOKIES_MONGO_MONGOCLIENT_HOST 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | Default: ``localhost`` 57 | 58 | Hostname or IP address or Unix domain socket path of a single mongod or mongos 59 | instance to connect to, or a mongodb URI, or a list of hostnames / mongodb URIs. 60 | If host is an IPv6 literal it must be enclosed in ‘[‘ and ‘]’ characters 61 | following the RFC2732 URL syntax (e.g. ‘[::1]’ for localhost). Multihomed and 62 | round robin DNS addresses are not supported. 63 | 64 | Please refer to mongo_client_. 65 | 66 | .. setting:: COOKIES_MONGO_MONGOCLIENT_PORT 67 | 68 | COOKIES_MONGO_MONGOCLIENT_PORT 69 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 70 | 71 | Default: ``27017`` 72 | 73 | Port number on which to connect. 74 | 75 | Please refer to mongo_client_. 76 | 77 | .. setting:: COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS 78 | 79 | COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS 80 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 81 | 82 | Default: ``dict`` 83 | 84 | Default class to use for documents returned from queries on this client. 85 | 86 | Please refer to mongo_client_. 87 | 88 | .. 
setting:: COOKIES_MONGO_MONGOCLIENT_TZ_AWARE 89 | 90 | COOKIES_MONGO_MONGOCLIENT_TZ_AWARE 91 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 92 | 93 | Default: ``False`` 94 | 95 | If True, datetime instances returned as values in a document by this MongoClient 96 | will be timezone aware (otherwise they will be naive). 97 | 98 | Please refer to mongo_client_. 99 | 100 | .. setting:: COOKIES_MONGO_MONGOCLIENT_CONNECT 101 | 102 | COOKIES_MONGO_MONGOCLIENT_CONNECT 103 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 104 | 105 | Default: ``True`` 106 | 107 | If True (the default), immediately begin connecting to MongoDB in the 108 | background. Otherwise connect on the first operation. 109 | 110 | Please refer to mongo_client_. 111 | 112 | .. setting:: COOKIES_MONGO_MONGOCLIENT_KWARGS 113 | 114 | COOKIES_MONGO_MONGOCLIENT_KWARGS 115 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 116 | 117 | Please refer to mongo_client_. 118 | 119 | .. setting:: COOKIES_MONGO_DATABASE 120 | 121 | COOKIES_MONGO_DATABASE 122 | ~~~~~~~~~~~~~~~~~~~~~~ 123 | 124 | Default: ``cookies`` 125 | 126 | The name of the database - a string. If None (the default) the database named in 127 | the MongoDB connection URI is returned. 128 | 129 | Please refer to get_database_. 130 | 131 | .. setting:: COOKIES_MONGO_COLLECTION 132 | 133 | COOKIES_MONGO_COLLECTION 134 | ~~~~~~~~~~~~~~~~~~~~~~~~ 135 | 136 | Default: ``cookies`` 137 | 138 | The name of the collection - a string. 139 | 140 | Please refer to get_collection_. 141 | 142 | 143 | .. _mongo_client: http://api.mongodb.com/python/current/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient 144 | .. _get_database: http://api.mongodb.com/python/current/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient.get_database 145 | .. _get_collection: http://api.mongodb.com/python/current/api/pymongo/database.html#pymongo.database.Database.get_collection 146 | 147 | 148 | .. setting:: COOKIES_REDIS_HOST 149 | 150 | COOKIES_REDIS_HOST 151 | ~~~~~~~~~~~~~~~~~~ 152 | 153 | Please refer to `redis-py's documentation`_. 154 | 155 | .. setting:: COOKIES_REDIS_PORT 156 | 157 | COOKIES_REDIS_PORT 158 | ~~~~~~~~~~~~~~~~~~ 159 | 160 | Please refer to `redis-py's documentation`_. 161 | 162 | .. setting:: COOKIES_REDIS_DB 163 | 164 | COOKIES_REDIS_DB 165 | ~~~~~~~~~~~~~~~~ 166 | 167 | Please refer to `redis-py's documentation`_. 168 | 169 | .. setting:: COOKIES_REDIS_PASSWORD 170 | 171 | COOKIES_REDIS_PASSWORD 172 | ~~~~~~~~~~~~~~~~~~~~~~ 173 | 174 | Please refer to `redis-py's documentation`_. 175 | 176 | .. setting:: COOKIES_REDIS_SOCKET_TIMEOUT 177 | 178 | COOKIES_REDIS_SOCKET_TIMEOUT 179 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 180 | 181 | Please refer to `redis-py's documentation`_. 182 | 183 | .. setting:: COOKIES_REDIS_SOCKET_CONNECT_TIMEOUT 184 | 185 | COOKIES_REDIS_SOCKET_CONNECT_TIMEOUT 186 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 187 | 188 | Please refer to `redis-py's documentation`_. 189 | 190 | .. setting:: COOKIES_REDIS_SOCKET_KEEPALIVE 191 | 192 | COOKIES_REDIS_SOCKET_KEEPALIVE 193 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 194 | 195 | Please refer to `redis-py's documentation`_. 196 | 197 | .. setting:: COOKIES_REDIS_SOCKET_KEEPALIVE_OPTIONS 198 | 199 | COOKIES_REDIS_SOCKET_KEEPALIVE_OPTIONS 200 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 201 | 202 | Please refer to `redis-py's documentation`_. 203 | 204 | .. setting:: COOKIES_REDIS_CONNECTION_POOL 205 | 206 | COOKIES_REDIS_CONNECTION_POOL 207 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 208 | 209 | Please refer to `redis-py's documentation`_. 210 | 211 | .. 
setting:: COOKIES_REDIS_UNIX_SOCKET_PATH 212 | 213 | COOKIES_REDIS_UNIX_SOCKET_PATH 214 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 215 | 216 | Please refer to `redis-py's documentation`_. 217 | 218 | .. setting:: COOKIES_REDIS_ENCODING 219 | 220 | COOKIES_REDIS_ENCODING 221 | ~~~~~~~~~~~~~~~~~~~~~~ 222 | 223 | Please refer to `redis-py's documentation`_. 224 | 225 | .. setting:: COOKIES_REDIS_ENCODING_ERRORS 226 | 227 | COOKIES_REDIS_ENCODING_ERRORS 228 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 229 | 230 | Please refer to `redis-py's documentation`_. 231 | 232 | .. setting:: COOKIES_REDIS_CHARSET 233 | 234 | COOKIES_REDIS_CHARSET 235 | ~~~~~~~~~~~~~~~~~~~~~ 236 | 237 | Please refer to `redis-py's documentation`_. 238 | 239 | .. setting:: COOKIES_REDIS_ERRORS 240 | 241 | COOKIES_REDIS_ERRORS 242 | ~~~~~~~~~~~~~~~~~~~~ 243 | 244 | Please refer to `redis-py's documentation`_. 245 | 246 | .. setting:: COOKIES_REDIS_DECODE_RESPONSES 247 | 248 | COOKIES_REDIS_DECODE_RESPONSES 249 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 250 | 251 | Please refer to `redis-py's documentation`_. 252 | 253 | .. setting:: COOKIES_REDIS_RETRY_ON_TIMEOUT 254 | 255 | COOKIES_REDIS_RETRY_ON_TIMEOUT 256 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 257 | 258 | Please refer to `redis-py's documentation`_. 259 | 260 | .. setting:: COOKIES_REDIS_SSL 261 | 262 | COOKIES_REDIS_SSL 263 | ~~~~~~~~~~~~~~~~~ 264 | 265 | Please refer to `redis-py's documentation`_. 266 | 267 | .. setting:: COOKIES_REDIS_SSL_KEYFILE 268 | 269 | COOKIES_REDIS_SSL_KEYFILE 270 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 271 | 272 | Please refer to `redis-py's documentation`_. 273 | 274 | .. setting:: COOKIES_REDIS_SSL_CERTFILE 275 | 276 | COOKIES_REDIS_SSL_CERTFILE 277 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 278 | 279 | Please refer to `redis-py's documentation`_. 280 | 281 | .. setting:: COOKIES_REDIS_SSL_CERT_REQS 282 | 283 | COOKIES_REDIS_SSL_CERT_REQS 284 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 285 | 286 | Please refer to `redis-py's documentation`_. 287 | 288 | .. setting:: COOKIES_REDIS_SSL_CA_CERTS 289 | 290 | COOKIES_REDIS_SSL_CA_CERTS 291 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 292 | 293 | Please refer to `redis-py's documentation`_. 294 | 295 | .. setting:: COOKIES_REDIS_MAX_CONNECTIONS 296 | 297 | COOKIES_REDIS_MAX_CONNECTIONS 298 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 299 | 300 | Please refer to `redis-py's documentation`_. 301 | 302 | .. _redis-py's documentation: https://redis-py.readthedocs.io/en/latest/ 303 | -------------------------------------------------------------------------------- /docs/topics/storage.rst: -------------------------------------------------------------------------------- 1 | .. _topics-storage: 2 | 3 | ======= 4 | Storage 5 | ======= 6 | 7 | The class of storage is the one implementing MutableMapping_ interface. There 8 | are some storage classes provided with this middleware: 9 | 10 | .. _MutableMapping: https://docs.python.org/3/library/collections.abc.html#collections.abc.MutableMapping 11 | 12 | .. _storage-inmemory: 13 | 14 | InMemoryStorage 15 | --------------- 16 | 17 | .. module:: scrapy_cookies.storage.in_memory 18 | :synopsis: In Memory Storage 19 | 20 | .. class:: InMemoryStorage 21 | 22 | This storage enables keeping cookies inside the memory, to provide ultra fast 23 | read and write cookies performance. 24 | 25 | .. _storage-sqlite: 26 | 27 | SQLiteStorage 28 | ------------- 29 | 30 | .. module:: scrapy_cookies.storage.sqlite 31 | :synopsis: SQLite Storage 32 | 33 | .. class:: SQLiteStorage 34 | 35 | This storage enables keeping cookies in SQLite, which supports already by 36 | Python. 
37 | 38 | The following settings can be used to configure this storage: 39 | 40 | * |COOKIES_SQLITE_DATABASE|_ 41 | 42 | .. |COOKIES_SQLITE_DATABASE| replace:: ``COOKIES_SQLITE_DATABASE`` 43 | .. _COOKIES_SQLITE_DATABASE: https://docs.python.org/3/library/sqlite3.html#sqlite3.connect 44 | 45 | .. _storage-mongo: 46 | 47 | MongoStorage 48 | ------------ 49 | 50 | .. module:: scrapy_cookies.storage.mongo 51 | :synopsis: Mongo Storage 52 | 53 | .. class:: MongoStorage 54 | 55 | This storage enables keeping cookies in MongoDB. 56 | 57 | The following settings can be used to configure this storage: 58 | 59 | * :setting:`COOKIES_MONGO_MONGOCLIENT_HOST` 60 | * :setting:`COOKIES_MONGO_MONGOCLIENT_PORT` 61 | * :setting:`COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS` 62 | * :setting:`COOKIES_MONGO_MONGOCLIENT_TZ_AWARE` 63 | * :setting:`COOKIES_MONGO_MONGOCLIENT_CONNECT` 64 | * :setting:`COOKIES_MONGO_MONGOCLIENT_KWARGS` 65 | * :setting:`COOKIES_MONGO_DATABASE` 66 | * :setting:`COOKIES_MONGO_COLLECTION` 67 | 68 | .. _storage-redis: 69 | 70 | RedisStorage 71 | ------------ 72 | 73 | .. module:: scrapy_cookies.storage.redis 74 | :synopsis: Redis Storage 75 | 76 | .. class:: RedisStorage 77 | 78 | This storage enables keeping cookies in Redis. 79 | 80 | The following settings can be used to configure this storage: 81 | 82 | * :setting:`COOKIES_REDIS_HOST` 83 | * :setting:`COOKIES_REDIS_PORT` 84 | * :setting:`COOKIES_REDIS_DB` 85 | * :setting:`COOKIES_REDIS_PASSWORD` 86 | * :setting:`COOKIES_REDIS_SOCKET_TIMEOUT` 87 | * :setting:`COOKIES_REDIS_SOCKET_CONNECT_TIMEOUT` 88 | * :setting:`COOKIES_REDIS_SOCKET_KEEPALIVE` 89 | * :setting:`COOKIES_REDIS_SOCKET_KEEPALIVE_OPTIONS` 90 | * :setting:`COOKIES_REDIS_CONNECTION_POOL` 91 | * :setting:`COOKIES_REDIS_UNIX_SOCKET_PATH` 92 | * :setting:`COOKIES_REDIS_ENCODING` 93 | * :setting:`COOKIES_REDIS_ENCODING_ERRORS` 94 | * :setting:`COOKIES_REDIS_CHARSET` 95 | * :setting:`COOKIES_REDIS_ERRORS` 96 | * :setting:`COOKIES_REDIS_DECODE_RESPONSES` 97 | * :setting:`COOKIES_REDIS_RETRY_ON_TIMEOUT` 98 | * :setting:`COOKIES_REDIS_SSL` 99 | * :setting:`COOKIES_REDIS_SSL_KEYFILE` 100 | * :setting:`COOKIES_REDIS_SSL_CERTFILE` 101 | * :setting:`COOKIES_REDIS_SSL_CERT_REQS` 102 | * :setting:`COOKIES_REDIS_SSL_CA_CERTS` 103 | * :setting:`COOKIES_REDIS_MAX_CONNECTIONS` 104 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = 3 | --cov=scrapy_cookies 4 | --cov-report=html 5 | --cov-report=term 6 | --docker-compose=tests/test_storages/docker-compose.yml 7 | --docker-compose-remove-volumes 8 | testpaths = tests 9 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hiredis 2 | pymongo 3 | redis 4 | scrapy 5 | six 6 | ujson 7 | -------------------------------------------------------------------------------- /scrapy_cookies/VERSION: -------------------------------------------------------------------------------- 1 | 0.3 2 | -------------------------------------------------------------------------------- /scrapy_cookies/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Scrapy-Cookies - A middleware of cookies persistence for Scrapy 3 | """ 4 | 5 | __all__ = ["__version__", "version_info"] 6 | 7 | # Scrapy version 8 | import pkgutil 9 | 10 | __version__ = pkgutil.get_data(__package__, "VERSION").decode("ascii").strip() 11 | version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split(".")) 12 | del pkgutil 13 | -------------------------------------------------------------------------------- /scrapy_cookies/downloadermiddlewares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapedia/scrapy-cookies/342eaada3b84db4971be09862c34db9f207c0fb7/scrapy_cookies/downloadermiddlewares/__init__.py -------------------------------------------------------------------------------- /scrapy_cookies/downloadermiddlewares/cookies.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from http.cookiejar import Cookie 3 | from typing import Dict, List 4 | 5 | from scrapy.crawler import Crawler 6 | from scrapy.exceptions import NotConfigured 7 | from scrapy.http import Request, Response 8 | from scrapy.http.cookies import CookieJar 9 | from scrapy.settings import SETTINGS_PRIORITIES, Settings 10 | from scrapy.signals import spider_closed, spider_opened 11 | from scrapy.spiders import Spider 12 | from scrapy.utils.misc import load_object 13 | try: 14 | from scrapy.utils.python import to_native_str 15 | except ImportError: 16 | # to_native_str is deprecated since version 2.8 17 | # https://docs.scrapy.org/en/2.8/news.html#deprecation-removals 18 | from scrapy.utils.python import to_unicode as to_native_str 19 | 20 | from scrapy_cookies.settings import default_settings, unfreeze_settings 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | def format_cookie(cookie: Dict) -> str: 26 | # build cookie string 27 | cookie_str: str = "{}={}".format(cookie["name"], cookie["value"]) 28 | 29 | if cookie.get("path", None): 30 | cookie_str += "; Path={}".format(cookie["path"]) 31 | if cookie.get("domain", None): 32 | cookie_str += "; Domain={}".format(cookie["domain"]) 33 | 34 | return cookie_str 35 | 36 | 37 | def get_request_cookies(jar: CookieJar, request: Request) -> List[Cookie]: 38 | if isinstance(request.cookies, dict): 39 | cookie_list: List[Dict] = [ 40 | {"name": k, "value": v} for k, v in request.cookies.items() 41 | ] 42 | else: 43 | cookie_list: List[Dict] = request.cookies 44 | 45 | cookies: List[str] = [format_cookie(x) for x in cookie_list] 46 | headers: Dict[str, List[str]] = {"Set-Cookie": cookies} 47 | response: Response = Response(request.url, headers=headers) 48 | 49 | return jar.make_cookies(response, request) 50 | 51 | 52 | class CookiesMiddleware: 53 | """This middleware enables working with sites that need cookies""" 54 | 55 | def __init__(self, settings: Settings): 56 | self.settings: Settings = settings 57 | self.jars = load_object(settings["COOKIES_STORAGE"]).from_middleware(self) 58 | self.debug: bool = settings["COOKIES_DEBUG"] 59 | 60 | @classmethod 61 | def from_crawler(cls, crawler: Crawler): 62 | with unfreeze_settings(crawler.settings) as settings: 63 | settings.setmodule( 64 | module=default_settings, priority=SETTINGS_PRIORITIES["default"] 65 | ) 66 | if not crawler.settings.getbool("COOKIES_ENABLED"): 67 | raise NotConfigured 68 | obj = cls(crawler.settings) 69 | crawler.signals.connect(obj.spider_opened, 
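# Note on the factory above: from_crawler() temporarily unfreezes the crawler
# settings to merge this package's default_settings at 'default' priority,
# raises NotConfigured when COOKIES_ENABLED is false, and then connects the
# storage's open/close hooks to the spider_opened/spider_closed signals.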
signal=spider_opened) 70 | crawler.signals.connect(obj.spider_closed, signal=spider_closed) 71 | return obj 72 | 73 | def spider_opened(self, spider: Spider): 74 | logger.info( 75 | "%s is used as the cookies storage.", self.settings["COOKIES_STORAGE"] 76 | ) 77 | self.jars.open_spider(spider) 78 | 79 | def spider_closed(self, spider: Spider): 80 | self.jars.close_spider(spider) 81 | 82 | def process_request(self, request: Request, spider: Spider) -> None: 83 | if request.meta.get("dont_merge_cookies", False): 84 | return 85 | 86 | cookiejar_key = request.meta.get("cookiejar") 87 | jar: CookieJar = self.jars[cookiejar_key] 88 | cookies: List[Cookie] = get_request_cookies(jar, request) 89 | for cookie in cookies: 90 | jar.set_cookie_if_ok(cookie, request) 91 | self.jars[cookiejar_key] = jar 92 | 93 | # set Cookie header 94 | request.headers.pop("Cookie", None) 95 | jar.add_cookie_header(request) 96 | self._debug_cookie(request, spider) 97 | 98 | def process_response( 99 | self, request: Request, response: Response, spider: Spider 100 | ) -> Response: 101 | if request.meta.get("dont_merge_cookies", False): 102 | return response 103 | 104 | # extract cookies from Set-Cookie and drop invalid/expired cookies 105 | cookiejar_key = request.meta.get("cookiejar") 106 | jar: CookieJar = self.jars[cookiejar_key] 107 | jar.extract_cookies(response, request) 108 | self.jars[cookiejar_key] = jar 109 | self._debug_set_cookie(response, spider) 110 | 111 | return response 112 | 113 | def _debug_cookie(self, request: Request, spider: Spider): 114 | if self.debug: 115 | cl = [ 116 | to_native_str(c, errors="replace") 117 | for c in request.headers.getlist("Cookie") 118 | ] 119 | if cl: 120 | cookies: str = "\n".join("Cookie: {}\n".format(c) for c in cl) 121 | msg: str = "Sending cookies to: {}\n{}".format(request, cookies) 122 | logger.debug(msg, extra={"spider": spider}) 123 | 124 | def _debug_set_cookie(self, response: Response, spider: Spider): 125 | if self.debug: 126 | cl = [ 127 | to_native_str(c, errors="replace") 128 | for c in response.headers.getlist("Set-Cookie") 129 | ] 130 | if cl: 131 | cookies: str = "\n".join("Set-Cookie: {}\n".format(c) for c in cl) 132 | msg: str = "Received cookies from: {}\n{}".format(response, cookies) 133 | logger.debug(msg, extra={"spider": spider}) 134 | -------------------------------------------------------------------------------- /scrapy_cookies/settings/__init__.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | 4 | @contextmanager 5 | def unfreeze_settings(settings): 6 | original_status = settings.frozen 7 | settings.frozen = False 8 | try: 9 | yield settings 10 | finally: 11 | settings.frozen = original_status 12 | -------------------------------------------------------------------------------- /scrapy_cookies/settings/default_settings.py: -------------------------------------------------------------------------------- 1 | COOKIES_ENABLED = True 2 | COOKIES_DEBUG = False 3 | 4 | COOKIES_PERSISTENCE = False 5 | COOKIES_PERSISTENCE_DIR = "cookies" 6 | 7 | # ------------------------------------------------------------------------------ 8 | # IN MEMORY STORAGE 9 | # ------------------------------------------------------------------------------ 10 | 11 | COOKIES_STORAGE = "scrapy_cookies.storage.in_memory.InMemoryStorage" 12 | 13 | # ------------------------------------------------------------------------------ 14 | # SQLITE STORAGE 15 | # 
------------------------------------------------------------------------------ 16 | 17 | # COOKIES_STORAGE = 'scrapy_cookies.storage.sqlite.SQLiteStorage' 18 | COOKIES_SQLITE_DATABASE = ":memory:" 19 | 20 | # ------------------------------------------------------------------------------ 21 | # MONGODB 22 | # ------------------------------------------------------------------------------ 23 | 24 | # http://api.mongodb.com/python/current/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient 25 | 26 | # COOKIES_STORAGE = 'scrapy_cookies.storage.mongo.MongoStorage' 27 | COOKIES_MONGO_MONGOCLIENT_HOST = "localhost" 28 | COOKIES_MONGO_MONGOCLIENT_PORT = 27017 29 | COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS = dict 30 | COOKIES_MONGO_MONGOCLIENT_TZ_AWARE = False 31 | COOKIES_MONGO_MONGOCLIENT_CONNECT = True 32 | 33 | COOKIES_MONGO_MONGOCLIENT_KWARGS = { 34 | # 'username': 'username', 35 | # 'password': 'password', 36 | # 'authSource': 'admin', 37 | # 'authMechanism': 'SCRAM-SHA-1', 38 | } 39 | 40 | COOKIES_MONGO_DATABASE = "cookies" 41 | # or 42 | # COOKIES_MONGO_DATABASE = { 43 | # 'name': 'cookies', 44 | # 'codec_options': None, 45 | # 'read_preference': None, 46 | # 'write_concern': None, 47 | # 'read_concern': None 48 | # } 49 | 50 | COOKIES_MONGO_COLLECTION = "cookies" 51 | # or 52 | # COOKIES_MONGO_COLLECTION = { 53 | # 'name': 'cookies', 54 | # 'codec_options': None, 55 | # 'read_preference': None, 56 | # 'write_concern': None, 57 | # 'read_concern': None 58 | # } 59 | 60 | # ------------------------------------------------------------------------------ 61 | # REDIS STORAGE 62 | # ------------------------------------------------------------------------------ 63 | 64 | # COOKIES_STORAGE = 'scrapy_cookies.storage.redis.RedisStorage' 65 | COOKIES_REDIS_HOST = "localhost" 66 | COOKIES_REDIS_PORT = 6379 67 | COOKIES_REDIS_DB = 0 68 | COOKIES_REDIS_PASSWORD = None 69 | COOKIES_REDIS_SOCKET_TIMEOUT = None 70 | COOKIES_REDIS_SOCKET_CONNECT_TIMEOUT = None 71 | COOKIES_REDIS_SOCKET_KEEPALIVE = None 72 | COOKIES_REDIS_SOCKET_KEEPALIVE_OPTIONS = None 73 | COOKIES_REDIS_CONNECTION_POOL = None 74 | COOKIES_REDIS_UNIX_SOCKET_PATH = None 75 | COOKIES_REDIS_ENCODING = "utf-8" 76 | COOKIES_REDIS_ENCODING_ERRORS = "strict" 77 | COOKIES_REDIS_CHARSET = None 78 | COOKIES_REDIS_ERRORS = None 79 | COOKIES_REDIS_DECODE_RESPONSES = False 80 | COOKIES_REDIS_RETRY_ON_TIMEOUT = False 81 | COOKIES_REDIS_SSL = False 82 | COOKIES_REDIS_SSL_KEYFILE = None 83 | COOKIES_REDIS_SSL_CERTFILE = None 84 | COOKIES_REDIS_SSL_CERT_REQS = None 85 | COOKIES_REDIS_SSL_CA_CERTS = None 86 | COOKIES_REDIS_MAX_CONNECTIONS = None 87 | -------------------------------------------------------------------------------- /scrapy_cookies/signals.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scrapy-Cookies signals 3 | 4 | These signals are documented in docs/topics/signals.rst. Please don't add new 5 | signals here without documenting them there. 
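Handlers can be connected to these signals through the standard Scrapy signal
API, for example from an extension's from_crawler() (a sketch;
``handle_invalidated`` is a hypothetical callable):

    from scrapy_cookies.signals import cookies_invalidated

    crawler.signals.connect(handle_invalidated, signal=cookies_invalidated)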
6 | """ 7 | 8 | cookies_invalidated = object() 9 | -------------------------------------------------------------------------------- /scrapy_cookies/storage/__init__.py: -------------------------------------------------------------------------------- 1 | from collections.abc import MutableMapping 2 | 3 | from scrapy.settings import Settings 4 | from scrapy.spiders import Spider 5 | 6 | from scrapy_cookies.downloadermiddlewares.cookies import CookiesMiddleware 7 | 8 | 9 | class BaseStorage(MutableMapping): 10 | name = None 11 | 12 | def __init__(self, settings: Settings): 13 | self.settings: Settings = settings 14 | 15 | @classmethod 16 | def from_middleware(cls, middleware: CookiesMiddleware): 17 | obj = cls(middleware.settings) 18 | return obj 19 | 20 | def open_spider(self, spider: Spider): 21 | pass 22 | 23 | def close_spider(self, spider: Spider): 24 | pass 25 | 26 | def __delitem__(self, v): 27 | pass 28 | 29 | def __getitem__(self, k): 30 | pass 31 | 32 | def __iter__(self): 33 | pass 34 | 35 | def __len__(self): 36 | pass 37 | 38 | def __setitem__(self, k, v): 39 | pass 40 | -------------------------------------------------------------------------------- /scrapy_cookies/storage/in_memory.py: -------------------------------------------------------------------------------- 1 | import io 2 | import logging 3 | import os 4 | import pickle 5 | from collections import UserDict 6 | from typing import Dict 7 | 8 | from scrapy.http.cookies import CookieJar 9 | from scrapy.settings import Settings 10 | from scrapy.spiders import Spider 11 | from scrapy.utils.project import data_path 12 | 13 | from scrapy_cookies.storage import BaseStorage 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class InMemoryStorage(UserDict, BaseStorage): 19 | def __init__(self, settings: Settings): 20 | super(InMemoryStorage, self).__init__() 21 | self.settings: Settings = settings 22 | self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"]) 23 | 24 | def open_spider(self, spider: Spider): 25 | logger.info("COOKIES_PERSISTENCE is %s.", self.settings["COOKIES_PERSISTENCE"]) 26 | if not self.settings["COOKIES_PERSISTENCE"]: 27 | return 28 | if not os.path.exists(self.cookies_dir): 29 | logger.info("Cookies dir does not exist.") 30 | return 31 | with io.open(self.cookies_dir, "br") as f: 32 | self.data: Dict = pickle.load(f) 33 | logger.info("The number of restored cookies is %d.", len(self.data)) 34 | 35 | def close_spider(self, spider: Spider): 36 | if self.settings["COOKIES_PERSISTENCE"]: 37 | with io.open(self.cookies_dir, "bw") as f: 38 | pickle.dump(self.data, f) 39 | logger.info("The number of saved cookies is %d.", len(self.data)) 40 | 41 | def __missing__(self, key) -> CookieJar: 42 | self.data.update({key: CookieJar()}) 43 | return self.data[key] 44 | -------------------------------------------------------------------------------- /scrapy_cookies/storage/mongo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | import re 4 | from http.cookiejar import Cookie 5 | from itertools import starmap 6 | from typing import Dict 7 | 8 | import pymongo 9 | from pymongo import MongoClient 10 | from pymongo.collection import Collection 11 | from pymongo.database import Database 12 | from scrapy.http.cookies import CookieJar 13 | from scrapy.settings import Settings 14 | from scrapy.spiders import Spider 15 | 16 | from scrapy_cookies.storage import BaseStorage 17 | 18 | logger = logging.getLogger(__name__) 19 | pattern = 
re.compile("^COOKIES_MONGO_MONGOCLIENT_(?P(?!KWARGS).*)$") 20 | 21 | 22 | def get_arguments(var): 23 | return {str: {"name": var}, dict: var}[type(var)] 24 | 25 | 26 | def write_cookiejar(cookiejar: CookieJar): 27 | return pickle.dumps(cookiejar) 28 | 29 | 30 | def read_cookiejar(document): 31 | try: 32 | return pickle.loads(document["cookiejar"]) 33 | except TypeError: 34 | return None 35 | 36 | 37 | def convert_cookiejar(cookiejar): 38 | def _convert_cookies(x): 39 | if isinstance(x, (str, int, bool)): 40 | return x 41 | elif isinstance(x, Cookie): 42 | return dict( 43 | map( 44 | lambda attr: (attr, getattr(x, attr)), 45 | ( 46 | "version", 47 | "name", 48 | "value", 49 | "port", 50 | "port_specified", 51 | "domain", 52 | "domain_specified", 53 | "domain_initial_dot", 54 | "path", 55 | "path_specified", 56 | "secure", 57 | "expires", 58 | "discard", 59 | "comment", 60 | "comment_url", 61 | ), 62 | ) 63 | ) 64 | 65 | elif isinstance(x, dict): 66 | return dict( 67 | starmap( 68 | lambda k, v: (_convert_cookies(k), _convert_cookies(v)), x.items() 69 | ) 70 | ) 71 | 72 | return _convert_cookies(cookiejar._cookies) 73 | 74 | 75 | class MongoStorage(BaseStorage): 76 | def __init__(self, settings: Settings): 77 | super(MongoStorage, self).__init__(settings) 78 | self.mongo_settings: Dict[str, str] = dict( 79 | starmap( 80 | lambda k, v: (pattern.sub(lambda x: x.group(1).lower(), k), v), 81 | filter( 82 | lambda pair: pattern.match(pair[0]), settings.copy_to_dict().items() 83 | ), 84 | ) 85 | ) 86 | self.mongo_settings.update(self.settings["COOKIES_MONGO_MONGOCLIENT_KWARGS"]) 87 | self.client: MongoClient = None 88 | self.db: Database = None 89 | self.coll: Collection = None 90 | 91 | @classmethod 92 | def from_middleware(cls, middleware): 93 | obj = cls(middleware.settings) 94 | return obj 95 | 96 | def open_spider(self, spider: Spider): 97 | self.client: MongoClient = MongoClient(**self.mongo_settings) 98 | 99 | self.db: Database = self.client.get_database( 100 | **get_arguments(self.settings["COOKIES_MONGO_DATABASE"]) 101 | ) 102 | self.coll: Collection = self.db.get_collection( 103 | **get_arguments(self.settings["COOKIES_MONGO_COLLECTION"]) 104 | ) 105 | self.coll.create_index([("key", pymongo.ASCENDING)], unique=True) 106 | 107 | def close_spider(self, spider: Spider): 108 | self.client.close() 109 | 110 | def __missing__(self, k) -> CookieJar: 111 | cookiejar: CookieJar = CookieJar() 112 | self[k] = cookiejar 113 | return cookiejar 114 | 115 | def __delitem__(self, v): 116 | # TODO: finish this method 117 | self.coll.delete_one({}) 118 | 119 | def __getitem__(self, k) -> CookieJar: 120 | v: CookieJar = read_cookiejar(self.coll.find_one({"key": k})) 121 | if isinstance(v, CookieJar): 122 | return v 123 | if hasattr(self.__class__, "__missing__"): 124 | return self.__class__.__missing__(self, k) 125 | raise KeyError(k) 126 | 127 | def __iter__(self): 128 | return iter(self.coll.find()) 129 | 130 | def __len__(self) -> int: 131 | return self.coll.count_documents({}) 132 | 133 | def __setitem__(self, k, v): 134 | self.coll.update_one( 135 | {"key": k}, 136 | { 137 | "$set": { 138 | "key": k, 139 | "cookiejar": write_cookiejar(v), 140 | "cookies": convert_cookiejar(v), 141 | } 142 | }, 143 | upsert=True, 144 | ) 145 | -------------------------------------------------------------------------------- /scrapy_cookies/storage/redis_.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pickle 3 | import re 4 | from itertools import 
starmap 5 | from typing import Dict 6 | 7 | import ujson 8 | from redis.client import Redis 9 | from scrapy.http.cookies import CookieJar 10 | from scrapy.settings import Settings 11 | from scrapy.spiders import Spider 12 | 13 | from scrapy_cookies.storage import BaseStorage 14 | 15 | logger = logging.getLogger(__name__) 16 | pattern = re.compile("^COOKIES_REDIS_(?P(?!KWARGS).*)$") 17 | 18 | 19 | def get_arguments(var): 20 | return {str: {"name": var}, dict: var}[type(var)] 21 | 22 | 23 | def write_cookiejar(cookiejar): 24 | return { 25 | "cookiejar": pickle.dumps(cookiejar), 26 | "cookies": ujson.dumps(cookiejar._cookies), 27 | } 28 | 29 | 30 | def read_cookiejar(document): 31 | try: 32 | return pickle.loads(document["cookiejar"]) 33 | except (TypeError, KeyError): 34 | return None 35 | 36 | 37 | class RedisStorage(BaseStorage): 38 | def __init__(self, settings: Settings): 39 | super(RedisStorage, self).__init__(settings) 40 | self.redis_settings: Dict[str, str] = dict( 41 | starmap( 42 | lambda k, v: (pattern.sub(lambda x: x.group(1).lower(), k), v), 43 | filter( 44 | lambda pair: pattern.match(pair[0]), settings.copy_to_dict().items() 45 | ), 46 | ) 47 | ) 48 | self.r: Redis = None 49 | 50 | @classmethod 51 | def from_middleware(cls, middleware): 52 | obj = cls(middleware.settings) 53 | return obj 54 | 55 | def open_spider(self, spider: Spider): 56 | self.r: Redis = Redis(**self.redis_settings) 57 | 58 | def close_spider(self, spider: Spider): 59 | pass 60 | 61 | def __missing__(self, k) -> CookieJar: 62 | cookiejar: CookieJar = CookieJar() 63 | self[k] = cookiejar 64 | return cookiejar 65 | 66 | def __delitem__(self, v): 67 | self.r.delete(v) 68 | 69 | def __getitem__(self, k) -> CookieJar: 70 | v: CookieJar = read_cookiejar(self.r.hgetall(k)) 71 | if isinstance(v, CookieJar): 72 | return v 73 | if hasattr(self.__class__, "__missing__"): 74 | return self.__class__.__missing__(self, k) 75 | raise KeyError(k) 76 | 77 | def __iter__(self): 78 | return self.r.scan_iter() 79 | 80 | def __len__(self) -> int: 81 | return self.r.dbsize() 82 | 83 | def __setitem__(self, k, v: CookieJar): 84 | self.r.hmset(name=k, mapping=write_cookiejar(v)) 85 | -------------------------------------------------------------------------------- /scrapy_cookies/storage/sqlite.py: -------------------------------------------------------------------------------- 1 | import io 2 | import logging 3 | import os 4 | import pickle 5 | import sqlite3 6 | from sqlite3 import Connection, Cursor, Row 7 | 8 | from scrapy.http.cookies import CookieJar 9 | from scrapy.settings import Settings 10 | from scrapy.spiders import Spider 11 | from scrapy.utils.project import data_path 12 | 13 | from scrapy_cookies.storage import BaseStorage 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def adapt_cookiejar(cookiejar: CookieJar) -> bytes: 19 | return pickle.dumps(cookiejar) 20 | 21 | 22 | def convert_cookiejar_and_its_key(cookiejar_or_its_key: bytes): 23 | return pickle.loads(cookiejar_or_its_key) 24 | 25 | 26 | sqlite3.register_adapter(CookieJar, adapt_cookiejar) 27 | sqlite3.register_converter("cookiejar", convert_cookiejar_and_its_key) 28 | sqlite3.register_converter("cookiejar_key", convert_cookiejar_and_its_key) 29 | 30 | 31 | class SQLiteStorage(BaseStorage): 32 | def __init__(self, settings: Settings): 33 | super(SQLiteStorage, self).__init__(settings) 34 | self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"]) 35 | self.database: str = settings["COOKIES_SQLITE_DATABASE"] 36 | self.conn: Connection = 
None 37 | self.cur: Cursor = None 38 | 39 | def open_spider(self, spider: Spider): 40 | self.conn: Connection = sqlite3.connect( 41 | self.database, detect_types=sqlite3.PARSE_COLNAMES, isolation_level=None 42 | ) 43 | self.conn.row_factory = sqlite3.Row 44 | self.cur: Cursor = self.conn.cursor() 45 | if self.database == ":memory:": 46 | if self.settings["COOKIES_PERSISTENCE"] and os.path.isfile( 47 | self.cookies_dir 48 | ): 49 | with io.open(self.cookies_dir, "r") as f: 50 | self.cur.executescript(f.read()) 51 | return 52 | self.cur.execute( 53 | "CREATE TABLE IF NOT EXISTS cookies (" 54 | "cookiejar_key BLOB PRIMARY KEY UNIQUE, cookiejar BLOB, str TEXT" 55 | ")" 56 | ) 57 | 58 | def close_spider(self, spider: Spider): 59 | if self.database == ":memory:" and self.settings["COOKIES_PERSISTENCE"]: 60 | with open(self.cookies_dir, "w") as f: 61 | for line in self.conn.iterdump(): 62 | f.write("%s\n" % line) 63 | self.conn.close() 64 | 65 | def __delitem__(self, v): 66 | self.cur.execute("DELETE FROM cookies WHERE cookiejar_key=?", pickle.dumps(v)) 67 | 68 | def __getitem__(self, k) -> CookieJar: 69 | result: Row = self.cur.execute( 70 | 'SELECT cookiejar as "cookiejar [CookieJar]" ' 71 | "FROM cookies " 72 | "WHERE cookiejar_key=?", 73 | (pickle.dumps(k),), 74 | ).fetchone() 75 | if result: 76 | return result["cookiejar"] 77 | if hasattr(self.__class__, "__missing__"): 78 | return self.__class__.__missing__(self, k) 79 | raise KeyError(k) 80 | 81 | def __iter__(self): 82 | return iter( 83 | self.cur.execute( 84 | 'SELECT cookiejar_key as "cookiejar_key [CookieJar_key]", cookiejar as "cookiejar [CookieJar]" ' 85 | "FROM cookies" 86 | ).fetchall() 87 | ) 88 | 89 | def __len__(self) -> int: 90 | return self.cur.execute("SELECT COUNT(*) FROM cookies").fetchone()[0] 91 | 92 | def __setitem__(self, k, v: CookieJar) -> None: 93 | self.cur.execute( 94 | "INSERT OR REPLACE INTO cookies (cookiejar_key, cookiejar, str) VALUES (?, ?, ?)", 95 | (pickle.dumps(k), v, str(k)), 96 | ) 97 | 98 | def __missing__(self, k) -> CookieJar: 99 | v: CookieJar = CookieJar() 100 | self.__setitem__(k, v) 101 | return v 102 | 103 | def __contains__(self, k) -> bool: 104 | self.cur.execute( 105 | 'SELECT cookiejar as "cookiejar [CookieJar]" ' 106 | "FROM cookies " 107 | "WHERE cookiejar_key=?", 108 | (pickle.dumps(k),), 109 | ) 110 | return bool(self.cur.fetchone()) 111 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_rpm] 2 | doc_files = docs AUTHORS INSTALL LICENSE README.rst 3 | 4 | [bdist_wheel] 5 | universal=1 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, join 2 | 3 | from setuptools import find_packages, setup 4 | 5 | with open(join(dirname(__file__), "scrapy_cookies/VERSION"), "rb") as f: 6 | version = f.read().decode("ascii").strip() 7 | 8 | 9 | extras_require = {} 10 | 11 | setup( 12 | name="Scrapy-Cookies", 13 | version=version, 14 | url="https://github.com/grammy-jiang/scrapy-cookies", 15 | description="A middleware of cookies persistence for Scrapy", 16 | long_description=open("README.rst").read(), 17 | author="Scrapedia", 18 | author_email="Scrapedia@outlook.com", 19 | maintainer="Scrapedia", 20 | maintainer_email="Scrapedia@outlook.com", 21 | license="BSD", 22 | packages=find_packages(exclude=("tests", 
"tests.*")), 23 | include_package_data=True, 24 | zip_safe=False, 25 | classifiers=[ 26 | "Framework :: Scrapy", 27 | "Development Status :: 2 - Pre-Alpha", 28 | "Environment :: Plugins", 29 | "Intended Audience :: Developers", 30 | "License :: OSI Approved :: BSD License", 31 | "Operating System :: OS Independent", 32 | "Programming Language :: Python", 33 | "Programming Language :: Python :: 2", 34 | "Programming Language :: Python :: 2.7", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3.4", 37 | "Programming Language :: Python :: 3.5", 38 | "Programming Language :: Python :: 3.6", 39 | "Programming Language :: Python :: Implementation :: CPython", 40 | "Programming Language :: Python :: Implementation :: PyPy", 41 | "Topic :: Internet :: WWW/HTTP", 42 | "Topic :: Software Development :: Libraries :: Application Frameworks", 43 | "Topic :: Software Development :: Libraries :: Python Modules", 44 | ], 45 | python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", 46 | install_requires=["hiredis", "pymongo", "redis", "scrapy", "ujson"], 47 | extras_require=extras_require, 48 | ) 49 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapedia/scrapy-cookies/342eaada3b84db4971be09862c34db9f207c0fb7/tests/__init__.py -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-benchmark 3 | pytest-cov 4 | pytest-docker-compose 5 | pytest-sugar 6 | pytest-twisted 7 | pytest-xdist 8 | testfixtures 9 | -------------------------------------------------------------------------------- /tests/test_downloadermiddleware_cookies.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from unittest import TestCase 4 | 5 | from scrapy.crawler import Crawler 6 | from scrapy.exceptions import NotConfigured 7 | from scrapy.http import Request, Response 8 | from scrapy.settings import Settings 9 | from scrapy.spiders import Spider 10 | from scrapy.utils.test import get_crawler 11 | from testfixtures import LogCapture 12 | 13 | from scrapy_cookies.downloadermiddlewares.cookies import CookiesMiddleware 14 | from scrapy_cookies.settings import default_settings 15 | 16 | 17 | class CookiesMiddlewareTest(TestCase): 18 | def assertCookieValEqual(self, first, second, msg=None): 19 | cookievaleq = lambda cv: re.split(";\s*", cv.decode("latin1")) 20 | return self.assertEqual( 21 | sorted(cookievaleq(first)), sorted(cookievaleq(second)), msg 22 | ) 23 | 24 | def setUp(self): 25 | self.spider = Spider("foo") 26 | settings = Settings() 27 | settings.setmodule(default_settings) 28 | self.crawler = Crawler(Spider, settings) 29 | self.mw = CookiesMiddleware.from_crawler(self.crawler) 30 | self.mw.spider_opened(self.spider) 31 | 32 | def tearDown(self): 33 | self.mw.spider_closed(self.spider) 34 | del self.mw 35 | 36 | def test_basic(self): 37 | req = Request("http://scrapytest.org/") 38 | assert self.mw.process_request(req, self.spider) is None 39 | assert "Cookie" not in req.headers 40 | 41 | headers = {"Set-Cookie": "C1=value1; path=/"} 42 | res = Response("http://scrapytest.org/", headers=headers) 43 | assert self.mw.process_response(req, res, self.spider) is res 44 | 45 | req2 = 
Request("http://scrapytest.org/sub1/") 46 | assert self.mw.process_request(req2, self.spider) is None 47 | self.assertEqual(req2.headers.get("Cookie"), b"C1=value1") 48 | 49 | def test_setting_false_cookies_enabled(self): 50 | self.assertRaises( 51 | NotConfigured, 52 | CookiesMiddleware.from_crawler, 53 | get_crawler(settings_dict={"COOKIES_ENABLED": False}), 54 | ) 55 | 56 | def test_setting_default_cookies_enabled(self): 57 | self.assertIsInstance( 58 | CookiesMiddleware.from_crawler(get_crawler()), CookiesMiddleware 59 | ) 60 | 61 | def test_setting_true_cookies_enabled(self): 62 | self.assertIsInstance( 63 | CookiesMiddleware.from_crawler( 64 | get_crawler(settings_dict={"COOKIES_ENABLED": True}) 65 | ), 66 | CookiesMiddleware, 67 | ) 68 | 69 | def test_setting_enabled_cookies_debug(self): 70 | crawler = get_crawler(settings_dict={"COOKIES_DEBUG": True}) 71 | mw = CookiesMiddleware.from_crawler(crawler) 72 | mw.spider_opened(self.spider) 73 | with LogCapture( 74 | "scrapy_cookies.downloadermiddlewares.cookies", 75 | propagate=False, 76 | level=logging.DEBUG, 77 | ) as l: 78 | req = Request("http://scrapytest.org/") 79 | res = Response( 80 | "http://scrapytest.org/", headers={"Set-Cookie": "C1=value1; path=/"} 81 | ) 82 | mw.process_response(req, res, crawler.spider) 83 | req2 = Request("http://scrapytest.org/sub1/") 84 | mw.process_request(req2, crawler.spider) 85 | 86 | l.check( 87 | ( 88 | "scrapy_cookies.downloadermiddlewares.cookies", 89 | "DEBUG", 90 | "Received cookies from: <200 http://scrapytest.org/>\n" 91 | "Set-Cookie: C1=value1; path=/\n", 92 | ), 93 | ( 94 | "scrapy_cookies.downloadermiddlewares.cookies", 95 | "DEBUG", 96 | "Sending cookies to: \n" 97 | "Cookie: C1=value1\n", 98 | ), 99 | ) 100 | 101 | def test_setting_disabled_cookies_debug(self): 102 | crawler = get_crawler(settings_dict={"COOKIES_DEBUG": False}) 103 | mw = CookiesMiddleware.from_crawler(crawler) 104 | mw.spider_opened(self.spider) 105 | with LogCapture( 106 | "scrapy_cookies.downloadermiddlewares.cookies", 107 | propagate=False, 108 | level=logging.DEBUG, 109 | ) as l: 110 | req = Request("http://scrapytest.org/") 111 | res = Response( 112 | "http://scrapytest.org/", headers={"Set-Cookie": "C1=value1; path=/"} 113 | ) 114 | mw.process_response(req, res, crawler.spider) 115 | req2 = Request("http://scrapytest.org/sub1/") 116 | mw.process_request(req2, crawler.spider) 117 | 118 | l.check() 119 | 120 | def test_do_not_break_on_non_utf8_header(self): 121 | req = Request("http://scrapytest.org/") 122 | assert self.mw.process_request(req, self.spider) is None 123 | assert "Cookie" not in req.headers 124 | 125 | headers = {"Set-Cookie": b"C1=in\xa3valid; path=/", "Other": b"ignore\xa3me"} 126 | res = Response("http://scrapytest.org/", headers=headers) 127 | assert self.mw.process_response(req, res, self.spider) is res 128 | 129 | req2 = Request("http://scrapytest.org/sub1/") 130 | assert self.mw.process_request(req2, self.spider) is None 131 | self.assertIn("Cookie", req2.headers) 132 | 133 | def test_dont_merge_cookies(self): 134 | # merge some cookies into jar 135 | headers = {"Set-Cookie": "C1=value1; path=/"} 136 | req = Request("http://scrapytest.org/") 137 | res = Response("http://scrapytest.org/", headers=headers) 138 | assert self.mw.process_response(req, res, self.spider) is res 139 | 140 | # test Cookie header is not seted to request 141 | req = Request("http://scrapytest.org/dontmerge", meta={"dont_merge_cookies": 1}) 142 | assert self.mw.process_request(req, self.spider) is None 143 | assert 
"Cookie" not in req.headers 144 | 145 | # check that returned cookies are not merged back to jar 146 | res = Response( 147 | "http://scrapytest.org/dontmerge", 148 | headers={"Set-Cookie": "dont=mergeme; path=/"}, 149 | ) 150 | assert self.mw.process_response(req, res, self.spider) is res 151 | 152 | # check that cookies are merged back 153 | req = Request("http://scrapytest.org/mergeme") 154 | assert self.mw.process_request(req, self.spider) is None 155 | self.assertEqual(req.headers.get("Cookie"), b"C1=value1") 156 | 157 | # check that cookies are merged when dont_merge_cookies is passed as 0 158 | req = Request("http://scrapytest.org/mergeme", meta={"dont_merge_cookies": 0}) 159 | assert self.mw.process_request(req, self.spider) is None 160 | self.assertEqual(req.headers.get("Cookie"), b"C1=value1") 161 | 162 | def test_complex_cookies(self): 163 | # merge some cookies into jar 164 | cookies = [ 165 | { 166 | "name": "C1", 167 | "value": "value1", 168 | "path": "/foo", 169 | "domain": "scrapytest.org", 170 | }, 171 | { 172 | "name": "C2", 173 | "value": "value2", 174 | "path": "/bar", 175 | "domain": "scrapytest.org", 176 | }, 177 | { 178 | "name": "C3", 179 | "value": "value3", 180 | "path": "/foo", 181 | "domain": "scrapytest.org", 182 | }, 183 | {"name": "C4", "value": "value4", "path": "/foo", "domain": "scrapy.org"}, 184 | ] 185 | 186 | req = Request("http://scrapytest.org/", cookies=cookies) 187 | self.mw.process_request(req, self.spider) 188 | 189 | # embed C1 and C3 for scrapytest.org/foo 190 | req = Request("http://scrapytest.org/foo") 191 | self.mw.process_request(req, self.spider) 192 | assert req.headers.get("Cookie") in ( 193 | b"C1=value1; C3=value3", 194 | b"C3=value3; C1=value1", 195 | ) 196 | 197 | # embed C2 for scrapytest.org/bar 198 | req = Request("http://scrapytest.org/bar") 199 | self.mw.process_request(req, self.spider) 200 | self.assertEqual(req.headers.get("Cookie"), b"C2=value2") 201 | 202 | # embed nothing for scrapytest.org/baz 203 | req = Request("http://scrapytest.org/baz") 204 | self.mw.process_request(req, self.spider) 205 | assert "Cookie" not in req.headers 206 | 207 | def test_merge_request_cookies(self): 208 | req = Request("http://scrapytest.org/", cookies={"galleta": "salada"}) 209 | assert self.mw.process_request(req, self.spider) is None 210 | self.assertEqual(req.headers.get("Cookie"), b"galleta=salada") 211 | 212 | headers = {"Set-Cookie": "C1=value1; path=/"} 213 | res = Response("http://scrapytest.org/", headers=headers) 214 | assert self.mw.process_response(req, res, self.spider) is res 215 | 216 | req2 = Request("http://scrapytest.org/sub1/") 217 | assert self.mw.process_request(req2, self.spider) is None 218 | 219 | self.assertCookieValEqual( 220 | req2.headers.get("Cookie"), b"C1=value1; galleta=salada" 221 | ) 222 | 223 | def test_cookiejar_key(self): 224 | req = Request( 225 | "http://scrapytest.org/", 226 | cookies={"galleta": "salada"}, 227 | meta={"cookiejar": "store1"}, 228 | ) 229 | assert self.mw.process_request(req, self.spider) is None 230 | self.assertEqual(req.headers.get("Cookie"), b"galleta=salada") 231 | 232 | headers = {"Set-Cookie": "C1=value1; path=/"} 233 | res = Response("http://scrapytest.org/", headers=headers, request=req) 234 | assert self.mw.process_response(req, res, self.spider) is res 235 | 236 | req2 = Request("http://scrapytest.org/", meta=res.meta) 237 | assert self.mw.process_request(req2, self.spider) is None 238 | self.assertCookieValEqual( 239 | req2.headers.get("Cookie"), b"C1=value1; galleta=salada" 
240 | ) 241 | 242 | req3 = Request( 243 | "http://scrapytest.org/", 244 | cookies={"galleta": "dulce"}, 245 | meta={"cookiejar": "store2"}, 246 | ) 247 | assert self.mw.process_request(req3, self.spider) is None 248 | self.assertEqual(req3.headers.get("Cookie"), b"galleta=dulce") 249 | 250 | headers = {"Set-Cookie": "C2=value2; path=/"} 251 | res2 = Response("http://scrapytest.org/", headers=headers, request=req3) 252 | assert self.mw.process_response(req3, res2, self.spider) is res2 253 | 254 | req4 = Request("http://scrapytest.org/", meta=res2.meta) 255 | assert self.mw.process_request(req4, self.spider) is None 256 | self.assertCookieValEqual( 257 | req4.headers.get("Cookie"), b"C2=value2; galleta=dulce" 258 | ) 259 | 260 | # cookies from hosts with port 261 | req5_1 = Request("http://scrapytest.org:1104/") 262 | assert self.mw.process_request(req5_1, self.spider) is None 263 | 264 | headers = {"Set-Cookie": "C1=value1; path=/"} 265 | res5_1 = Response( 266 | "http://scrapytest.org:1104/", headers=headers, request=req5_1 267 | ) 268 | assert self.mw.process_response(req5_1, res5_1, self.spider) is res5_1 269 | 270 | req5_2 = Request("http://scrapytest.org:1104/some-redirected-path") 271 | assert self.mw.process_request(req5_2, self.spider) is None 272 | self.assertEqual(req5_2.headers.get("Cookie"), b"C1=value1") 273 | 274 | req5_3 = Request("http://scrapytest.org/some-redirected-path") 275 | assert self.mw.process_request(req5_3, self.spider) is None 276 | self.assertEqual(req5_3.headers.get("Cookie"), b"C1=value1") 277 | 278 | # skip cookie retrieval for not http request 279 | req6 = Request("file:///scrapy/sometempfile") 280 | assert self.mw.process_request(req6, self.spider) is None 281 | self.assertEqual(req6.headers.get("Cookie"), None) 282 | 283 | def test_local_domain(self): 284 | request = Request("http://example-host/", cookies={"currencyCookie": "USD"}) 285 | assert self.mw.process_request(request, self.spider) is None 286 | self.assertIn("Cookie", request.headers) 287 | self.assertEqual(b"currencyCookie=USD", request.headers["Cookie"]) 288 | -------------------------------------------------------------------------------- /tests/test_storages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapedia/scrapy-cookies/342eaada3b84db4971be09862c34db9f207c0fb7/tests/test_storages/__init__.py -------------------------------------------------------------------------------- /tests/test_storages/confest.py: -------------------------------------------------------------------------------- 1 | pytest_plugins = ["docker_compose"] 2 | -------------------------------------------------------------------------------- /tests/test_storages/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | mongo: 4 | container_name: dc-pytest-scrapy-cookies-mongo 5 | image: mongo:latest 6 | networks: 7 | - pytest_scrapy_cookies 8 | ports: 9 | - "127.0.0.1:27017:27017" 10 | restart: always 11 | tty: true 12 | redis: 13 | container_name: dc-pytest-scrapy-cookies-redis 14 | image: redis:latest 15 | networks: 16 | - pytest_scrapy_cookies 17 | ports: 18 | - "127.0.0.1:6379:6379" 19 | restart: always 20 | tty: true 21 | 22 | networks: 23 | pytest_scrapy_cookies: 24 | driver: bridge 25 | -------------------------------------------------------------------------------- /tests/test_storages/test_storage_in_memory.py: 
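The ``cookiejar`` meta key exercised by the middleware tests above can be used from a spider to keep several independent cookie sessions; a sketch follows (the spider name and URLs are illustrative assumptions).

import scrapy


class MultiSessionSpider(scrapy.Spider):
    name = "multi_session"  # hypothetical

    def start_requests(self):
        for session_id in range(3):
            # Each distinct "cookiejar" value gets its own CookieJar in the
            # configured COOKIES_STORAGE backend.
            yield scrapy.Request(
                "http://example.com/",
                meta={"cookiejar": session_id},
                dont_filter=True,
            )

    def parse(self, response):
        # Reuse the same jar for the follow-up requests of this session.
        yield scrapy.Request(
            "http://example.com/account",
            meta={"cookiejar": response.meta["cookiejar"]},
            callback=self.parse_account,
        )

    def parse_account(self, response):
        pass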
-------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from copy import deepcopy 4 | from unittest import TestCase 5 | 6 | from scrapy import Spider 7 | from scrapy.http.cookies import CookieJar 8 | from scrapy.settings import Settings 9 | 10 | from scrapy_cookies.settings import default_settings 11 | from scrapy_cookies.storage.in_memory import InMemoryStorage 12 | 13 | 14 | class StorageTest(TestCase): 15 | def setUp(self): 16 | self.spider = Spider("foo") 17 | self.settings = Settings() 18 | self.settings.setmodule(default_settings) 19 | 20 | def tearDown(self): 21 | pass 22 | 23 | def test_in_memory(self): 24 | tmpdir = tempfile.mkdtemp() 25 | local_settings = { 26 | "COOKIES_PERSISTENCE": True, 27 | "COOKIES_PERSISTENCE_DIR": tmpdir + "/cookies", 28 | } 29 | settings = deepcopy(self.settings) 30 | settings.setdict(local_settings) 31 | 32 | storage = InMemoryStorage(settings) 33 | storage.open_spider(self.spider) 34 | 35 | cookie = storage["no_key"] 36 | self.assertIsInstance(cookie, CookieJar) 37 | self.assertDictEqual(cookie._cookies, CookieJar()._cookies) 38 | 39 | storage["key_1"] = CookieJar() 40 | self.assertIn("key_1", storage) 41 | self.assertEqual(storage["key_1"]._cookies, CookieJar()._cookies) 42 | 43 | storage.close_spider(self.spider) 44 | self.assertTrue(os.path.isfile(tmpdir + "/cookies")) 45 | -------------------------------------------------------------------------------- /tests/test_storages/test_storage_mongo.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from collections.abc import Iterable 3 | from unittest import TestCase 4 | 5 | from pytest import mark 6 | from scrapy import Spider 7 | from scrapy.http.cookies import CookieJar 8 | from scrapy.settings import Settings 9 | 10 | from scrapy_cookies.settings import default_settings 11 | from scrapy_cookies.storage.mongo import MongoStorage 12 | 13 | 14 | @mark.usefixtures("class_scoped_container_getter") 15 | class MongoStorageTest(TestCase): 16 | local_settings = { 17 | "COOKIES_STORAGE": "scrapy_cookies.storage.mongo.MongoStorage", 18 | "COOKIES_MONGO_MONGOCLIENT_HOST": "localhost", 19 | "COOKIES_MONGO_MONGOCLIENT_PORT": 27017, 20 | "COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS": dict, 21 | "COOKIES_MONGO_MONGOCLIENT_TZ_AWARE": False, 22 | "COOKIES_MONGO_MONGOCLIENT_CONNECT": True, 23 | "COOKIES_MONGO_MONGOCLIENT_KWARGS": {}, 24 | "COOKIES_MONGO_DATABASE": "cookies", 25 | "COOKIES_MONGO_COLLECTION": "cookies", 26 | } 27 | 28 | def setUp(self): 29 | self.spider = Spider("foo") 30 | self.settings = Settings() 31 | self.settings.setmodule(default_settings) 32 | self.settings.setdict(self.local_settings) 33 | self.storage = MongoStorage(self.settings) 34 | self.storage.open_spider(self.spider) 35 | 36 | def tearDown(self): 37 | self.storage.close_spider(self.spider) 38 | self.storage.coll.delete_many({}) 39 | 40 | def test_getitem(self): 41 | cookies = CookieJar() 42 | self.storage.coll.insert_one( 43 | { 44 | "key": "new_cookies", 45 | "cookiejar": pickle.dumps(cookies), 46 | "cookies": cookies._cookies, 47 | } 48 | ) 49 | 50 | self.assertDictEqual(self.storage["new_cookies"]._cookies, cookies._cookies) 51 | 52 | def test_missing(self): 53 | self.assertDictEqual( 54 | self.storage["no_exist_cookies"]._cookies, CookieJar()._cookies 55 | ) 56 | 57 | def test_setitem(self): 58 | cookies = CookieJar() 59 | self.storage["new_cookies"] = cookies 60 | self.assertDictEqual( 61 | 
self.storage.coll.find_one({"key": "new_cookies"}, {"_id": 0}), 62 | { 63 | "key": "new_cookies", 64 | "cookiejar": pickle.dumps(cookies), 65 | "cookies": cookies._cookies, 66 | }, 67 | ) 68 | 69 | def test_iter(self): 70 | self.assertIsInstance(self.storage, Iterable) 71 | -------------------------------------------------------------------------------- /tests/test_storages/test_storage_redis.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from collections.abc import Iterable 3 | from unittest import TestCase 4 | 5 | import ujson 6 | from pytest import mark 7 | from scrapy import Spider 8 | from scrapy.http.cookies import CookieJar 9 | from scrapy.settings import Settings 10 | 11 | from scrapy_cookies.settings import default_settings 12 | from scrapy_cookies.storage.redis_ import RedisStorage 13 | 14 | 15 | @mark.usefixtures("class_scoped_container_getter") 16 | class RedisStorageTest(TestCase): 17 | maxDiff = None 18 | local_settings = {} 19 | 20 | def setUp(self): 21 | self.spider = Spider("foo") 22 | self.settings = Settings() 23 | self.settings.setmodule(default_settings) 24 | self.settings.setdict(self.local_settings) 25 | self.storage = RedisStorage(self.settings) 26 | self.storage.open_spider(self.spider) 27 | 28 | def tearDown(self): 29 | self.storage.close_spider(self.spider) 30 | self.storage.r.flushall() 31 | 32 | def test_getitem(self): 33 | cookies = CookieJar() 34 | self.storage.r.hmset( 35 | "new_cookies", 36 | { 37 | "cookiejar": pickle.dumps(cookies), 38 | "cookies": ujson.dumps(cookies._cookies), 39 | }, 40 | ) 41 | self.assertDictEqual(self.storage["new_cookies"]._cookies, cookies._cookies) 42 | 43 | def test_missing(self): 44 | self.assertDictEqual( 45 | self.storage["no_exist_cookies"]._cookies, CookieJar()._cookies 46 | ) 47 | 48 | def test_setitem(self): 49 | cookies = CookieJar() 50 | self.storage["new_cookies"] = cookies 51 | _ = self.storage.r.hgetall("new_cookies") 52 | self.assertDictEqual( 53 | pickle.loads(self.storage.r.hgetall("new_cookies")[b"cookiejar"])._cookies, 54 | cookies._cookies, 55 | ) 56 | self.assertDictEqual( 57 | self.storage.r.hgetall("new_cookies"), 58 | { 59 | b"cookiejar": pickle.dumps(cookies), 60 | b"cookies": ujson.dumps(cookies._cookies).encode(), 61 | }, 62 | ) 63 | 64 | def test_iter(self): 65 | self.assertIsInstance(self.storage, Iterable) 66 | 67 | def test_len(self): 68 | self.assertEqual(len(self.storage), 0) 69 | self.storage["new_cookies_1"] = CookieJar() 70 | self.assertEqual(len(self.storage), 1) 71 | self.storage["new_cookies_2"] = CookieJar() 72 | self.assertEqual(len(self.storage), 2) 73 | 74 | def test_delitem(self): 75 | self.storage["new_cookies"] = CookieJar() 76 | del self.storage["new_cookies"] 77 | self.assertFalse(self.storage.r.hgetall("new_cookies")) 78 | -------------------------------------------------------------------------------- /tests/test_storages/test_storage_sqlite.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from copy import deepcopy 4 | from unittest import TestCase 5 | 6 | from scrapy import Spider 7 | from scrapy.http.cookies import CookieJar 8 | from scrapy.settings import Settings 9 | 10 | from scrapy_cookies.settings import default_settings 11 | from scrapy_cookies.storage.sqlite import SQLiteStorage 12 | 13 | 14 | class StorageTest(TestCase): 15 | def setUp(self): 16 | self.spider = Spider("foo") 17 | self.settings = Settings() 18 | 
self.settings.setmodule(default_settings) 19 | 20 | def tearDown(self): 21 | pass 22 | 23 | def test_sqlite(self): 24 | tmpdir = tempfile.mkdtemp() 25 | local_settings = { 26 | "COOKIES_STORAGE": "scrapy_cookies.storage.sqlite.SQLiteStorage", 27 | "COOKIES_SQLITE_DATABASE": ":memory:", 28 | "COOKIES_PERSISTENCE": True, 29 | "COOKIES_PERSISTENCE_DIR": tmpdir + "/cookies", 30 | } 31 | settings = deepcopy(self.settings) 32 | settings.setdict(local_settings) 33 | 34 | storage = SQLiteStorage(settings) 35 | storage.open_spider(self.spider) 36 | 37 | cookie = storage["no_key"] 38 | self.assertIn("no_key", storage) 39 | self.assertIsInstance(cookie, CookieJar) 40 | self.assertEqual(cookie._cookies, CookieJar()._cookies) 41 | 42 | storage["key_1"] = CookieJar() 43 | self.assertIn("key_1", storage) 44 | self.assertEqual(storage["key_1"]._cookies, CookieJar()._cookies) 45 | 46 | self.assertNotIn("key_2", storage) 47 | 48 | self.assertEqual(len(storage), 2) 49 | 50 | _dict = {"no_key": CookieJar()._cookies, "key_1": CookieJar()._cookies} 51 | for k, v in storage: 52 | self.assertDictEqual(v._cookies, _dict[k]) 53 | 54 | storage.close_spider(self.spider) 55 | self.assertTrue(os.path.isfile(tmpdir + "/cookies")) 56 | 57 | storage_2 = SQLiteStorage(settings) 58 | storage_2.open_spider(self.spider) 59 | self.assertIn("key_1", storage_2) 60 | self.assertDictEqual(storage_2["key_1"]._cookies, CookieJar()._cookies) 61 | 62 | storage_2.close_spider(self.spider) 63 | self.assertTrue(os.path.isfile(tmpdir + "/cookies")) 64 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (https://tox.readthedocs.io/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py36,py37 8 | 9 | [testenv] 10 | commands = 11 | pytest 12 | deps = 13 | -r requirements.txt 14 | -r tests/requirements.txt 15 | passenv = 16 | PYTHONPATH 17 | 18 | [docs] 19 | changedir = docs 20 | deps = 21 | -r docs/requirements.txt 22 | 23 | [testenv:docs] 24 | changedir = {[docs]changedir} 25 | deps = {[docs]deps} 26 | commands = 27 | sphinx-build -W -b html . {envtmpdir}/html 28 | 29 | [testenv:docs-links] 30 | changedir = {[docs]changedir} 31 | deps = {[docs]deps} 32 | commands = 33 | sphinx-build -W -b linkcheck . {envtmpdir}/linkcheck 34 | --------------------------------------------------------------------------------
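Finally, a sketch of running a crawl with this middleware and the MongoDB storage from a standalone script, using Scrapy's ``CrawlerProcess``. The spider import, the priority ``700`` and the MongoDB endpoint are illustrative assumptions; only the setting names come from the package itself.

from scrapy.crawler import CrawlerProcess

from my_project.spiders import MultiSessionSpider  # hypothetical spider

process = CrawlerProcess(
    settings={
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": None,
            "scrapy_cookies.downloadermiddlewares.cookies.CookiesMiddleware": 700,
        },
        "COOKIES_STORAGE": "scrapy_cookies.storage.mongo.MongoStorage",
        "COOKIES_MONGO_MONGOCLIENT_HOST": "localhost",
        "COOKIES_MONGO_MONGOCLIENT_PORT": 27017,
        "COOKIES_MONGO_DATABASE": "cookies",
        "COOKIES_MONGO_COLLECTION": "cookies",
    }
)
process.crawl(MultiSessionSpider)
process.start()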