├── .coveragerc
├── .gitignore
├── .travis.yml
├── .whitesource
├── AUTHORS
├── INSTALL
├── LICENSE
├── MANIFEST.in
├── Pipfile
├── README.rst
├── codecov.yml
├── docker
│   ├── MongoDB
│   │   └── docker-compose.yml
│   └── Redis
│       └── docker-compose.yml
├── docs
│   ├── Makefile
│   ├── _ext
│   │   └── scrapydocs.py
│   ├── _static
│   │   └── selectors-sample1.html
│   ├── _templates
│   │   └── layout.html
│   ├── conf.py
│   ├── conf.py.bak
│   ├── index.rst
│   ├── intro
│   │   ├── examples.rst
│   │   ├── installation.rst
│   │   ├── overview.rst
│   │   └── tutorial.rst
│   ├── make.bat
│   ├── requirements.txt
│   └── topics
│       ├── cookiesmiddleware.rst
│       ├── settings.rst
│       └── storage.rst
├── pytest.ini
├── renovate.json
├── requirements.txt
├── scrapy_cookies
│   ├── VERSION
│   ├── __init__.py
│   ├── downloadermiddlewares
│   │   ├── __init__.py
│   │   └── cookies.py
│   ├── settings
│   │   ├── __init__.py
│   │   └── default_settings.py
│   ├── signals.py
│   └── storage
│       ├── __init__.py
│       ├── in_memory.py
│       ├── mongo.py
│       ├── redis_.py
│       └── sqlite.py
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   ├── requirements.txt
│   ├── test_downloadermiddleware_cookies.py
│   └── test_storages
│       ├── __init__.py
│       ├── confest.py
│       ├── docker-compose.yml
│       ├── test_storage_in_memory.py
│       ├── test_storage_mongo.py
│       ├── test_storage_redis.py
│       └── test_storage_sqlite.py
└── tox.ini
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = true
3 | include = scrapy_cookies/*
4 | omit =
5 | tests/*
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # .gitignore contents are coming from the project gitignore:
2 | # https://github.com/github/gitignore
3 |
4 |
5 | # -----------------------------------------------------------------------------
6 | # Python
7 | # -----------------------------------------------------------------------------
8 |
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 | *$py.class
13 |
14 | # C extensions
15 | *.so
16 |
17 | # Distribution / packaging
18 | .Python
19 | build/
20 | develop-eggs/
21 | dist/
22 | downloads/
23 | eggs/
24 | .eggs/
25 | lib/
26 | lib64/
27 | parts/
28 | sdist/
29 | var/
30 | wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | .hypothesis/
56 | .pytest_cache/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # celery beat schedule file
87 | celerybeat-schedule
88 |
89 | # SageMath parsed files
90 | *.sage.py
91 |
92 | # Environments
93 | .env
94 | .venv
95 | env/
96 | venv/
97 | ENV/
98 | env.bak/
99 | venv.bak/
100 |
101 | # Spyder project settings
102 | .spyderproject
103 | .spyproject
104 |
105 | # Rope project settings
106 | .ropeproject
107 |
108 | # mkdocs documentation
109 | /site
110 |
111 | # mypy
112 | .mypy_cache/
113 |
114 |
115 | # -----------------------------------------------------------------------------
116 | # JetBrains
117 | # -----------------------------------------------------------------------------
118 |
119 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion,
120 | # Android Studio and WebStorm
121 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
122 |
123 | # User-specific stuff
124 | .idea/**/workspace.xml
125 | .idea/**/tasks.xml
126 | .idea/**/usage.statistics.xml
127 | .idea/**/dictionaries
128 | .idea/**/shelf
129 |
130 | # Sensitive or high-churn files
131 | .idea/**/dataSources/
132 | .idea/**/dataSources.ids
133 | .idea/**/dataSources.local.xml
134 | .idea/**/sqlDataSources.xml
135 | .idea/**/dynamic.xml
136 | .idea/**/uiDesigner.xml
137 | .idea/**/dbnavigator.xml
138 |
139 | # Gradle
140 | .idea/**/gradle.xml
141 | .idea/**/libraries
142 |
143 | # Gradle and Maven with auto-import
144 | # When using Gradle or Maven with auto-import, you should exclude module files,
145 | # since they will be recreated, and may cause churn. Uncomment if using
146 | # auto-import.
147 | # .idea/modules.xml
148 | # .idea/*.iml
149 | # .idea/modules
150 |
151 | # CMake
152 | cmake-build-*/
153 |
154 | # Mongo Explorer plugin
155 | .idea/**/mongoSettings.xml
156 |
157 | # File-based project format
158 | *.iws
159 |
160 | # IntelliJ
161 | out/
162 |
163 | # mpeltonen/sbt-idea plugin
164 | .idea_modules/
165 |
166 | # JIRA plugin
167 | atlassian-ide-plugin.xml
168 |
169 | # Cursive Clojure plugin
170 | .idea/replstate.xml
171 |
172 | # Crashlytics plugin (for Android Studio and IntelliJ)
173 | com_crashlytics_export_strings.xml
174 | crashlytics.properties
175 | crashlytics-build.properties
176 | fabric.properties
177 |
178 | # Editor-based Rest Client
179 | .idea/httpRequests
180 |
181 |
182 | # -----------------------------------------------------------------------------
183 | # Linux
184 | # -----------------------------------------------------------------------------
185 |
186 | *~
187 |
188 | # temporary files which can be created if a process still has a handle open of
189 | # a deleted file
190 | .fuse_hidden*
191 |
192 | # KDE directory preferences
193 | .directory
194 |
195 | # Linux trash folder which might appear on any partition or disk
196 | .Trash-*
197 |
198 | # .nfs files are created when an open file is removed but is still being
199 | # accessed
200 | .nfs*
201 |
202 |
203 | # -----------------------------------------------------------------------------
204 | # Vim
205 | # -----------------------------------------------------------------------------
206 |
207 | # Swap
208 | [._]*.s[a-v][a-z]
209 | [._]*.sw[a-p]
210 | [._]s[a-rt-v][a-z]
211 | [._]ss[a-gi-z]
212 | [._]sw[a-p]
213 |
214 | # Session
215 | Session.vim
216 |
217 | # Temporary
218 | .netrwhist
219 | *~
220 | # Auto-generated tag files
221 | tags
222 | # Persistent undo
223 | [._]*.un~
224 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | services:
3 | - mongodb
4 | - redis-server
5 | sudo: false
6 | branches:
7 | only:
8 | - master
9 | - /^\d\.\d+$/
10 | - /^\d\.\d+\.\d+(rc\d+|\.dev\d+)?$/
11 | matrix:
12 | include:
13 | - python: 3.6
14 | env: TOXENV=py36
15 | - python: 3.7
16 | env: TOXENV=py37
17 | dist: xenial
18 | - python: 3.6
19 | env: TOXENV=docs
20 | install:
21 | - pip install -U tox twine wheel codecov
22 | script: tox
23 | after_success:
24 | - codecov
25 | notifications:
26 | slack:
27 | rooms:
28 | - secure: zsDJgHzhPoAIs8OsOiv5wmNsck++hjZljeYAfKh25UwW8X97Rqvq5r9LMlQzIHf2a638AbsubDNeSbrxxu6cFDeeIFngG1EO5mOSWzKr18LM5pFb0GVlImKzZpKgLqKmaD5ATYXnvUaHjEHgO45TzjIsbwo9P4vRU5C/lGYwfdv/J82hP0OUo02HqWGkwpG0aeuzs1bJZKjS/RdHROt0SQpRfVB8hi4HHrQILgliuVcpvIk46FgRB49VmzpAGuQfJtB06gj8o6tL/1JlXQ9/ElrHwEJyGjiyeP/nP8qit+i9TTlHGT9k0s9oYuXWM8OlgKfKE13Mo8fVRaAhVv9DRcwtNpX5M0RtC5bEjCPQIL14ky4ymeSlGchmy37jTKJCNHm4St4CtodCrF5J77h8Gkjx9tkZOhf4Rd8veMMgv/gj8pyt3asJ8PMDvREjF4n4mRPy5SB53anEhrFXE801KOpqb4Ffsjv2DBJmuAId+OmHLs69jHeiwxkBDaeKDr6rpiiKQaZNbDw5KxjEafEtclVmSdprq57Og2SPaCR1TrUu3SVyUjVoWNj6olKS9ALoiDAVLBprbyBsSS9gYwfTlSBNxCsMApQksjmo0/S6n/FwyCvn4AZZVziLVtVxPBY0sUpRBNySkTTrQzpCiEPNmv7tU8d0ZcVI508/WALzIQ0=
29 | cache:
30 | directories:
31 | - $HOME/.cache/pip
32 | #deploy:
33 | # provider: pypi
34 | # distributions: sdist bdist_wheel
35 | # user: grammy.jiang
36 | # password:
37 | # secure: nUWjH3+9D9I+Xrsz7isjVKpzXwxlJuWFi2OrWCMSilxUNaMrV/4fA0TShmS40TCxTGSasmApjZgZz+Qu93Z9KlHnP2nmBsEXnqtrrCMIhI52wLFdnMcTCNCutzOiKzVSMK/SvEvRP6+fcWRbsE0n0hVwUZc/Lwz4083OXoMQIuMs3NbVD0rAPcHTBthTwabQjSp8WwYv9wZj/pZQ7qYw+QOe+b8XhQIIA10Oy9rAcyaOGASMsbBithKap91ayj2yRqmM6kb+nwi4aEJo/+XwQuncJWleTOy88Rt+YtoYkDxoHopjwBR2RAoevfq0Y1Mjl9e1mssunzQ053qmXfKAFB77Xn99iR0bmwSwwCtyBgoY/Ed5+wywwNdE6tfNB8/pYXg3z6mTmIwqXCQhn2+ORdD5RFn9RZAV4IoR1z8WRKU6clsVF2Msc9QUsj5wUA7LXkBg4HlVJurZurbFh58ViVTO2aNo6c+7fBiBwbm2aupeB+RlL9kCz14pbJcd89H6ViWByE6O9pFlyNcEt28FaKLIuyWAsAsYOPnj74oYuoV2hZ7y0259ncGX0UsDVzPwaJ/NlQsi4yh2d300mRvOSbiELhBZdABbkN+pgGmE1mlqUkY9GHb070JsOavzedzsuEgBLAaWgTAxeDd9LqFfIE7iFLj+U9v9d73ZtKy4VeE=
38 | # on:
39 | # branch: master
40 | ## tags: true
41 | # repo: grammy-jiang/scrapy-cookies
42 | # condition: "$TOXENV == py36"
43 |
--------------------------------------------------------------------------------
/.whitesource:
--------------------------------------------------------------------------------
1 | {
2 | "generalSettings": {
3 | "shouldScanRepo": true
4 | },
5 | "checkRunSettings": {
6 | "vulnerableCheckRunConclusionLevel": "failure"
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Scrapy-Cookies is part of Scrapy Enhancement, which intends to explore more
2 | possibilities for Scrapy. Once the code is proven useful and stable, it will be
3 | merged back into Scrapy or contributed to the Scrapy Plugins.
4 |
5 | Here is the list of the primary authors & contributors:
6 |
7 | * Grammy Jiang
8 |
--------------------------------------------------------------------------------
/INSTALL:
--------------------------------------------------------------------------------
1 | For information about installing Scrapy-Cookies see:
2 |
3 | * docs/intro/installation.rst (local file)
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) Scrapy Enhancement developers.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification,
5 | are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice,
8 | this list of conditions, and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions, and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 |
14 | 3. Neither the name of Scrapy nor the names of its contributors may be used
15 | to endorse or promote products derived from this software without
16 | specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include AUTHORS
3 | include INSTALL
4 | include LICENSE
5 | include MANIFEST.in
6 | include scrapy_cookies/VERSION
7 | recursive-include docs *
8 | prune docs/build
9 | recursive-include tests *
10 | global-exclude __pycache__ *.py[cod]
11 |
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 |
6 | [dev-packages]
7 | bandit = "*"
8 | black = "*"
9 | flake8 = "*"
10 | flake8-bugbear = "*"
11 | ipython = "*"
12 | isort = "*"
13 | mitmproxy = "*"
14 | mypy = "*"
15 | pre-commit = "*"
16 | prospector = "*"
17 | pylint = "*"
18 | pytest = "*"
19 | pytest-benchmark = "*"
20 | pytest-black = "*"
21 | pytest-cov = "*"
22 | pytest-docker-compose = "*"
23 | pytest-env = "*"
24 | pytest-instafail = "*"
25 | pytest-mypy = "*"
26 | pytest-pycharm = "*"
27 | pytest-pylint = "*"
28 | pytest-sugar = "*"
29 | pytest-twisted = "*"
30 | pytest-watch = "*"
31 | pytest-xdist = "*"
32 | radon = "*"
33 | tox = "*"
34 | testfixtures = "*"
35 |
36 | [packages]
37 | hiredis = "*"
38 | pymongo = "*"
39 | redis = "*"
40 | six = "*"
41 | ujson = "*"
42 | Scrapy = "*"
43 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ==============
2 | Scrapy Cookies
3 | ==============
4 |
5 | .. image:: https://img.shields.io/pypi/v/scrapy-cookies.svg
6 | :target: https://pypi.python.org/pypi/scrapy-cookies
7 | :alt: PyPI
8 |
9 | .. image:: https://img.shields.io/pypi/pyversions/scrapy-cookies.svg
10 | :target: https://pypi.python.org/pypi/scrapy-cookies
11 | :alt: PyPI - Python Version
12 |
13 | .. image:: https://img.shields.io/travis/scrapedia/scrapy-cookies/master.svg
14 | :target: http://travis-ci.org/scrapedia/scrapy-cookies
15 | :alt: Travis branch
16 |
17 | .. image:: https://img.shields.io/pypi/wheel/scrapy-cookies.svg
18 | :target: https://pypi.python.org/pypi/scrapy-cookies
19 | :alt: PyPI - Wheel
20 |
21 | .. image:: https://img.shields.io/codecov/c/github/scrapedia/scrapy-cookies/master.svg
22 | :target: http://codecov.io/github/scrapedia/scrapy-cookies?branch=master
23 | :alt: Codecov branch
24 |
25 | Overview
26 | ========
27 |
28 | This middleware enables Scrapy to manage, save and restore cookies in various
29 | ways. With this middleware Scrapy can easily re-use cookies saved before or
30 | in multiple spiders, and share cookies between spiders, even in a spider cluster.
31 |
32 | Requirements
33 | ============
34 |
35 | * Python 2.7 or Python 3.4+
36 | * Works on Linux, Windows, Mac OSX, BSD
37 |
38 | Installation
39 | ============
40 |
41 | The quick way::
42 |
43 |     pip install scrapy-cookies
44 |
45 | For more details see the installation section in the documentation:
46 | https://scrapy-cookies.readthedocs.io/en/latest/intro/installation.html
47 |
48 | Documentation
49 | =============
50 |
51 | Documentation is available online at
52 | https://scrapy-cookies.readthedocs.io/en/latest/ and in the ``docs`` directory.
53 |
54 | Releases
55 | ========
56 |
57 | You can find release notes at
58 | https://scrapy-cookies.readthedocs.io/en/latest/news.html
59 |
60 | Community (blog, twitter, mail list, IRC)
61 | =========================================
62 |
63 | *This section is kept the same as Scrapy's, with the intention of benefiting Scrapy.*
64 |
65 | See https://scrapy.org/community/
66 |
67 | Contributing
68 | ============
69 |
70 | *This section is kept the same as Scrapy's, to make it easier when this repo
71 | is merged back into Scrapy.*
72 |
73 | See https://doc.scrapy.org/en/master/contributing.html
74 |
75 | Code of Conduct
76 | ---------------
77 |
78 | Please note that this project is released with a Contributor Code of Conduct
79 | (see https://github.com/scrapy/scrapy/blob/master/CODE_OF_CONDUCT.md).
80 |
81 | By participating in this project you agree to abide by its terms.
82 | Please report unacceptable behavior to opensource@scrapinghub.com.
83 |
84 |
85 | Companies using Scrapy
86 | ======================
87 |
88 | *This section is kept the same as Scrapy's, with the intention of benefiting Scrapy.*
89 |
90 | See https://scrapy.org/companies/
91 |
92 | Commercial Support
93 | ==================
94 |
95 | *This section is kept the same as Scrapy's, with the intention of benefiting Scrapy.*
96 |
97 | See https://scrapy.org/support/
98 |
99 | TODO
100 | ====
101 |
102 | * [X] Remove support for Python versions lower than 3.6
103 | * [ ] Use JSON1 extension in sqlite backend
104 | * [ ] Update backend arguments calling way
105 | * [ ] Replace pymongo with txmongo in MongoDB backend
106 | * [ ] Replace redis sync driver with async driver in Redis backend
107 | * [ ] Change LICENSE to GPLv3
108 | * [ ] Use versioneer for version management
109 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment:
2 | layout: "header, diff, tree"
3 |
4 | coverage:
5 | status:
6 | project: false
7 | notify:
8 | slack:
9 | default:
10 | url: "secret:KQc0qNe30SGOA3baphzz48aXGWPJlE6qDlk4qZGGdW8fAEEJG8lHubU9301vJCECqEhv5E+JNHXfWKd+bcKjhIc5nhgt2w2BaZyEXEawhaTx0MJZ8xjX/unaul2wA5rL3ZkV4loVbN34sOq7vFgEzSS"
11 | branches: null
12 | flags: null
13 | only_pulls: false
14 | paths: null
15 | threshold: 1%
16 |
--------------------------------------------------------------------------------
/docker/MongoDB/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 | # https://hub.docker.com/_/mongo/
4 | mongo:
5 | container_name: dc-mongo
6 | image: mongo:latest
7 | networks:
8 | - mongo
9 | ports:
10 | - "127.0.0.1:27017:27017"
11 | restart: always
12 | tty: true
13 | # https://hub.docker.com/_/mongo-express/
14 | mongo-express:
15 | container_name: dc-mongodb-express
16 | depends_on:
17 | - mongo
18 | environment:
19 | ME_CONFIG_MONGODB_PORT: 27017
20 | ME_CONFIG_MONGODB_SERVER: mongo
21 | image: mongo-express:latest
22 | links:
23 | - mongo
24 | networks:
25 | - mongo
26 | ports:
27 | - "127.0.0.1:8081:8081"
28 | restart: always
29 | tty: true
30 |
31 | networks:
32 | mongo:
33 | driver: bridge
34 |
--------------------------------------------------------------------------------
/docker/Redis/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 | redis:
4 | container_name: dc-redis
5 | image: redis:latest
6 | networks:
7 | - redis
8 | ports:
9 | - "127.0.0.1:6379:6379"
10 | restart: always
11 | tty: true
12 | redis-commander:
13 | command: --redis-host redis
14 | container_name: dc-redis-commander
15 | depends_on:
16 | - redis
17 | image: tenstartups/redis-commander:latest
18 | links:
19 | - redis
20 | networks:
21 | - redis
22 | ports:
23 | - "127.0.0.1:8181:8081"
24 | restart: always
25 | tty: true
26 |
27 | networks:
28 | redis:
29 | driver: bridge
30 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = Scrapy-Cookies
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/_ext/scrapydocs.py:
--------------------------------------------------------------------------------
1 | from docutils.parsers.rst.roles import set_classes
2 | from docutils import nodes
3 | from docutils.parsers.rst import Directive
4 | from sphinx.util.nodes import make_refnode
5 | from operator import itemgetter
6 |
7 |
8 | class settingslist_node(nodes.General, nodes.Element):
9 | pass
10 |
11 |
12 | class SettingsListDirective(Directive):
13 | def run(self):
14 | return [settingslist_node('')]
15 |
16 |
17 | def is_setting_index(node):
18 | if node.tagname == 'index':
19 | # index entries for setting directives look like:
20 | # [(u'pair', u'SETTING_NAME; setting', u'std:setting-SETTING_NAME', '')]
21 | entry_type, info, refid = node['entries'][0][:3]
22 | return entry_type == 'pair' and info.endswith('; setting')
23 | return False
24 |
25 |
26 | def get_setting_target(node):
27 | # target nodes are placed next to the node in the doc tree
28 | return node.parent[node.parent.index(node) + 1]
29 |
30 |
31 | def get_setting_name_and_refid(node):
32 | """Extract setting name from directive index node"""
33 | entry_type, info, refid = node['entries'][0][:3]
34 | return info.replace('; setting', ''), refid
35 |
36 |
37 | def collect_scrapy_settings_refs(app, doctree):
38 | env = app.builder.env
39 |
40 | if not hasattr(env, 'scrapy_all_settings'):
41 | env.scrapy_all_settings = []
42 |
43 | for node in doctree.traverse(is_setting_index):
44 | targetnode = get_setting_target(node)
45 | assert isinstance(targetnode, nodes.target), "Next node is not a target"
46 |
47 | setting_name, refid = get_setting_name_and_refid(node)
48 |
49 | env.scrapy_all_settings.append({
50 | 'docname': env.docname,
51 | 'setting_name': setting_name,
52 | 'refid': refid,
53 | })
54 |
55 |
56 | def make_setting_element(setting_data, app, fromdocname):
57 | refnode = make_refnode(app.builder, fromdocname,
58 | todocname=setting_data['docname'],
59 | targetid=setting_data['refid'],
60 | child=nodes.Text(setting_data['setting_name']))
61 | p = nodes.paragraph()
62 | p += refnode
63 |
64 | item = nodes.list_item()
65 | item += p
66 | return item
67 |
68 |
69 | def replace_settingslist_nodes(app, doctree, fromdocname):
70 | env = app.builder.env
71 |
72 | for node in doctree.traverse(settingslist_node):
73 | settings_list = nodes.bullet_list()
74 | settings_list.extend([make_setting_element(d, app, fromdocname)
75 | for d in sorted(env.scrapy_all_settings,
76 | key=itemgetter('setting_name'))
77 | if fromdocname != d['docname']])
78 | node.replace_self(settings_list)
79 |
80 |
81 | def setup(app):
82 | app.add_crossref_type(
83 | directivename = "setting",
84 | rolename = "setting",
85 | indextemplate = "pair: %s; setting",
86 | )
87 | app.add_crossref_type(
88 | directivename = "signal",
89 | rolename = "signal",
90 | indextemplate = "pair: %s; signal",
91 | )
92 | app.add_crossref_type(
93 | directivename = "command",
94 | rolename = "command",
95 | indextemplate = "pair: %s; command",
96 | )
97 | app.add_crossref_type(
98 | directivename = "reqmeta",
99 | rolename = "reqmeta",
100 | indextemplate = "pair: %s; reqmeta",
101 | )
102 | app.add_role('source', source_role)
103 | app.add_role('commit', commit_role)
104 | app.add_role('issue', issue_role)
105 | app.add_role('rev', rev_role)
106 |
107 | app.add_node(settingslist_node)
108 | app.add_directive('settingslist', SettingsListDirective)
109 |
110 | app.connect('doctree-read', collect_scrapy_settings_refs)
111 | app.connect('doctree-resolved', replace_settingslist_nodes)
112 |
113 |
114 | def source_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
115 | ref = 'https://github.com/scrapy/scrapy/blob/master/' + text
116 | set_classes(options)
117 | node = nodes.reference(rawtext, text, refuri=ref, **options)
118 | return [node], []
119 |
120 |
121 | def issue_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
122 | ref = 'https://github.com/scrapy/scrapy/issues/' + text
123 | set_classes(options)
124 | node = nodes.reference(rawtext, 'issue ' + text, refuri=ref, **options)
125 | return [node], []
126 |
127 |
128 | def commit_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
129 | ref = 'https://github.com/scrapy/scrapy/commit/' + text
130 | set_classes(options)
131 | node = nodes.reference(rawtext, 'commit ' + text, refuri=ref, **options)
132 | return [node], []
133 |
134 |
135 | def rev_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
136 | ref = 'http://hg.scrapy.org/scrapy/changeset/' + text
137 | set_classes(options)
138 | node = nodes.reference(rawtext, 'r' + text, refuri=ref, **options)
139 | return [node], []
140 |
--------------------------------------------------------------------------------
/docs/_static/selectors-sample1.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Example website
5 |
6 |
7 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/docs/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 |
3 | {% block footer %}
4 | {{ super() }}
5 |
16 | {% endblock %}
17 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Scrapy-Cookies documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Nov 24 12:02:52 2008.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # The contents of this file are pickled, so don't put values in the namespace
9 | # that aren't pickleable (module imports are okay, they're removed
10 | # automatically).
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys
16 | from os import path
17 |
18 | # If your extensions are in another directory, add it here. If the directory
19 | # is relative to the documentation root, use os.path.abspath to make it
20 | # absolute, like shown here.
21 | sys.path.append(path.join(path.dirname(__file__), "_ext"))
22 | sys.path.insert(0, path.dirname(path.dirname(__file__)))
23 |
24 |
25 | # General configuration
26 | # ---------------------
27 |
28 | # Add any Sphinx extension module names here, as strings. They can be extensions
29 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
30 | extensions = [
31 | 'scrapydocs',
32 | 'sphinx.ext.autodoc'
33 | ]
34 |
35 | # Add any paths that contain templates here, relative to this directory.
36 | templates_path = ['_templates']
37 |
38 | # The suffix of source filenames.
39 | source_suffix = '.rst'
40 |
41 | # The encoding of source files.
42 | #source_encoding = 'utf-8'
43 |
44 | # The master toctree document.
45 | master_doc = 'index'
46 |
47 | # General information about the project.
48 | project = 'Scrapy-Cookies'
49 | copyright = '2018, Scrapy Enhancement developers'
50 |
51 | # The version info for the project you're documenting, acts as replacement for
52 | # |version| and |release|, also used in various other places throughout the
53 | # built documents.
54 | #
55 | # The short X.Y version.
56 | try:
57 | import scrapy_cookies
58 | version = '.'.join(map(str, scrapy_cookies.version_info[:2]))
59 | release = scrapy_cookies.__version__
60 | except ImportError:
61 | version = ''
62 | release = ''
63 |
64 | # The language for content autogenerated by Sphinx. Refer to documentation
65 | # for a list of supported languages.
66 | language = 'en'
67 |
68 | # There are two options for replacing |today|: either, you set today to some
69 | # non-false value, then it is used:
70 | #today = ''
71 | # Else, today_fmt is used as the format for a strftime call.
72 | #today_fmt = '%B %d, %Y'
73 |
74 | # List of documents that shouldn't be included in the build.
75 | #unused_docs = []
76 |
77 | # List of directories, relative to source directory, that shouldn't be searched
78 | # for source files.
79 | exclude_trees = ['.build']
80 |
81 | # The reST default role (used for this markup: `text`) to use for all documents.
82 | #default_role = None
83 |
84 | # If true, '()' will be appended to :func: etc. cross-reference text.
85 | #add_function_parentheses = True
86 |
87 | # If true, the current module name will be prepended to all description
88 | # unit titles (such as .. function::).
89 | #add_module_names = True
90 |
91 | # If true, sectionauthor and moduleauthor directives will be shown in the
92 | # output. They are ignored by default.
93 | #show_authors = False
94 |
95 | # The name of the Pygments (syntax highlighting) style to use.
96 | pygments_style = 'sphinx'
97 |
98 |
99 | # Options for HTML output
100 | # -----------------------
101 |
102 | # The theme to use for HTML and HTML Help pages. See the documentation for
103 | # a list of builtin themes.
104 | html_theme = 'sphinx_rtd_theme'
105 |
106 | # Theme options are theme-specific and customize the look and feel of a theme
107 | # further. For a list of options available for each theme, see the
108 | # documentation.
109 | #html_theme_options = {}
110 |
111 | # Add any paths that contain custom themes here, relative to this directory.
112 | # Add path to the RTD explicitly to robustify builds (otherwise might
113 | # fail in a clean Debian build env)
114 | import sphinx_rtd_theme
115 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
116 |
117 |
118 | # The style sheet to use for HTML and HTML Help pages. A file of that name
119 | # must exist either in Sphinx' static/ path, or in one of the custom paths
120 | # given in html_static_path.
121 | # html_style = 'scrapydoc.css'
122 |
123 | # The name for this set of Sphinx documents. If None, it defaults to
124 | # " v documentation".
125 | #html_title = None
126 |
127 | # A shorter title for the navigation bar. Default is the same as html_title.
128 | #html_short_title = None
129 |
130 | # The name of an image file (relative to this directory) to place at the top
131 | # of the sidebar.
132 | #html_logo = None
133 |
134 | # The name of an image file (within the static path) to use as favicon of the
135 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
136 | # pixels large.
137 | #html_favicon = None
138 |
139 | # Add any paths that contain custom static files (such as style sheets) here,
140 | # relative to this directory. They are copied after the builtin static files,
141 | # so a file named "default.css" will overwrite the builtin "default.css".
142 | html_static_path = ['_static']
143 |
144 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
145 | # using the given strftime format.
146 | html_last_updated_fmt = '%b %d, %Y'
147 |
148 | # Custom sidebar templates, maps document names to template names.
149 | #html_sidebars = {}
150 |
151 | # Additional templates that should be rendered to pages, maps page names to
152 | # template names.
153 | #html_additional_pages = {}
154 |
155 | # If false, no module index is generated.
156 | #html_use_modindex = True
157 |
158 | # If false, no index is generated.
159 | #html_use_index = True
160 |
161 | # If true, the index is split into individual pages for each letter.
162 | #html_split_index = False
163 |
164 | # If true, the reST sources are included in the HTML build as _sources/.
165 | html_copy_source = True
166 |
167 | # If true, an OpenSearch description file will be output, and all pages will
168 | # contain a tag referring to it. The value of this option must be the
169 | # base URL from which the finished HTML is served.
170 | #html_use_opensearch = ''
171 |
172 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
173 | #html_file_suffix = ''
174 |
175 | # Output file base name for HTML help builder.
176 | htmlhelp_basename = 'Scrapydoc'
177 |
178 |
179 | # Options for LaTeX output
180 | # ------------------------
181 |
182 | # The paper size ('letter' or 'a4').
183 | #latex_paper_size = 'letter'
184 |
185 | # The font size ('10pt', '11pt' or '12pt').
186 | #latex_font_size = '10pt'
187 |
188 | # Grouping the document tree into LaTeX files. List of tuples
189 | # (source start file, target name, title, author, document class [howto/manual]).
190 | latex_documents = [
191 | ('index', 'Scrapy.tex', u'Scrapy Documentation',
192 | u'Scrapy developers', 'manual'),
193 | ]
194 |
195 | # The name of an image file (relative to this directory) to place at the top of
196 | # the title page.
197 | #latex_logo = None
198 |
199 | # For "manual" documents, if this is true, then toplevel headings are parts,
200 | # not chapters.
201 | #latex_use_parts = False
202 |
203 | # Additional stuff for the LaTeX preamble.
204 | #latex_preamble = ''
205 |
206 | # Documents to append as an appendix to all manuals.
207 | #latex_appendices = []
208 |
209 | # If false, no module index is generated.
210 | #latex_use_modindex = True
211 |
212 |
213 | # Options for the linkcheck builder
214 | # ---------------------------------
215 |
216 | # A list of regular expressions that match URIs that should not be checked when
217 | # doing a linkcheck build.
218 | linkcheck_ignore = [
219 | 'http://localhost:\d+', 'http://hg.scrapy.org',
220 | 'http://directory.google.com/'
221 | ]
222 |
--------------------------------------------------------------------------------
/docs/conf.py.bak:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Configuration file for the Sphinx documentation builder.
4 | #
5 | # This file does only contain a selection of the most common options. For a
6 | # full list see the documentation:
7 | # http://www.sphinx-doc.org/en/master/config
8 |
9 | # -- Path setup --------------------------------------------------------------
10 |
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | #
15 | import os
16 | import sys
17 | # sys.path.insert(0, os.path.abspath('.'))
18 |
19 | sys.path.append(os.path.join(os.path.dirname(__file__), "_ext"))
20 | sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
21 |
22 | # -- Project information -----------------------------------------------------
23 |
24 | project = 'Scrapy-Cookies'
25 | copyright = '2018, Grammy Jiang'
26 | author = 'Grammy Jiang'
27 |
28 | try:
29 | import scrapy
30 | version = '.'.join(map(str, scrapy.version_info[:2]))
31 | release = scrapy.__version__
32 | except ImportError:
33 | version = ''
34 | release = ''
35 |
36 | # The short X.Y version
37 | # version = ''
38 | # The full version, including alpha/beta/rc tags
39 | # release = '0.0.1'
40 |
41 |
42 | # -- General configuration ---------------------------------------------------
43 |
44 | # If your documentation needs a minimal Sphinx version, state it here.
45 | #
46 | # needs_sphinx = '1.0'
47 |
48 | # Add any Sphinx extension module names here, as strings. They can be
49 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
50 | # ones.
51 | extensions = [
52 | 'scrapydocs',
53 | 'sphinx.ext.autodoc',
54 | 'sphinx.ext.doctest',
55 | 'sphinx.ext.intersphinx',
56 | 'sphinx.ext.todo',
57 | 'sphinx.ext.coverage',
58 | 'sphinx.ext.mathjax',
59 | 'sphinx.ext.ifconfig',
60 | 'sphinx.ext.viewcode',
61 | 'sphinx.ext.githubpages',
62 | ]
63 |
64 | # Add any paths that contain templates here, relative to this directory.
65 | templates_path = ['_templates']
66 |
67 | # The suffix(es) of source filenames.
68 | # You can specify multiple suffix as a list of string:
69 | #
70 | # source_suffix = ['.rst', '.md']
71 | source_suffix = '.rst'
72 |
73 | # The master toctree document.
74 | master_doc = 'index'
75 |
76 | # The language for content autogenerated by Sphinx. Refer to documentation
77 | # for a list of supported languages.
78 | #
79 | # This is also used if you do content translation via gettext catalogs.
80 | # Usually you set "language" from the command line for these cases.
81 | language = None
82 |
83 | # List of patterns, relative to source directory, that match files and
84 | # directories to ignore when looking for source files.
85 | # This pattern also affects html_static_path and html_extra_path .
86 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
87 |
88 | # The name of the Pygments (syntax highlighting) style to use.
89 | pygments_style = 'sphinx'
90 |
91 |
92 | # -- Options for HTML output -------------------------------------------------
93 |
94 | # The theme to use for HTML and HTML Help pages. See the documentation for
95 | # a list of builtin themes.
96 | #
97 | html_theme = 'alabaster'
98 |
99 | # Theme options are theme-specific and customize the look and feel of a theme
100 | # further. For a list of options available for each theme, see the
101 | # documentation.
102 | #
103 | # html_theme_options = {}
104 |
105 | # Add any paths that contain custom static files (such as style sheets) here,
106 | # relative to this directory. They are copied after the builtin static files,
107 | # so a file named "default.css" will overwrite the builtin "default.css".
108 | html_static_path = ['_static']
109 |
110 | # Custom sidebar templates, must be a dictionary that maps document names
111 | # to template names.
112 | #
113 | # The default sidebars (for documents that don't match any pattern) are
114 | # defined by theme itself. Builtin themes are using these templates by
115 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
116 | # 'searchbox.html']``.
117 | #
118 | # html_sidebars = {}
119 |
120 |
121 | # -- Options for HTMLHelp output ---------------------------------------------
122 |
123 | # Output file base name for HTML help builder.
124 | htmlhelp_basename = 'Scrapy-Cookiesdoc'
125 |
126 |
127 | # -- Options for LaTeX output ------------------------------------------------
128 |
129 | latex_elements = {
130 | # The paper size ('letterpaper' or 'a4paper').
131 | #
132 | # 'papersize': 'letterpaper',
133 |
134 | # The font size ('10pt', '11pt' or '12pt').
135 | #
136 | # 'pointsize': '10pt',
137 |
138 | # Additional stuff for the LaTeX preamble.
139 | #
140 | # 'preamble': '',
141 |
142 | # Latex figure (float) alignment
143 | #
144 | # 'figure_align': 'htbp',
145 | }
146 |
147 | # Grouping the document tree into LaTeX files. List of tuples
148 | # (source start file, target name, title,
149 | # author, documentclass [howto, manual, or own class]).
150 | latex_documents = [
151 | (master_doc, 'Scrapy-Cookies.tex', 'Scrapy-Cookies Documentation',
152 | 'Grammy Jiang', 'manual'),
153 | ]
154 |
155 |
156 | # -- Options for manual page output ------------------------------------------
157 |
158 | # One entry per manual page. List of tuples
159 | # (source start file, name, description, authors, manual section).
160 | man_pages = [
161 | (master_doc, 'scrapy-cookies', 'Scrapy-Cookies Documentation',
162 | [author], 1)
163 | ]
164 |
165 |
166 | # -- Options for Texinfo output ----------------------------------------------
167 |
168 | # Grouping the document tree into Texinfo files. List of tuples
169 | # (source start file, target name, title, author,
170 | # dir menu entry, description, category)
171 | texinfo_documents = [
172 | (master_doc, 'Scrapy-Cookies', 'Scrapy-Cookies Documentation',
173 | author, 'Scrapy-Cookies', 'One line description of project.',
174 | 'Miscellaneous'),
175 | ]
176 |
177 |
178 | # -- Options for Epub output -------------------------------------------------
179 |
180 | # Bibliographic Dublin Core info.
181 | epub_title = project
182 | epub_author = author
183 | epub_publisher = author
184 | epub_copyright = copyright
185 |
186 | # The unique identifier of the text. This can be a ISBN number
187 | # or the project homepage.
188 | #
189 | # epub_identifier = ''
190 |
191 | # A unique identification for the text.
192 | #
193 | # epub_uid = ''
194 |
195 | # A list of files that should not be packed into the epub file.
196 | epub_exclude_files = ['search.html']
197 |
198 |
199 | # -- Extension configuration -------------------------------------------------
200 |
201 | # -- Options for intersphinx extension ---------------------------------------
202 |
203 | # Example configuration for intersphinx: refer to the Python standard library.
204 | intersphinx_mapping = {'https://docs.python.org/': None}
205 |
206 | # -- Options for todo extension ----------------------------------------------
207 |
208 | # If true, `todo` and `todoList` produce output, else they produce nothing.
209 | todo_include_todos = True
210 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. _topics-index:
2 |
3 | ======================================
4 | Scrapy-Cookies |version| documentation
5 | ======================================
6 |
7 | This documentation contains everything you need to know about Scrapy-Cookies.
8 |
9 | First steps
10 | ===========
11 |
12 | .. toctree::
13 | :caption: First steps
14 | :hidden:
15 |
16 | intro/overview
17 | intro/installation
18 | intro/tutorial
19 | intro/examples
20 |
21 | :doc:`intro/overview`
22 | Understand what Scrapy-Cookies is and how it can help you.
23 |
24 | :doc:`intro/installation`
25 | Get Scrapy-Cookies installed on your computer.
26 |
27 | :doc:`intro/tutorial`
28 | Write your first project with Scrapy-Cookies.
29 |
30 | :doc:`intro/examples`
31 | Learn more by playing with a pre-made project with Scrapy-Cookies.
32 |
33 | .. _section-basics:
34 |
35 | Basic concepts
36 | ==============
37 |
38 | .. toctree::
39 | :caption: Basic concepts
40 | :hidden:
41 |
42 | topics/cookiesmiddleware
43 | topics/storage
44 | topics/settings
45 |
46 |
47 | :doc:`topics/cookiesmiddleware`
48 | Extract cookies from responses and restore cookies to requests.
49 |
50 | :doc:`topics/storage`
51 | Save, restore and share the cookies.
52 |
53 | :doc:`topics/settings`
54 | Learn how to configure Scrapy-Cookies and see all available settings.
55 |
56 |
57 | .. _extending-scrapy:
58 |
59 | Extending Scrapy-Cookies
60 | ========================
61 |
62 | .. toctree::
63 | :caption: Extending Scrapy-Cookies
64 | :hidden:
65 |
66 | topics/storage
67 |
68 |
69 | :doc:`topics/storage`
70 | Customize how the storage saves, restores and shares the cookies.
71 |
--------------------------------------------------------------------------------
/docs/intro/examples.rst:
--------------------------------------------------------------------------------
1 | .. _intro-examples:
2 |
3 | ========
4 | Examples
5 | ========
6 |
7 | The best way to learn is with examples, and Scrapy-Cookies is no exception. For
8 | this reason, there is an example project with Scrapy-Cookies named grouponbot_,
9 | that you can use to play and learn more about Scrapy-Cookies. It contains one
10 | spider for https://www.groupon.com.au, which only crawls the first page and
11 | saves the cookies.
12 |
13 | The grouponbot_ project is available at:
14 | https://github.com/grammy-jiang/scrapy-enhancement-examples. You can find more
15 | information about it in the project's README.
16 |
17 | If you're familiar with git, you can check out the code. Otherwise you can
18 | download the project as a zip file by clicking
19 | `here `_.
20 |
21 | .. _grouponbot: https://github.com/grammy-jiang/scrapy-enhancement-examples
22 |
--------------------------------------------------------------------------------
/docs/intro/installation.rst:
--------------------------------------------------------------------------------
1 | .. _intro-installation:
2 |
3 | ==================
4 | Installation guide
5 | ==================
6 |
7 | Installing Scrapy-Cookies
8 | ==========================
9 |
10 | Scrapy-Cookies runs on Python 2.7 and Python 3.4 or above under CPython (default
11 | Python implementation) and PyPy (starting with PyPy 5.9).
12 |
13 | You can install Scrapy-Cookies and its dependencies from PyPI with::
14 |
15 | pip install Scrapy-Cookies
16 |
17 | We strongly recommend that you install Scrapy and Scrapy-Cookies in
18 | :ref:`a dedicated virtualenv `, to avoid conflicting
19 | with your system packages.
20 |
21 | For more detailed and platform-specific instructions, read on.
22 |
23 |
24 | Things that are good to know
25 | ----------------------------
26 |
27 | Scrapy-Cookies is written in pure Python and depends on a few key Python
28 | packages (among others):
29 |
30 | * `Scrapy`_, of course
31 | * `PyMongo`_
32 | * `redis-py`_
33 | * `ujson`_
34 |
35 | The minimal versions which Scrapy-Cookies is tested against are:
36 |
37 | * Scrapy 1.5.0
38 |
39 | Scrapy-Cookies may work with older versions of these packages but it is not
40 | guaranteed it will continue working because it’s not being tested against them.
41 |
42 | .. _Scrapy: https://scrapy.org/
43 | .. _PyMongo: http://api.mongodb.com/python/current/
44 | .. _redis-py: https://redis-py.readthedocs.io/en/latest/
45 | .. _ujson: https://github.com/esnme/ultrajson
46 |
47 |
48 | .. _intro-using-virtualenv:
49 |
50 | Using a virtual environment (recommended)
51 | -----------------------------------------
52 |
53 | TL;DR: We recommend installing Scrapy-Cookies inside a virtual environment on
54 | all platforms.
55 |
56 | Python packages can be installed either globally (a.k.a system wide), or in
57 | user-space. We do not recommend installing Scrapy and Scrapy-Cookies
58 | system wide.
59 |
60 | Instead, we recommend that you install Scrapy and Scrapy-Cookies within a
61 | so-called "virtual environment" (`virtualenv`_). Virtualenvs allow you to not
62 | conflict with already-installed Python system packages (which could break some
63 | of your system tools and scripts), and still install packages normally with
64 | ``pip`` (without ``sudo`` and the likes).
65 |
66 | To get started with virtual environments, see
67 | `virtualenv installation instructions`_. To install it globally (having it
68 | globally installed actually helps here), it should be a matter of running::
69 |
70 | $ [sudo] pip install virtualenv
71 |
72 | Check this `user guide`_ on how to create your virtualenv.
73 |
74 | .. note::
75 | If you use Linux or OS X, `virtualenvwrapper`_ is a handy tool to create
76 | virtualenvs.
77 |
78 | Once you have created a virtualenv, you can install Scrapy-Cookies inside it
79 | with ``pip``, just like any other Python package.
80 | (See :ref:`platform-specific guides `
81 | below for non-Python dependencies that you may need to install beforehand).
82 |
83 | Python virtualenvs can be created to use Python 2 by default, or Python 3 by
84 | default.
85 |
86 | * If you want to install Scrapy-Cookies with Python 3, install Scrapy-Cookies
87 | within a Python 3 virtualenv.
88 | * And if you want to install Scrapy-Cookies with Python 2, install
89 | Scrapy-Cookies within a Python 2 virtualenv.
90 |
91 | .. _virtualenv: https://virtualenv.pypa.io
92 | .. _virtualenv installation instructions: https://virtualenv.pypa.io/en/stable/installation/
93 | .. _virtualenvwrapper: https://virtualenvwrapper.readthedocs.io/en/latest/install.html
94 | .. _user guide: https://virtualenv.pypa.io/en/stable/userguide/
95 |
96 |
97 | .. _intro-install-platform-notes:
98 |
99 | Platform specific installation notes
100 | ====================================
101 |
102 | .. _intro-install-windows:
103 |
104 | Windows
105 | -------
106 |
107 | Same as Scrapy.
108 |
109 |
110 | .. _intro-install-ubuntu:
111 |
112 | Ubuntu 14.04 or above
113 | ---------------------
114 |
115 | Same as Scrapy.
116 |
117 |
118 | .. _intro-install-macos:
119 |
120 | Mac OS X
121 | --------
122 |
123 | Same as Scrapy.
124 |
125 |
126 | PyPy
127 | ----
128 |
129 | Same as Scrapy.
130 |
--------------------------------------------------------------------------------
/docs/intro/overview.rst:
--------------------------------------------------------------------------------
1 | .. _intro-overview:
2 |
3 | ==========================
4 | Scrapy-Cookies at a glance
5 | ==========================
6 |
7 | Scrapy-Cookies is a downloader middleware for Scrapy.
8 |
9 | Even though Scrapy-Cookies was originally designed for saving and restoring
10 | cookies (managing the login session), it can also be used to share cookies
11 | between various spider nodes.
12 |
13 |
14 | Walk-through of an example spider
15 | =================================
16 |
17 | In order to show you what Scrapy-Cookies brings to the table, we'll walk you
18 | through an example of a Scrapy project's settings with Scrapy-Cookies using the
19 | simplest way to save and restore the cookies.
20 |
21 | Here's the code for settings that uses the in-memory storage::
22 |
23 | DOWNLOADER_MIDDLEWARES.update({
24 | 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
25 | 'scrapy_cookies.downloadermiddlewares.cookies.CookiesMiddleware': 700,
26 | })
27 |
28 | COOKIES_ENABLED = True
29 |
30 | COOKIES_PERSISTENCE = True
31 | COOKIES_PERSISTENCE_DIR = 'cookies'
32 |
33 | # ------------------------------------------------------------------------------
34 | # IN MEMORY STORAGE
35 | # ------------------------------------------------------------------------------
36 |
37 | COOKIES_STORAGE = 'scrapy_cookies.storage.in_memory.InMemoryStorage'
38 |
39 | Put this in your project's settings, and run your spider.
40 |
41 | When this finishes you will have a ``cookies`` file in the folder ``.scrapy``
42 | under your project folder. The file ``cookies`` is a pickled object containing
43 | the cookies from your spider.
44 |
45 |
46 | What just happened?
47 | -------------------
48 |
49 | When you run your spider, this middleware initializes all objects related to
50 | maintaining cookies.
51 |
52 | The crawl starts sending requests and receiving responses; at the same time this
53 | middleware extracts cookies from the responses and sets them on the requests.
54 |
55 | When the spider stops, this middleware will save the cookies to the path
56 | defined in ``COOKIES_PERSISTENCE_DIR``.
57 |
58 |
59 | .. _topics-whatelse:
60 |
61 | What else?
62 | ==========
63 |
64 | You've seen how to save and restore cookies with Scrapy-Cookies. This
65 | middleware also provides an interface that lets you customize your own cookies
66 | storage, such as:
67 |
68 |
69 | * In-memory storage, ultra-fast to process
70 |
71 | * SQLite storage, ultra-fast when using an in-memory database, and easy to
72 |   read and share with other processes when using an on-disk database
73 |
74 | * Other databases like MongoDB, MySQL, even HBase, to integrate with other
75 |   programmes across your cluster
76 |
77 |
78 | What's next?
79 | ============
80 |
81 | The next steps for you are to
82 | :ref:`install Scrapy-Cookies `,
83 | :ref:`follow through the tutorial ` to learn how to create
84 | a project with Scrapy-Cookies and `join the community`_. Thanks for your
85 | interest!
86 |
87 | .. _join the community: https://scrapy.org/community/
88 |
--------------------------------------------------------------------------------
/docs/intro/tutorial.rst:
--------------------------------------------------------------------------------
1 | .. _intro-tutorial:
2 |
3 | =======================
4 | Scrapy-Cookies Tutorial
5 | =======================
6 |
7 | In this tutorial, we'll assume that Scrapy-Cookies is already installed on your
8 | system. If that's not the case, see :ref:`intro-installation`.
9 |
10 | This tutorial will walk you through these tasks:
11 |
12 | 1. Use various storage classes in this middleware
13 | 2. Save cookies on disk
14 |
15 |
16 | Use various storage classes in this middleware
17 | ==============================================
18 |
19 | Before you start scraping, just put the following code into your settings.py::
20 |
21 | DOWNLOADER_MIDDLEWARES.update({
22 | 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
23 | 'scrapy_cookies.downloadermiddlewares.cookies.CookiesMiddleware': 700,
24 | })
25 |
26 | With the default settings of this middleware, an in-memory storage will be used.
27 |
28 | There is a storage named SQLiteStorage. If you want to use it instead of the
29 | in-memory one, simply put the following code below the previous one::
30 |
31 | COOKIES_STORAGE = 'scrapy_cookies.storage.sqlite.SQLiteStorage'
32 | COOKIES_SQLITE_DATABASE = ':memory:'
33 |
34 | There are other storage classes provided with this middleware, please refer to
35 | :ref:`topics-storage`.
36 |
37 | When you implement your own storage, you can point ``COOKIES_STORAGE`` to your
38 | own class.
39 |
40 |
41 | Save cookies and restore in your next run
42 | =========================================
43 |
44 | By default this middleware does not save the cookies. When you need to keep
45 | the cookies for further use, for example a login cookie, you may wish to save
46 | the cookies on disk for the next run.
47 |
48 | This middleware provides this ability with one setting::
49 |
50 | COOKIES_PERSISTENCE = True
51 |
52 | Most of the time the file holding the saved cookies is named ``cookies`` under
53 | the folder ``.scrapy``. If you want to change it, use this setting::
54 |
55 | COOKIES_PERSISTENCE_DIR = 'your-cookies-path'
56 |
57 | After these settings, this middleware will load the previously saved cookies in
58 | the next run.
59 |
60 | .. note:: Please keep the storage class the same when you want to save the
61 | cookies and restore them. The cookies persistence file is not compatible
62 | between different storage classes.
63 |
64 | .. note:: This feature depends on the storage class used.
65 |
66 | Next steps
67 | ==========
68 |
69 | This tutorial covered only the basics of Scrapy-Cookies, but there are a lot of
70 | other features not mentioned here. Check the :ref:`topics-whatelse` section in
71 | :ref:`intro-overview` chapter for a quick overview of the most important ones.
72 |
73 | You can continue from the section :ref:`section-basics` to know more about this
74 | middleware, storage and other things this tutorial hasn't covered. If you prefer
75 | to play with an example project, check the :ref:`intro-examples` section.
76 |
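77 | For quick reference, the snippets above can be combined in your project's
78 | ``settings.py``. The concrete values below (the SQLite storage and the
79 | ``cookies`` path) are just the examples used in this tutorial, not required
80 | defaults::
81 |
82 |     DOWNLOADER_MIDDLEWARES.update({
83 |         'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
84 |         'scrapy_cookies.downloadermiddlewares.cookies.CookiesMiddleware': 700,
85 |     })
86 |
87 |     # Use the SQLite storage instead of the default in-memory one
88 |     COOKIES_STORAGE = 'scrapy_cookies.storage.sqlite.SQLiteStorage'
89 |     COOKIES_SQLITE_DATABASE = ':memory:'
90 |
91 |     # Keep the cookies for the next run
92 |     COOKIES_PERSISTENCE = True
93 |     COOKIES_PERSISTENCE_DIR = 'cookies'
94 |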
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=Scrapy-Cookies
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | Sphinx==3.2.1
2 | sphinx_rtd_theme
3 |
--------------------------------------------------------------------------------
/docs/topics/cookiesmiddleware.rst:
--------------------------------------------------------------------------------
1 | .. _topics-cookiesmiddleware:
2 |
3 | =================
4 | CookiesMiddleware
5 | =================
6 |
7 | This is the downloader middleware to inject cookies into requests and extract
8 | cookies from responses.
9 |
10 | This middleware mostly inherits from the one in Scrapy, which implements the
11 | `downloader middleware`_ interface. With minimal changes, it now
12 | supports any storage class which implements a certain interface (actually
13 | MutableMapping_).
14 |
15 | .. _downloader middleware: https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
16 | .. _MutableMapping: https://docs.python.org/3/library/collections.abc.html#collections.abc.MutableMapping
17 |
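18 | For illustration only, here is a minimal sketch of a storage class built on
19 | MutableMapping_. It is a hypothetical example, not a class shipped with this
20 | package, and the constructor signature is an assumption::
21 |
22 |     from collections.abc import MutableMapping
23 |
24 |     class DictStorage(MutableMapping):
25 |         """A hypothetical storage keeping cookie jars in a plain dict."""
26 |
27 |         def __init__(self, settings):
28 |             # settings is assumed to be the crawler settings object
29 |             self.jars = {}
30 |
31 |         def __getitem__(self, key):
32 |             return self.jars[key]
33 |
34 |         def __setitem__(self, key, value):
35 |             self.jars[key] = value
36 |
37 |         def __delitem__(self, key):
38 |             del self.jars[key]
39 |
40 |         def __iter__(self):
41 |             return iter(self.jars)
42 |
43 |         def __len__(self):
44 |             return len(self.jars)
45 |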
--------------------------------------------------------------------------------
/docs/topics/settings.rst:
--------------------------------------------------------------------------------
1 | .. _topic-settings:
2 |
3 | ========
4 | Settings
5 | ========
6 |
7 | The default settings of this middleware keep the same behaviour as the built-in
8 | cookies middleware in Scrapy.
9 |
10 | As an enhancement, the following settings are added by this middleware:
11 |
12 | .. setting:: COOKIES_PERSISTENCE
13 |
14 | COOKIES_PERSISTENCE
15 | ~~~~~~~~~~~~~~~~~~~
16 |
17 | Default: ``False``
18 |
19 | Whether this cookies middleware saves the cookies to disk. If disabled, no
20 | cookies will be saved to disk.
21 | 
22 | Notice that this setting only takes effect when the storage uses memory as the
23 | cookies container.
24 |
25 | .. setting:: COOKIES_PERSISTENCE_DIR
26 |
27 | COOKIES_PERSISTENCE_DIR
28 | ~~~~~~~~~~~~~~~~~~~~~~~
29 |
30 | Default: ``cookies``
31 |
32 | When ``COOKIES_PERSISTENCE`` is True, a storage which uses memory as the cookies
33 | container will save the cookies in the file ``cookies`` under the folder
34 | ``.scrapy`` in your project. A storage which does not use memory as the cookies
35 | container is not affected by this setting.
36 |
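   | For example, to persist the in-memory cookies into a file named ``my-cookies``
   | under the project's ``.scrapy`` folder (a sketch; the file name is an arbitrary
   | illustration)::
   | 
   |     COOKIES_PERSISTENCE = True
   |     COOKIES_PERSISTENCE_DIR = 'my-cookies'
   | 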
37 | .. setting:: COOKIES_STORAGE
38 |
39 | COOKIES_STORAGE
40 | ~~~~~~~~~~~~~~~
41 |
42 | Default: ``scrapy_cookies.storage.in_memory.InMemoryStorage``
43 |
44 | With this setting, the storage class can be specified. There are some storage
45 | classes provided with this middleware by default:
46 | 
47 | * :ref:`InMemoryStorage <storage-inmemory>`
48 | * :ref:`SQLiteStorage <storage-sqlite>`
49 | * :ref:`MongoStorage <storage-mongo>`
   | * :ref:`RedisStorage <storage-redis>`
50 |
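   | For example, to switch from the default in-memory storage to the SQLite-backed
   | one (a minimal sketch; every other setting keeps its default)::
   | 
   |     COOKIES_STORAGE = 'scrapy_cookies.storage.sqlite.SQLiteStorage'
   | 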
51 | .. setting:: COOKIES_MONGO_MONGOCLIENT_HOST
52 |
53 | COOKIES_MONGO_MONGOCLIENT_HOST
54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55 |
56 | Default: ``localhost``
57 |
58 | Hostname or IP address or Unix domain socket path of a single mongod or mongos
59 | instance to connect to, or a mongodb URI, or a list of hostnames / mongodb URIs.
60 | If host is an IPv6 literal it must be enclosed in ``[`` and ``]`` characters
61 | following the RFC2732 URL syntax (e.g. ``[::1]`` for localhost). Multihomed and
62 | round robin DNS addresses are not supported.
63 |
64 | Please refer to mongo_client_.
65 |
66 | .. setting:: COOKIES_MONGO_MONGOCLIENT_PORT
67 |
68 | COOKIES_MONGO_MONGOCLIENT_PORT
69 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
70 |
71 | Default: ``27017``
72 |
73 | Port number on which to connect.
74 |
75 | Please refer to mongo_client_.
76 |
77 | .. setting:: COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS
78 |
79 | COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS
80 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
81 |
82 | Default: ``dict``
83 |
84 | Default class to use for documents returned from queries on this client.
85 |
86 | Please refer to mongo_client_.
87 |
88 | .. setting:: COOKIES_MONGO_MONGOCLIENT_TZ_AWARE
89 |
90 | COOKIES_MONGO_MONGOCLIENT_TZ_AWARE
91 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
92 |
93 | Default: ``False``
94 |
95 | If True, datetime instances returned as values in a document by this MongoClient
96 | will be timezone aware (otherwise they will be naive).
97 |
98 | Please refer to mongo_client_.
99 |
100 | .. setting:: COOKIES_MONGO_MONGOCLIENT_CONNECT
101 |
102 | COOKIES_MONGO_MONGOCLIENT_CONNECT
103 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
104 |
105 | Default: ``True``
106 |
107 | If True (the default), immediately begin connecting to MongoDB in the
108 | background. Otherwise connect on the first operation.
109 |
110 | Please refer to mongo_client_.
111 |
112 | .. setting:: COOKIES_MONGO_MONGOCLIENT_KWARGS
113 |
114 | COOKIES_MONGO_MONGOCLIENT_KWARGS
115 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
116 |
117 | Please refer to mongo_client_.
118 |
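   | For example, extra keyword arguments for ``MongoClient`` such as credentials can
   | be supplied as a dict (a sketch; the values are placeholders)::
   | 
   |     COOKIES_MONGO_MONGOCLIENT_KWARGS = {
   |         'username': 'username',
   |         'password': 'password',
   |         'authSource': 'admin',
   |     }
   | 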
119 | .. setting:: COOKIES_MONGO_DATABASE
120 |
121 | COOKIES_MONGO_DATABASE
122 | ~~~~~~~~~~~~~~~~~~~~~~
123 |
124 | Default: ``cookies``
125 |
126 | The name of the database - a string. If None (the default) the database named in
127 | the MongoDB connection URI is returned.
128 |
129 | Please refer to get_database_.
130 |
131 | .. setting:: COOKIES_MONGO_COLLECTION
132 |
133 | COOKIES_MONGO_COLLECTION
134 | ~~~~~~~~~~~~~~~~~~~~~~~~
135 |
136 | Default: ``cookies``
137 |
138 | The name of the collection - a string.
139 |
140 | Please refer to get_collection_.
141 |
142 |
143 | .. _mongo_client: http://api.mongodb.com/python/current/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient
144 | .. _get_database: http://api.mongodb.com/python/current/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient.get_database
145 | .. _get_collection: http://api.mongodb.com/python/current/api/pymongo/database.html#pymongo.database.Database.get_collection
146 |
147 |
148 | .. setting:: COOKIES_REDIS_HOST
149 |
150 | COOKIES_REDIS_HOST
151 | ~~~~~~~~~~~~~~~~~~
152 |
153 | Please refer to `redis-py's documentation`_.
154 |
155 | .. setting:: COOKIES_REDIS_PORT
156 |
157 | COOKIES_REDIS_PORT
158 | ~~~~~~~~~~~~~~~~~~
159 |
160 | Please refer to `redis-py's documentation`_.
161 |
162 | .. setting:: COOKIES_REDIS_DB
163 |
164 | COOKIES_REDIS_DB
165 | ~~~~~~~~~~~~~~~~
166 |
167 | Please refer to `redis-py's documentation`_.
168 |
169 | .. setting:: COOKIES_REDIS_PASSWORD
170 |
171 | COOKIES_REDIS_PASSWORD
172 | ~~~~~~~~~~~~~~~~~~~~~~
173 |
174 | Please refer to `redis-py's documentation`_.
175 |
176 | .. setting:: COOKIES_REDIS_SOCKET_TIMEOUT
177 |
178 | COOKIES_REDIS_SOCKET_TIMEOUT
179 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
180 |
181 | Please refer to `redis-py's documentation`_.
182 |
183 | .. setting:: COOKIES_REDIS_SOCKET_CONNECT_TIMEOUT
184 |
185 | COOKIES_REDIS_SOCKET_CONNECT_TIMEOUT
186 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
187 |
188 | Please refer to `redis-py's documentation`_.
189 |
190 | .. setting:: COOKIES_REDIS_SOCKET_KEEPALIVE
191 |
192 | COOKIES_REDIS_SOCKET_KEEPALIVE
193 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
194 |
195 | Please refer to `redis-py's documentation`_.
196 |
197 | .. setting:: COOKIES_REDIS_SOCKET_KEEPALIVE_OPTIONS
198 |
199 | COOKIES_REDIS_SOCKET_KEEPALIVE_OPTIONS
200 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201 |
202 | Please refer to `redis-py's documentation`_.
203 |
204 | .. setting:: COOKIES_REDIS_CONNECTION_POOL
205 |
206 | COOKIES_REDIS_CONNECTION_POOL
207 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
208 |
209 | Please refer to `redis-py's documentation`_.
210 |
211 | .. setting:: COOKIES_REDIS_UNIX_SOCKET_PATH
212 |
213 | COOKIES_REDIS_UNIX_SOCKET_PATH
214 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
215 |
216 | Please refer to `redis-py's documentation`_.
217 |
218 | .. setting:: COOKIES_REDIS_ENCODING
219 |
220 | COOKIES_REDIS_ENCODING
221 | ~~~~~~~~~~~~~~~~~~~~~~
222 |
223 | Please refer to `redis-py's documentation`_.
224 |
225 | .. setting:: COOKIES_REDIS_ENCODING_ERRORS
226 |
227 | COOKIES_REDIS_ENCODING_ERRORS
228 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
229 |
230 | Please refer to `redis-py's documentation`_.
231 |
232 | .. setting:: COOKIES_REDIS_CHARSET
233 |
234 | COOKIES_REDIS_CHARSET
235 | ~~~~~~~~~~~~~~~~~~~~~
236 |
237 | Please refer to `redis-py's documentation`_.
238 |
239 | .. setting:: COOKIES_REDIS_ERRORS
240 |
241 | COOKIES_REDIS_ERRORS
242 | ~~~~~~~~~~~~~~~~~~~~
243 |
244 | Please refer to `redis-py's documentation`_.
245 |
246 | .. setting:: COOKIES_REDIS_DECODE_RESPONSES
247 |
248 | COOKIES_REDIS_DECODE_RESPONSES
249 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
250 |
251 | Please refer to `redis-py's documentation`_.
252 |
253 | .. setting:: COOKIES_REDIS_RETRY_ON_TIMEOUT
254 |
255 | COOKIES_REDIS_RETRY_ON_TIMEOUT
256 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
257 |
258 | Please refer to `redis-py's documentation`_.
259 |
260 | .. setting:: COOKIES_REDIS_SSL
261 |
262 | COOKIES_REDIS_SSL
263 | ~~~~~~~~~~~~~~~~~
264 |
265 | Please refer to `redis-py's documentation`_.
266 |
267 | .. setting:: COOKIES_REDIS_SSL_KEYFILE
268 |
269 | COOKIES_REDIS_SSL_KEYFILE
270 | ~~~~~~~~~~~~~~~~~~~~~~~~~
271 |
272 | Please refer to `redis-py's documentation`_.
273 |
274 | .. setting:: COOKIES_REDIS_SSL_CERTFILE
275 |
276 | COOKIES_REDIS_SSL_CERTFILE
277 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
278 |
279 | Please refer to `redis-py's documentation`_.
280 |
281 | .. setting:: COOKIES_REDIS_SSL_CERT_REQS
282 |
283 | COOKIES_REDIS_SSL_CERT_REQS
284 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
285 |
286 | Please refer to `redis-py's documentation`_.
287 |
288 | .. setting:: COOKIES_REDIS_SSL_CA_CERTS
289 |
290 | COOKIES_REDIS_SSL_CA_CERTS
291 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
292 |
293 | Please refer to `redis-py's documentation`_.
294 |
295 | .. setting:: COOKIES_REDIS_MAX_CONNECTIONS
296 |
297 | COOKIES_REDIS_MAX_CONNECTIONS
298 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
299 |
300 | Please refer to `redis-py's documentation`_.
301 |
302 | .. _redis-py's documentation: https://redis-py.readthedocs.io/en/latest/
303 |
--------------------------------------------------------------------------------
/docs/topics/storage.rst:
--------------------------------------------------------------------------------
1 | .. _topics-storage:
2 |
3 | =======
4 | Storage
5 | =======
6 |
7 | A storage class is any class implementing the MutableMapping_ interface. There
8 | are some storage classes provided with this middleware:
9 |
10 | .. _MutableMapping: https://docs.python.org/3/library/collections.abc.html#collections.abc.MutableMapping
11 |
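   | As a minimal sketch (an illustration only, not one of the bundled classes), a
   | custom storage can subclass ``scrapy_cookies.storage.BaseStorage`` and back the
   | mapping with a plain dict::
   | 
   |     from scrapy.http.cookies import CookieJar
   | 
   |     from scrapy_cookies.storage import BaseStorage
   | 
   | 
   |     class DictStorage(BaseStorage):
   |         """Hypothetical storage keeping cookie jars in a plain dict."""
   | 
   |         def __init__(self, settings):
   |             super().__init__(settings)
   |             self._jars = {}
   | 
   |         def __getitem__(self, k):
   |             # create a fresh CookieJar on first access, like the bundled storages
   |             return self._jars.setdefault(k, CookieJar())
   | 
   |         def __setitem__(self, k, v):
   |             self._jars[k] = v
   | 
   |         def __delitem__(self, k):
   |             del self._jars[k]
   | 
   |         def __iter__(self):
   |             return iter(self._jars)
   | 
   |         def __len__(self):
   |             return len(self._jars)
   | 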
12 | .. _storage-inmemory:
13 |
14 | InMemoryStorage
15 | ---------------
16 |
17 | .. module:: scrapy_cookies.storage.in_memory
18 | :synopsis: In Memory Storage
19 |
20 | .. class:: InMemoryStorage
21 |
22 | This storage keeps cookies in memory, providing ultra-fast cookie read and
23 | write performance.
24 |
25 | .. _storage-sqlite:
26 |
27 | SQLiteStorage
28 | -------------
29 |
30 | .. module:: scrapy_cookies.storage.sqlite
31 | :synopsis: SQLite Storage
32 |
33 | .. class:: SQLiteStorage
34 |
35 | This storage keeps cookies in SQLite, which is already supported by Python's
36 | standard library.
37 |
38 | The following settings can be used to configure this storage:
39 |
40 | * |COOKIES_SQLITE_DATABASE|_
41 |
42 | .. |COOKIES_SQLITE_DATABASE| replace:: ``COOKIES_SQLITE_DATABASE``
43 | .. _COOKIES_SQLITE_DATABASE: https://docs.python.org/3/library/sqlite3.html#sqlite3.connect
44 |
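   | A minimal example of enabling this storage in ``settings.py`` (a sketch; the
   | file name is an arbitrary illustration)::
   | 
   |     COOKIES_STORAGE = 'scrapy_cookies.storage.sqlite.SQLiteStorage'
   |     # ':memory:' (the default) keeps the database in memory;
   |     # a file path stores it on disk instead
   |     COOKIES_SQLITE_DATABASE = 'cookies.sqlite'
   | 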
45 | .. _storage-mongo:
46 |
47 | MongoStorage
48 | ------------
49 |
50 | .. module:: scrapy_cookies.storage.mongo
51 | :synopsis: Mongo Storage
52 |
53 | .. class:: MongoStorage
54 |
55 | This storage enables keeping cookies in MongoDB.
56 |
57 | The following settings can be used to configure this storage:
58 |
59 | * :setting:`COOKIES_MONGO_MONGOCLIENT_HOST`
60 | * :setting:`COOKIES_MONGO_MONGOCLIENT_PORT`
61 | * :setting:`COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS`
62 | * :setting:`COOKIES_MONGO_MONGOCLIENT_TZ_AWARE`
63 | * :setting:`COOKIES_MONGO_MONGOCLIENT_CONNECT`
64 | * :setting:`COOKIES_MONGO_MONGOCLIENT_KWARGS`
65 | * :setting:`COOKIES_MONGO_DATABASE`
66 | * :setting:`COOKIES_MONGO_COLLECTION`
67 |
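   | A minimal example of enabling this storage in ``settings.py`` (a sketch; the
   | host, database and collection names are placeholders for your own deployment)::
   | 
   |     COOKIES_STORAGE = 'scrapy_cookies.storage.mongo.MongoStorage'
   |     COOKIES_MONGO_MONGOCLIENT_HOST = 'localhost'
   |     COOKIES_MONGO_MONGOCLIENT_PORT = 27017
   |     COOKIES_MONGO_DATABASE = 'cookies'
   |     COOKIES_MONGO_COLLECTION = 'cookies'
   | 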
68 | .. _storage-redis:
69 |
70 | RedisStorage
71 | ------------
72 |
73 | .. module:: scrapy_cookies.storage.redis_
74 | :synopsis: Redis Storage
75 |
76 | .. class:: RedisStorage
77 |
78 | This storage enables keeping cookies in Redis.
79 |
80 | The following settings can be used to configure this storage:
81 |
82 | * :setting:`COOKIES_REDIS_HOST`
83 | * :setting:`COOKIES_REDIS_PORT`
84 | * :setting:`COOKIES_REDIS_DB`
85 | * :setting:`COOKIES_REDIS_PASSWORD`
86 | * :setting:`COOKIES_REDIS_SOCKET_TIMEOUT`
87 | * :setting:`COOKIES_REDIS_SOCKET_CONNECT_TIMEOUT`
88 | * :setting:`COOKIES_REDIS_SOCKET_KEEPALIVE`
89 | * :setting:`COOKIES_REDIS_SOCKET_KEEPALIVE_OPTIONS`
90 | * :setting:`COOKIES_REDIS_CONNECTION_POOL`
91 | * :setting:`COOKIES_REDIS_UNIX_SOCKET_PATH`
92 | * :setting:`COOKIES_REDIS_ENCODING`
93 | * :setting:`COOKIES_REDIS_ENCODING_ERRORS`
94 | * :setting:`COOKIES_REDIS_CHARSET`
95 | * :setting:`COOKIES_REDIS_ERRORS`
96 | * :setting:`COOKIES_REDIS_DECODE_RESPONSES`
97 | * :setting:`COOKIES_REDIS_RETRY_ON_TIMEOUT`
98 | * :setting:`COOKIES_REDIS_SSL`
99 | * :setting:`COOKIES_REDIS_SSL_KEYFILE`
100 | * :setting:`COOKIES_REDIS_SSL_CERTFILE`
101 | * :setting:`COOKIES_REDIS_SSL_CERT_REQS`
102 | * :setting:`COOKIES_REDIS_SSL_CA_CERTS`
103 | * :setting:`COOKIES_REDIS_MAX_CONNECTIONS`
104 |
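   | A minimal example of enabling this storage in ``settings.py`` (a sketch; the
   | connection values are placeholders, and note that the class path follows the
   | module file ``redis_.py``)::
   | 
   |     COOKIES_STORAGE = 'scrapy_cookies.storage.redis_.RedisStorage'
   |     COOKIES_REDIS_HOST = 'localhost'
   |     COOKIES_REDIS_PORT = 6379
   |     COOKIES_REDIS_DB = 0
   | 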
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts =
3 | --cov=scrapy_cookies
4 | --cov-report=html
5 | --cov-report=term
6 | --docker-compose=tests/test_storages/docker-compose.yml
7 | --docker-compose-remove-volumes
8 | testpaths = tests
9 |
--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": [
3 | "config:base"
4 | ]
5 | }
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | hiredis
2 | pymongo
3 | redis
4 | scrapy
5 | six
6 | ujson
7 |
--------------------------------------------------------------------------------
/scrapy_cookies/VERSION:
--------------------------------------------------------------------------------
1 | 0.3
2 |
--------------------------------------------------------------------------------
/scrapy_cookies/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Scrapy-Cookies - A middleware of cookies persistence for Scrapy
3 | """
4 |
5 | __all__ = ["__version__", "version_info"]
6 |
7 | # Scrapy-Cookies version
8 | import pkgutil
9 |
10 | __version__ = pkgutil.get_data(__package__, "VERSION").decode("ascii").strip()
11 | version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split("."))
12 | del pkgutil
13 |
--------------------------------------------------------------------------------
/scrapy_cookies/downloadermiddlewares/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapedia/scrapy-cookies/342eaada3b84db4971be09862c34db9f207c0fb7/scrapy_cookies/downloadermiddlewares/__init__.py
--------------------------------------------------------------------------------
/scrapy_cookies/downloadermiddlewares/cookies.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from http.cookiejar import Cookie
3 | from typing import Dict, List
4 |
5 | from scrapy.crawler import Crawler
6 | from scrapy.exceptions import NotConfigured
7 | from scrapy.http import Request, Response
8 | from scrapy.http.cookies import CookieJar
9 | from scrapy.settings import SETTINGS_PRIORITIES, Settings
10 | from scrapy.signals import spider_closed, spider_opened
11 | from scrapy.spiders import Spider
12 | from scrapy.utils.misc import load_object
13 | try:
14 | from scrapy.utils.python import to_native_str
15 | except ImportError:
16 | # to_native_str is deprecated since version 2.8
17 | # https://docs.scrapy.org/en/2.8/news.html#deprecation-removals
18 | from scrapy.utils.python import to_unicode as to_native_str
19 |
20 | from scrapy_cookies.settings import default_settings, unfreeze_settings
21 |
22 | logger = logging.getLogger(__name__)
23 |
24 |
25 | def format_cookie(cookie: Dict) -> str:
26 | # build cookie string
27 | cookie_str: str = "{}={}".format(cookie["name"], cookie["value"])
28 |
29 | if cookie.get("path", None):
30 | cookie_str += "; Path={}".format(cookie["path"])
31 | if cookie.get("domain", None):
32 | cookie_str += "; Domain={}".format(cookie["domain"])
33 |
34 | return cookie_str
35 |
36 |
37 | def get_request_cookies(jar: CookieJar, request: Request) -> List[Cookie]:
38 | if isinstance(request.cookies, dict):
39 | cookie_list: List[Dict] = [
40 | {"name": k, "value": v} for k, v in request.cookies.items()
41 | ]
42 | else:
43 | cookie_list: List[Dict] = request.cookies
44 |
45 | cookies: List[str] = [format_cookie(x) for x in cookie_list]
46 | headers: Dict[str, List[str]] = {"Set-Cookie": cookies}
47 | response: Response = Response(request.url, headers=headers)
48 |
49 | return jar.make_cookies(response, request)
50 |
51 |
52 | class CookiesMiddleware:
53 | """This middleware enables working with sites that need cookies"""
54 |
55 | def __init__(self, settings: Settings):
56 | self.settings: Settings = settings
57 | self.jars = load_object(settings["COOKIES_STORAGE"]).from_middleware(self)
58 | self.debug: bool = settings["COOKIES_DEBUG"]
59 |
60 | @classmethod
61 | def from_crawler(cls, crawler: Crawler):
62 | with unfreeze_settings(crawler.settings) as settings:
63 | settings.setmodule(
64 | module=default_settings, priority=SETTINGS_PRIORITIES["default"]
65 | )
66 | if not crawler.settings.getbool("COOKIES_ENABLED"):
67 | raise NotConfigured
68 | obj = cls(crawler.settings)
69 | crawler.signals.connect(obj.spider_opened, signal=spider_opened)
70 | crawler.signals.connect(obj.spider_closed, signal=spider_closed)
71 | return obj
72 |
73 | def spider_opened(self, spider: Spider):
74 | logger.info(
75 | "%s is used as the cookies storage.", self.settings["COOKIES_STORAGE"]
76 | )
77 | self.jars.open_spider(spider)
78 |
79 | def spider_closed(self, spider: Spider):
80 | self.jars.close_spider(spider)
81 |
82 | def process_request(self, request: Request, spider: Spider) -> None:
83 | if request.meta.get("dont_merge_cookies", False):
84 | return
85 |
86 | cookiejar_key = request.meta.get("cookiejar")
87 | jar: CookieJar = self.jars[cookiejar_key]
88 | cookies: List[Cookie] = get_request_cookies(jar, request)
89 | for cookie in cookies:
90 | jar.set_cookie_if_ok(cookie, request)
91 | self.jars[cookiejar_key] = jar
92 |
93 | # set Cookie header
94 | request.headers.pop("Cookie", None)
95 | jar.add_cookie_header(request)
96 | self._debug_cookie(request, spider)
97 |
98 | def process_response(
99 | self, request: Request, response: Response, spider: Spider
100 | ) -> Response:
101 | if request.meta.get("dont_merge_cookies", False):
102 | return response
103 |
104 | # extract cookies from Set-Cookie and drop invalid/expired cookies
105 | cookiejar_key = request.meta.get("cookiejar")
106 | jar: CookieJar = self.jars[cookiejar_key]
107 | jar.extract_cookies(response, request)
108 | self.jars[cookiejar_key] = jar
109 | self._debug_set_cookie(response, spider)
110 |
111 | return response
112 |
113 | def _debug_cookie(self, request: Request, spider: Spider):
114 | if self.debug:
115 | cl = [
116 | to_native_str(c, errors="replace")
117 | for c in request.headers.getlist("Cookie")
118 | ]
119 | if cl:
120 | cookies: str = "\n".join("Cookie: {}\n".format(c) for c in cl)
121 | msg: str = "Sending cookies to: {}\n{}".format(request, cookies)
122 | logger.debug(msg, extra={"spider": spider})
123 |
124 | def _debug_set_cookie(self, response: Response, spider: Spider):
125 | if self.debug:
126 | cl = [
127 | to_native_str(c, errors="replace")
128 | for c in response.headers.getlist("Set-Cookie")
129 | ]
130 | if cl:
131 | cookies: str = "\n".join("Set-Cookie: {}\n".format(c) for c in cl)
132 | msg: str = "Received cookies from: {}\n{}".format(response, cookies)
133 | logger.debug(msg, extra={"spider": spider})
134 |
--------------------------------------------------------------------------------
/scrapy_cookies/settings/__init__.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 |
3 |
4 | @contextmanager
5 | def unfreeze_settings(settings):
6 | original_status = settings.frozen
7 | settings.frozen = False
8 | try:
9 | yield settings
10 | finally:
11 | settings.frozen = original_status
12 |
--------------------------------------------------------------------------------
/scrapy_cookies/settings/default_settings.py:
--------------------------------------------------------------------------------
1 | COOKIES_ENABLED = True
2 | COOKIES_DEBUG = False
3 |
4 | COOKIES_PERSISTENCE = False
5 | COOKIES_PERSISTENCE_DIR = "cookies"
6 |
7 | # ------------------------------------------------------------------------------
8 | # IN MEMORY STORAGE
9 | # ------------------------------------------------------------------------------
10 |
11 | COOKIES_STORAGE = "scrapy_cookies.storage.in_memory.InMemoryStorage"
12 |
13 | # ------------------------------------------------------------------------------
14 | # SQLITE STORAGE
15 | # ------------------------------------------------------------------------------
16 |
17 | # COOKIES_STORAGE = 'scrapy_cookies.storage.sqlite.SQLiteStorage'
18 | COOKIES_SQLITE_DATABASE = ":memory:"
19 |
20 | # ------------------------------------------------------------------------------
21 | # MONGODB
22 | # ------------------------------------------------------------------------------
23 |
24 | # http://api.mongodb.com/python/current/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient
25 |
26 | # COOKIES_STORAGE = 'scrapy_cookies.storage.mongo.MongoStorage'
27 | COOKIES_MONGO_MONGOCLIENT_HOST = "localhost"
28 | COOKIES_MONGO_MONGOCLIENT_PORT = 27017
29 | COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS = dict
30 | COOKIES_MONGO_MONGOCLIENT_TZ_AWARE = False
31 | COOKIES_MONGO_MONGOCLIENT_CONNECT = True
32 |
33 | COOKIES_MONGO_MONGOCLIENT_KWARGS = {
34 | # 'username': 'username',
35 | # 'password': 'password',
36 | # 'authSource': 'admin',
37 | # 'authMechanism': 'SCRAM-SHA-1',
38 | }
39 |
40 | COOKIES_MONGO_DATABASE = "cookies"
41 | # or
42 | # COOKIES_MONGO_DATABASE = {
43 | # 'name': 'cookies',
44 | # 'codec_options': None,
45 | # 'read_preference': None,
46 | # 'write_concern': None,
47 | # 'read_concern': None
48 | # }
49 |
50 | COOKIES_MONGO_COLLECTION = "cookies"
51 | # or
52 | # COOKIES_MONGO_COLLECTION = {
53 | # 'name': 'cookies',
54 | # 'codec_options': None,
55 | # 'read_preference': None,
56 | # 'write_concern': None,
57 | # 'read_concern': None
58 | # }
59 |
60 | # ------------------------------------------------------------------------------
61 | # REDIS STORAGE
62 | # ------------------------------------------------------------------------------
63 |
64 | # COOKIES_STORAGE = 'scrapy_cookies.storage.redis_.RedisStorage'
65 | COOKIES_REDIS_HOST = "localhost"
66 | COOKIES_REDIS_PORT = 6379
67 | COOKIES_REDIS_DB = 0
68 | COOKIES_REDIS_PASSWORD = None
69 | COOKIES_REDIS_SOCKET_TIMEOUT = None
70 | COOKIES_REDIS_SOCKET_CONNECT_TIMEOUT = None
71 | COOKIES_REDIS_SOCKET_KEEPALIVE = None
72 | COOKIES_REDIS_SOCKET_KEEPALIVE_OPTIONS = None
73 | COOKIES_REDIS_CONNECTION_POOL = None
74 | COOKIES_REDIS_UNIX_SOCKET_PATH = None
75 | COOKIES_REDIS_ENCODING = "utf-8"
76 | COOKIES_REDIS_ENCODING_ERRORS = "strict"
77 | COOKIES_REDIS_CHARSET = None
78 | COOKIES_REDIS_ERRORS = None
79 | COOKIES_REDIS_DECODE_RESPONSES = False
80 | COOKIES_REDIS_RETRY_ON_TIMEOUT = False
81 | COOKIES_REDIS_SSL = False
82 | COOKIES_REDIS_SSL_KEYFILE = None
83 | COOKIES_REDIS_SSL_CERTFILE = None
84 | COOKIES_REDIS_SSL_CERT_REQS = None
85 | COOKIES_REDIS_SSL_CA_CERTS = None
86 | COOKIES_REDIS_MAX_CONNECTIONS = None
87 |
--------------------------------------------------------------------------------
/scrapy_cookies/signals.py:
--------------------------------------------------------------------------------
1 | """
2 | Scrapy-Cookies signals
3 |
4 | These signals are documented in docs/topics/signals.rst. Please don't add new
5 | signals here without documenting them there.
6 | """
7 |
8 | cookies_invalidated = object()
9 |
--------------------------------------------------------------------------------
/scrapy_cookies/storage/__init__.py:
--------------------------------------------------------------------------------
1 | from collections.abc import MutableMapping
2 |
3 | from scrapy.settings import Settings
4 | from scrapy.spiders import Spider
5 |
6 | from scrapy_cookies.downloadermiddlewares.cookies import CookiesMiddleware
7 |
8 |
9 | class BaseStorage(MutableMapping):
10 | name = None
11 |
12 | def __init__(self, settings: Settings):
13 | self.settings: Settings = settings
14 |
15 | @classmethod
16 | def from_middleware(cls, middleware: CookiesMiddleware):
17 | obj = cls(middleware.settings)
18 | return obj
19 |
20 | def open_spider(self, spider: Spider):
21 | pass
22 |
23 | def close_spider(self, spider: Spider):
24 | pass
25 |
26 | def __delitem__(self, v):
27 | pass
28 |
29 | def __getitem__(self, k):
30 | pass
31 |
32 | def __iter__(self):
33 | pass
34 |
35 | def __len__(self):
36 | pass
37 |
38 | def __setitem__(self, k, v):
39 | pass
40 |
--------------------------------------------------------------------------------
/scrapy_cookies/storage/in_memory.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import os
4 | import pickle
5 | from collections import UserDict
6 | from typing import Dict
7 |
8 | from scrapy.http.cookies import CookieJar
9 | from scrapy.settings import Settings
10 | from scrapy.spiders import Spider
11 | from scrapy.utils.project import data_path
12 |
13 | from scrapy_cookies.storage import BaseStorage
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | class InMemoryStorage(UserDict, BaseStorage):
19 | def __init__(self, settings: Settings):
20 | super(InMemoryStorage, self).__init__()
21 | self.settings: Settings = settings
22 | self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"])
23 |
24 | def open_spider(self, spider: Spider):
25 | logger.info("COOKIES_PERSISTENCE is %s.", self.settings["COOKIES_PERSISTENCE"])
26 | if not self.settings["COOKIES_PERSISTENCE"]:
27 | return
28 | if not os.path.exists(self.cookies_dir):
29 | logger.info("Cookies dir does not exist.")
30 | return
31 | with io.open(self.cookies_dir, "br") as f:
32 | self.data: Dict = pickle.load(f)
33 | logger.info("The number of restored cookies is %d.", len(self.data))
34 |
35 | def close_spider(self, spider: Spider):
36 | if self.settings["COOKIES_PERSISTENCE"]:
37 | with io.open(self.cookies_dir, "bw") as f:
38 | pickle.dump(self.data, f)
39 | logger.info("The number of saved cookies is %d.", len(self.data))
40 |
41 | def __missing__(self, key) -> CookieJar:
42 | self.data.update({key: CookieJar()})
43 | return self.data[key]
44 |
--------------------------------------------------------------------------------
/scrapy_cookies/storage/mongo.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pickle
3 | import re
4 | from http.cookiejar import Cookie
5 | from itertools import starmap
6 | from typing import Dict
7 |
8 | import pymongo
9 | from pymongo import MongoClient
10 | from pymongo.collection import Collection
11 | from pymongo.database import Database
12 | from scrapy.http.cookies import CookieJar
13 | from scrapy.settings import Settings
14 | from scrapy.spiders import Spider
15 |
16 | from scrapy_cookies.storage import BaseStorage
17 |
18 | logger = logging.getLogger(__name__)
19 | pattern = re.compile("^COOKIES_MONGO_MONGOCLIENT_(?P<kwargs>(?!KWARGS).*)$")
20 |
21 |
22 | def get_arguments(var):
23 | return {str: {"name": var}, dict: var}[type(var)]
24 |
25 |
26 | def write_cookiejar(cookiejar: CookieJar):
27 | return pickle.dumps(cookiejar)
28 |
29 |
30 | def read_cookiejar(document):
31 | try:
32 | return pickle.loads(document["cookiejar"])
33 | except TypeError:
34 | return None
35 |
36 |
37 | def convert_cookiejar(cookiejar):
38 | def _convert_cookies(x):
39 | if isinstance(x, (str, int, bool)):
40 | return x
41 | elif isinstance(x, Cookie):
42 | return dict(
43 | map(
44 | lambda attr: (attr, getattr(x, attr)),
45 | (
46 | "version",
47 | "name",
48 | "value",
49 | "port",
50 | "port_specified",
51 | "domain",
52 | "domain_specified",
53 | "domain_initial_dot",
54 | "path",
55 | "path_specified",
56 | "secure",
57 | "expires",
58 | "discard",
59 | "comment",
60 | "comment_url",
61 | ),
62 | )
63 | )
64 |
65 | elif isinstance(x, dict):
66 | return dict(
67 | starmap(
68 | lambda k, v: (_convert_cookies(k), _convert_cookies(v)), x.items()
69 | )
70 | )
71 |
72 | return _convert_cookies(cookiejar._cookies)
73 |
74 |
75 | class MongoStorage(BaseStorage):
76 | def __init__(self, settings: Settings):
77 | super(MongoStorage, self).__init__(settings)
78 | self.mongo_settings: Dict[str, str] = dict(
79 | starmap(
80 | lambda k, v: (pattern.sub(lambda x: x.group(1).lower(), k), v),
81 | filter(
82 | lambda pair: pattern.match(pair[0]), settings.copy_to_dict().items()
83 | ),
84 | )
85 | )
86 | self.mongo_settings.update(self.settings["COOKIES_MONGO_MONGOCLIENT_KWARGS"])
87 | self.client: MongoClient = None
88 | self.db: Database = None
89 | self.coll: Collection = None
90 |
91 | @classmethod
92 | def from_middleware(cls, middleware):
93 | obj = cls(middleware.settings)
94 | return obj
95 |
96 | def open_spider(self, spider: Spider):
97 | self.client: MongoClient = MongoClient(**self.mongo_settings)
98 |
99 | self.db: Database = self.client.get_database(
100 | **get_arguments(self.settings["COOKIES_MONGO_DATABASE"])
101 | )
102 | self.coll: Collection = self.db.get_collection(
103 | **get_arguments(self.settings["COOKIES_MONGO_COLLECTION"])
104 | )
105 | self.coll.create_index([("key", pymongo.ASCENDING)], unique=True)
106 |
107 | def close_spider(self, spider: Spider):
108 | self.client.close()
109 |
110 | def __missing__(self, k) -> CookieJar:
111 | cookiejar: CookieJar = CookieJar()
112 | self[k] = cookiejar
113 | return cookiejar
114 |
115 | def __delitem__(self, v):
116 | # TODO: finish this method
117 | self.coll.delete_one({})
118 |
119 | def __getitem__(self, k) -> CookieJar:
120 | v: CookieJar = read_cookiejar(self.coll.find_one({"key": k}))
121 | if isinstance(v, CookieJar):
122 | return v
123 | if hasattr(self.__class__, "__missing__"):
124 | return self.__class__.__missing__(self, k)
125 | raise KeyError(k)
126 |
127 | def __iter__(self):
128 | return iter(self.coll.find())
129 |
130 | def __len__(self) -> int:
131 | return self.coll.count_documents({})
132 |
133 | def __setitem__(self, k, v):
134 | self.coll.update_one(
135 | {"key": k},
136 | {
137 | "$set": {
138 | "key": k,
139 | "cookiejar": write_cookiejar(v),
140 | "cookies": convert_cookiejar(v),
141 | }
142 | },
143 | upsert=True,
144 | )
145 |
--------------------------------------------------------------------------------
/scrapy_cookies/storage/redis_.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pickle
3 | import re
4 | from itertools import starmap
5 | from typing import Dict
6 |
7 | import ujson
8 | from redis.client import Redis
9 | from scrapy.http.cookies import CookieJar
10 | from scrapy.settings import Settings
11 | from scrapy.spiders import Spider
12 |
13 | from scrapy_cookies.storage import BaseStorage
14 |
15 | logger = logging.getLogger(__name__)
16 | pattern = re.compile("^COOKIES_REDIS_(?P<kwargs>(?!KWARGS).*)$")
17 |
18 |
19 | def get_arguments(var):
20 | return {str: {"name": var}, dict: var}[type(var)]
21 |
22 |
23 | def write_cookiejar(cookiejar):
24 | return {
25 | "cookiejar": pickle.dumps(cookiejar),
26 | "cookies": ujson.dumps(cookiejar._cookies),
27 | }
28 |
29 |
30 | def read_cookiejar(document):
31 | try:
32 | return pickle.loads(document["cookiejar"])
33 | except (TypeError, KeyError):
34 | return None
35 |
36 |
37 | class RedisStorage(BaseStorage):
38 | def __init__(self, settings: Settings):
39 | super(RedisStorage, self).__init__(settings)
40 | self.redis_settings: Dict[str, str] = dict(
41 | starmap(
42 | lambda k, v: (pattern.sub(lambda x: x.group(1).lower(), k), v),
43 | filter(
44 | lambda pair: pattern.match(pair[0]), settings.copy_to_dict().items()
45 | ),
46 | )
47 | )
48 | self.r: Redis = None
49 |
50 | @classmethod
51 | def from_middleware(cls, middleware):
52 | obj = cls(middleware.settings)
53 | return obj
54 |
55 | def open_spider(self, spider: Spider):
56 | self.r: Redis = Redis(**self.redis_settings)
57 |
58 | def close_spider(self, spider: Spider):
59 | pass
60 |
61 | def __missing__(self, k) -> CookieJar:
62 | cookiejar: CookieJar = CookieJar()
63 | self[k] = cookiejar
64 | return cookiejar
65 |
66 | def __delitem__(self, v):
67 | self.r.delete(v)
68 |
69 | def __getitem__(self, k) -> CookieJar:
70 | v: CookieJar = read_cookiejar(self.r.hgetall(k))
71 | if isinstance(v, CookieJar):
72 | return v
73 | if hasattr(self.__class__, "__missing__"):
74 | return self.__class__.__missing__(self, k)
75 | raise KeyError(k)
76 |
77 | def __iter__(self):
78 | return self.r.scan_iter()
79 |
80 | def __len__(self) -> int:
81 | return self.r.dbsize()
82 |
83 | def __setitem__(self, k, v: CookieJar):
84 | self.r.hmset(name=k, mapping=write_cookiejar(v))
85 |
--------------------------------------------------------------------------------
/scrapy_cookies/storage/sqlite.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import os
4 | import pickle
5 | import sqlite3
6 | from sqlite3 import Connection, Cursor, Row
7 |
8 | from scrapy.http.cookies import CookieJar
9 | from scrapy.settings import Settings
10 | from scrapy.spiders import Spider
11 | from scrapy.utils.project import data_path
12 |
13 | from scrapy_cookies.storage import BaseStorage
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | def adapt_cookiejar(cookiejar: CookieJar) -> bytes:
19 | return pickle.dumps(cookiejar)
20 |
21 |
22 | def convert_cookiejar_and_its_key(cookiejar_or_its_key: bytes):
23 | return pickle.loads(cookiejar_or_its_key)
24 |
25 |
26 | sqlite3.register_adapter(CookieJar, adapt_cookiejar)
27 | sqlite3.register_converter("cookiejar", convert_cookiejar_and_its_key)
28 | sqlite3.register_converter("cookiejar_key", convert_cookiejar_and_its_key)
29 |
30 |
31 | class SQLiteStorage(BaseStorage):
32 | def __init__(self, settings: Settings):
33 | super(SQLiteStorage, self).__init__(settings)
34 | self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"])
35 | self.database: str = settings["COOKIES_SQLITE_DATABASE"]
36 | self.conn: Connection = None
37 | self.cur: Cursor = None
38 |
39 | def open_spider(self, spider: Spider):
40 | self.conn: Connection = sqlite3.connect(
41 | self.database, detect_types=sqlite3.PARSE_COLNAMES, isolation_level=None
42 | )
43 | self.conn.row_factory = sqlite3.Row
44 | self.cur: Cursor = self.conn.cursor()
45 | if self.database == ":memory:":
46 | if self.settings["COOKIES_PERSISTENCE"] and os.path.isfile(
47 | self.cookies_dir
48 | ):
49 | with io.open(self.cookies_dir, "r") as f:
50 | self.cur.executescript(f.read())
51 | return
52 | self.cur.execute(
53 | "CREATE TABLE IF NOT EXISTS cookies ("
54 | "cookiejar_key BLOB PRIMARY KEY UNIQUE, cookiejar BLOB, str TEXT"
55 | ")"
56 | )
57 |
58 | def close_spider(self, spider: Spider):
59 | if self.database == ":memory:" and self.settings["COOKIES_PERSISTENCE"]:
60 | with open(self.cookies_dir, "w") as f:
61 | for line in self.conn.iterdump():
62 | f.write("%s\n" % line)
63 | self.conn.close()
64 |
65 | def __delitem__(self, v):
66 |         self.cur.execute("DELETE FROM cookies WHERE cookiejar_key=?", (pickle.dumps(v),))
67 |
68 | def __getitem__(self, k) -> CookieJar:
69 | result: Row = self.cur.execute(
70 | 'SELECT cookiejar as "cookiejar [CookieJar]" '
71 | "FROM cookies "
72 | "WHERE cookiejar_key=?",
73 | (pickle.dumps(k),),
74 | ).fetchone()
75 | if result:
76 | return result["cookiejar"]
77 | if hasattr(self.__class__, "__missing__"):
78 | return self.__class__.__missing__(self, k)
79 | raise KeyError(k)
80 |
81 | def __iter__(self):
82 | return iter(
83 | self.cur.execute(
84 | 'SELECT cookiejar_key as "cookiejar_key [CookieJar_key]", cookiejar as "cookiejar [CookieJar]" '
85 | "FROM cookies"
86 | ).fetchall()
87 | )
88 |
89 | def __len__(self) -> int:
90 | return self.cur.execute("SELECT COUNT(*) FROM cookies").fetchone()[0]
91 |
92 | def __setitem__(self, k, v: CookieJar) -> None:
93 | self.cur.execute(
94 | "INSERT OR REPLACE INTO cookies (cookiejar_key, cookiejar, str) VALUES (?, ?, ?)",
95 | (pickle.dumps(k), v, str(k)),
96 | )
97 |
98 | def __missing__(self, k) -> CookieJar:
99 | v: CookieJar = CookieJar()
100 | self.__setitem__(k, v)
101 | return v
102 |
103 | def __contains__(self, k) -> bool:
104 | self.cur.execute(
105 | 'SELECT cookiejar as "cookiejar [CookieJar]" '
106 | "FROM cookies "
107 | "WHERE cookiejar_key=?",
108 | (pickle.dumps(k),),
109 | )
110 | return bool(self.cur.fetchone())
111 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_rpm]
2 | doc_files = docs AUTHORS INSTALL LICENSE README.rst
3 |
4 | [bdist_wheel]
5 | universal=1
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from os.path import dirname, join
2 |
3 | from setuptools import find_packages, setup
4 |
5 | with open(join(dirname(__file__), "scrapy_cookies/VERSION"), "rb") as f:
6 | version = f.read().decode("ascii").strip()
7 |
8 |
9 | extras_require = {}
10 |
11 | setup(
12 | name="Scrapy-Cookies",
13 | version=version,
14 | url="https://github.com/grammy-jiang/scrapy-cookies",
15 | description="A middleware of cookies persistence for Scrapy",
16 | long_description=open("README.rst").read(),
17 | author="Scrapedia",
18 | author_email="Scrapedia@outlook.com",
19 | maintainer="Scrapedia",
20 | maintainer_email="Scrapedia@outlook.com",
21 | license="BSD",
22 | packages=find_packages(exclude=("tests", "tests.*")),
23 | include_package_data=True,
24 | zip_safe=False,
25 | classifiers=[
26 | "Framework :: Scrapy",
27 | "Development Status :: 2 - Pre-Alpha",
28 | "Environment :: Plugins",
29 | "Intended Audience :: Developers",
30 | "License :: OSI Approved :: BSD License",
31 | "Operating System :: OS Independent",
32 | "Programming Language :: Python",
33 | "Programming Language :: Python :: 2",
34 | "Programming Language :: Python :: 2.7",
35 | "Programming Language :: Python :: 3",
36 | "Programming Language :: Python :: 3.4",
37 | "Programming Language :: Python :: 3.5",
38 | "Programming Language :: Python :: 3.6",
39 | "Programming Language :: Python :: Implementation :: CPython",
40 | "Programming Language :: Python :: Implementation :: PyPy",
41 | "Topic :: Internet :: WWW/HTTP",
42 | "Topic :: Software Development :: Libraries :: Application Frameworks",
43 | "Topic :: Software Development :: Libraries :: Python Modules",
44 | ],
45 | python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*",
46 | install_requires=["hiredis", "pymongo", "redis", "scrapy", "ujson"],
47 | extras_require=extras_require,
48 | )
49 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapedia/scrapy-cookies/342eaada3b84db4971be09862c34db9f207c0fb7/tests/__init__.py
--------------------------------------------------------------------------------
/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-benchmark
3 | pytest-cov
4 | pytest-docker-compose
5 | pytest-sugar
6 | pytest-twisted
7 | pytest-xdist
8 | testfixtures
9 |
--------------------------------------------------------------------------------
/tests/test_downloadermiddleware_cookies.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from unittest import TestCase
4 |
5 | from scrapy.crawler import Crawler
6 | from scrapy.exceptions import NotConfigured
7 | from scrapy.http import Request, Response
8 | from scrapy.settings import Settings
9 | from scrapy.spiders import Spider
10 | from scrapy.utils.test import get_crawler
11 | from testfixtures import LogCapture
12 |
13 | from scrapy_cookies.downloadermiddlewares.cookies import CookiesMiddleware
14 | from scrapy_cookies.settings import default_settings
15 |
16 |
17 | class CookiesMiddlewareTest(TestCase):
18 | def assertCookieValEqual(self, first, second, msg=None):
19 | cookievaleq = lambda cv: re.split(";\s*", cv.decode("latin1"))
20 | return self.assertEqual(
21 | sorted(cookievaleq(first)), sorted(cookievaleq(second)), msg
22 | )
23 |
24 | def setUp(self):
25 | self.spider = Spider("foo")
26 | settings = Settings()
27 | settings.setmodule(default_settings)
28 | self.crawler = Crawler(Spider, settings)
29 | self.mw = CookiesMiddleware.from_crawler(self.crawler)
30 | self.mw.spider_opened(self.spider)
31 |
32 | def tearDown(self):
33 | self.mw.spider_closed(self.spider)
34 | del self.mw
35 |
36 | def test_basic(self):
37 | req = Request("http://scrapytest.org/")
38 | assert self.mw.process_request(req, self.spider) is None
39 | assert "Cookie" not in req.headers
40 |
41 | headers = {"Set-Cookie": "C1=value1; path=/"}
42 | res = Response("http://scrapytest.org/", headers=headers)
43 | assert self.mw.process_response(req, res, self.spider) is res
44 |
45 | req2 = Request("http://scrapytest.org/sub1/")
46 | assert self.mw.process_request(req2, self.spider) is None
47 | self.assertEqual(req2.headers.get("Cookie"), b"C1=value1")
48 |
49 | def test_setting_false_cookies_enabled(self):
50 | self.assertRaises(
51 | NotConfigured,
52 | CookiesMiddleware.from_crawler,
53 | get_crawler(settings_dict={"COOKIES_ENABLED": False}),
54 | )
55 |
56 | def test_setting_default_cookies_enabled(self):
57 | self.assertIsInstance(
58 | CookiesMiddleware.from_crawler(get_crawler()), CookiesMiddleware
59 | )
60 |
61 | def test_setting_true_cookies_enabled(self):
62 | self.assertIsInstance(
63 | CookiesMiddleware.from_crawler(
64 | get_crawler(settings_dict={"COOKIES_ENABLED": True})
65 | ),
66 | CookiesMiddleware,
67 | )
68 |
69 | def test_setting_enabled_cookies_debug(self):
70 | crawler = get_crawler(settings_dict={"COOKIES_DEBUG": True})
71 | mw = CookiesMiddleware.from_crawler(crawler)
72 | mw.spider_opened(self.spider)
73 | with LogCapture(
74 | "scrapy_cookies.downloadermiddlewares.cookies",
75 | propagate=False,
76 | level=logging.DEBUG,
77 | ) as l:
78 | req = Request("http://scrapytest.org/")
79 | res = Response(
80 | "http://scrapytest.org/", headers={"Set-Cookie": "C1=value1; path=/"}
81 | )
82 | mw.process_response(req, res, crawler.spider)
83 | req2 = Request("http://scrapytest.org/sub1/")
84 | mw.process_request(req2, crawler.spider)
85 |
86 | l.check(
87 | (
88 | "scrapy_cookies.downloadermiddlewares.cookies",
89 | "DEBUG",
90 | "Received cookies from: <200 http://scrapytest.org/>\n"
91 | "Set-Cookie: C1=value1; path=/\n",
92 | ),
93 | (
94 | "scrapy_cookies.downloadermiddlewares.cookies",
95 | "DEBUG",
96 | "Sending cookies to: \n"
97 | "Cookie: C1=value1\n",
98 | ),
99 | )
100 |
101 | def test_setting_disabled_cookies_debug(self):
102 | crawler = get_crawler(settings_dict={"COOKIES_DEBUG": False})
103 | mw = CookiesMiddleware.from_crawler(crawler)
104 | mw.spider_opened(self.spider)
105 | with LogCapture(
106 | "scrapy_cookies.downloadermiddlewares.cookies",
107 | propagate=False,
108 | level=logging.DEBUG,
109 | ) as l:
110 | req = Request("http://scrapytest.org/")
111 | res = Response(
112 | "http://scrapytest.org/", headers={"Set-Cookie": "C1=value1; path=/"}
113 | )
114 | mw.process_response(req, res, crawler.spider)
115 | req2 = Request("http://scrapytest.org/sub1/")
116 | mw.process_request(req2, crawler.spider)
117 |
118 | l.check()
119 |
120 | def test_do_not_break_on_non_utf8_header(self):
121 | req = Request("http://scrapytest.org/")
122 | assert self.mw.process_request(req, self.spider) is None
123 | assert "Cookie" not in req.headers
124 |
125 | headers = {"Set-Cookie": b"C1=in\xa3valid; path=/", "Other": b"ignore\xa3me"}
126 | res = Response("http://scrapytest.org/", headers=headers)
127 | assert self.mw.process_response(req, res, self.spider) is res
128 |
129 | req2 = Request("http://scrapytest.org/sub1/")
130 | assert self.mw.process_request(req2, self.spider) is None
131 | self.assertIn("Cookie", req2.headers)
132 |
133 | def test_dont_merge_cookies(self):
134 | # merge some cookies into jar
135 | headers = {"Set-Cookie": "C1=value1; path=/"}
136 | req = Request("http://scrapytest.org/")
137 | res = Response("http://scrapytest.org/", headers=headers)
138 | assert self.mw.process_response(req, res, self.spider) is res
139 |
140 |         # test Cookie header is not set on the request
141 | req = Request("http://scrapytest.org/dontmerge", meta={"dont_merge_cookies": 1})
142 | assert self.mw.process_request(req, self.spider) is None
143 | assert "Cookie" not in req.headers
144 |
145 | # check that returned cookies are not merged back to jar
146 | res = Response(
147 | "http://scrapytest.org/dontmerge",
148 | headers={"Set-Cookie": "dont=mergeme; path=/"},
149 | )
150 | assert self.mw.process_response(req, res, self.spider) is res
151 |
152 | # check that cookies are merged back
153 | req = Request("http://scrapytest.org/mergeme")
154 | assert self.mw.process_request(req, self.spider) is None
155 | self.assertEqual(req.headers.get("Cookie"), b"C1=value1")
156 |
157 | # check that cookies are merged when dont_merge_cookies is passed as 0
158 | req = Request("http://scrapytest.org/mergeme", meta={"dont_merge_cookies": 0})
159 | assert self.mw.process_request(req, self.spider) is None
160 | self.assertEqual(req.headers.get("Cookie"), b"C1=value1")
161 |
162 | def test_complex_cookies(self):
163 | # merge some cookies into jar
164 | cookies = [
165 | {
166 | "name": "C1",
167 | "value": "value1",
168 | "path": "/foo",
169 | "domain": "scrapytest.org",
170 | },
171 | {
172 | "name": "C2",
173 | "value": "value2",
174 | "path": "/bar",
175 | "domain": "scrapytest.org",
176 | },
177 | {
178 | "name": "C3",
179 | "value": "value3",
180 | "path": "/foo",
181 | "domain": "scrapytest.org",
182 | },
183 | {"name": "C4", "value": "value4", "path": "/foo", "domain": "scrapy.org"},
184 | ]
185 |
186 | req = Request("http://scrapytest.org/", cookies=cookies)
187 | self.mw.process_request(req, self.spider)
188 |
189 | # embed C1 and C3 for scrapytest.org/foo
190 | req = Request("http://scrapytest.org/foo")
191 | self.mw.process_request(req, self.spider)
192 | assert req.headers.get("Cookie") in (
193 | b"C1=value1; C3=value3",
194 | b"C3=value3; C1=value1",
195 | )
196 |
197 | # embed C2 for scrapytest.org/bar
198 | req = Request("http://scrapytest.org/bar")
199 | self.mw.process_request(req, self.spider)
200 | self.assertEqual(req.headers.get("Cookie"), b"C2=value2")
201 |
202 | # embed nothing for scrapytest.org/baz
203 | req = Request("http://scrapytest.org/baz")
204 | self.mw.process_request(req, self.spider)
205 | assert "Cookie" not in req.headers
206 |
207 | def test_merge_request_cookies(self):
208 | req = Request("http://scrapytest.org/", cookies={"galleta": "salada"})
209 | assert self.mw.process_request(req, self.spider) is None
210 | self.assertEqual(req.headers.get("Cookie"), b"galleta=salada")
211 |
212 | headers = {"Set-Cookie": "C1=value1; path=/"}
213 | res = Response("http://scrapytest.org/", headers=headers)
214 | assert self.mw.process_response(req, res, self.spider) is res
215 |
216 | req2 = Request("http://scrapytest.org/sub1/")
217 | assert self.mw.process_request(req2, self.spider) is None
218 |
219 | self.assertCookieValEqual(
220 | req2.headers.get("Cookie"), b"C1=value1; galleta=salada"
221 | )
222 |
223 | def test_cookiejar_key(self):
224 | req = Request(
225 | "http://scrapytest.org/",
226 | cookies={"galleta": "salada"},
227 | meta={"cookiejar": "store1"},
228 | )
229 | assert self.mw.process_request(req, self.spider) is None
230 | self.assertEqual(req.headers.get("Cookie"), b"galleta=salada")
231 |
232 | headers = {"Set-Cookie": "C1=value1; path=/"}
233 | res = Response("http://scrapytest.org/", headers=headers, request=req)
234 | assert self.mw.process_response(req, res, self.spider) is res
235 |
236 | req2 = Request("http://scrapytest.org/", meta=res.meta)
237 | assert self.mw.process_request(req2, self.spider) is None
238 | self.assertCookieValEqual(
239 | req2.headers.get("Cookie"), b"C1=value1; galleta=salada"
240 | )
241 |
242 | req3 = Request(
243 | "http://scrapytest.org/",
244 | cookies={"galleta": "dulce"},
245 | meta={"cookiejar": "store2"},
246 | )
247 | assert self.mw.process_request(req3, self.spider) is None
248 | self.assertEqual(req3.headers.get("Cookie"), b"galleta=dulce")
249 |
250 | headers = {"Set-Cookie": "C2=value2; path=/"}
251 | res2 = Response("http://scrapytest.org/", headers=headers, request=req3)
252 | assert self.mw.process_response(req3, res2, self.spider) is res2
253 |
254 | req4 = Request("http://scrapytest.org/", meta=res2.meta)
255 | assert self.mw.process_request(req4, self.spider) is None
256 | self.assertCookieValEqual(
257 | req4.headers.get("Cookie"), b"C2=value2; galleta=dulce"
258 | )
259 |
260 | # cookies from hosts with port
261 | req5_1 = Request("http://scrapytest.org:1104/")
262 | assert self.mw.process_request(req5_1, self.spider) is None
263 |
264 | headers = {"Set-Cookie": "C1=value1; path=/"}
265 | res5_1 = Response(
266 | "http://scrapytest.org:1104/", headers=headers, request=req5_1
267 | )
268 | assert self.mw.process_response(req5_1, res5_1, self.spider) is res5_1
269 |
270 | req5_2 = Request("http://scrapytest.org:1104/some-redirected-path")
271 | assert self.mw.process_request(req5_2, self.spider) is None
272 | self.assertEqual(req5_2.headers.get("Cookie"), b"C1=value1")
273 |
274 | req5_3 = Request("http://scrapytest.org/some-redirected-path")
275 | assert self.mw.process_request(req5_3, self.spider) is None
276 | self.assertEqual(req5_3.headers.get("Cookie"), b"C1=value1")
277 |
278 | # skip cookie retrieval for not http request
279 | req6 = Request("file:///scrapy/sometempfile")
280 | assert self.mw.process_request(req6, self.spider) is None
281 | self.assertEqual(req6.headers.get("Cookie"), None)
282 |
283 | def test_local_domain(self):
284 | request = Request("http://example-host/", cookies={"currencyCookie": "USD"})
285 | assert self.mw.process_request(request, self.spider) is None
286 | self.assertIn("Cookie", request.headers)
287 | self.assertEqual(b"currencyCookie=USD", request.headers["Cookie"])
288 |
--------------------------------------------------------------------------------
/tests/test_storages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapedia/scrapy-cookies/342eaada3b84db4971be09862c34db9f207c0fb7/tests/test_storages/__init__.py
--------------------------------------------------------------------------------
/tests/test_storages/confest.py:
--------------------------------------------------------------------------------
1 | pytest_plugins = ["docker_compose"]
2 |
--------------------------------------------------------------------------------
/tests/test_storages/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 | mongo:
4 | container_name: dc-pytest-scrapy-cookies-mongo
5 | image: mongo:latest
6 | networks:
7 | - pytest_scrapy_cookies
8 | ports:
9 | - "127.0.0.1:27017:27017"
10 | restart: always
11 | tty: true
12 | redis:
13 | container_name: dc-pytest-scrapy-cookies-redis
14 | image: redis:latest
15 | networks:
16 | - pytest_scrapy_cookies
17 | ports:
18 | - "127.0.0.1:6379:6379"
19 | restart: always
20 | tty: true
21 |
22 | networks:
23 | pytest_scrapy_cookies:
24 | driver: bridge
25 |
--------------------------------------------------------------------------------
/tests/test_storages/test_storage_in_memory.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from copy import deepcopy
4 | from unittest import TestCase
5 |
6 | from scrapy import Spider
7 | from scrapy.http.cookies import CookieJar
8 | from scrapy.settings import Settings
9 |
10 | from scrapy_cookies.settings import default_settings
11 | from scrapy_cookies.storage.in_memory import InMemoryStorage
12 |
13 |
14 | class StorageTest(TestCase):
15 | def setUp(self):
16 | self.spider = Spider("foo")
17 | self.settings = Settings()
18 | self.settings.setmodule(default_settings)
19 |
20 | def tearDown(self):
21 | pass
22 |
23 | def test_in_memory(self):
24 | tmpdir = tempfile.mkdtemp()
25 | local_settings = {
26 | "COOKIES_PERSISTENCE": True,
27 | "COOKIES_PERSISTENCE_DIR": tmpdir + "/cookies",
28 | }
29 | settings = deepcopy(self.settings)
30 | settings.setdict(local_settings)
31 |
32 | storage = InMemoryStorage(settings)
33 | storage.open_spider(self.spider)
34 |
35 | cookie = storage["no_key"]
36 | self.assertIsInstance(cookie, CookieJar)
37 | self.assertDictEqual(cookie._cookies, CookieJar()._cookies)
38 |
39 | storage["key_1"] = CookieJar()
40 | self.assertIn("key_1", storage)
41 | self.assertEqual(storage["key_1"]._cookies, CookieJar()._cookies)
42 |
43 | storage.close_spider(self.spider)
44 | self.assertTrue(os.path.isfile(tmpdir + "/cookies"))
45 |
--------------------------------------------------------------------------------
/tests/test_storages/test_storage_mongo.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from collections.abc import Iterable
3 | from unittest import TestCase
4 |
5 | from pytest import mark
6 | from scrapy import Spider
7 | from scrapy.http.cookies import CookieJar
8 | from scrapy.settings import Settings
9 |
10 | from scrapy_cookies.settings import default_settings
11 | from scrapy_cookies.storage.mongo import MongoStorage
12 |
13 |
14 | @mark.usefixtures("class_scoped_container_getter")
15 | class MongoStorageTest(TestCase):
16 | local_settings = {
17 | "COOKIES_STORAGE": "scrapy_cookies.storage.mongo.MongoStorage",
18 | "COOKIES_MONGO_MONGOCLIENT_HOST": "localhost",
19 | "COOKIES_MONGO_MONGOCLIENT_PORT": 27017,
20 | "COOKIES_MONGO_MONGOCLIENT_DOCUMENT_CLASS": dict,
21 | "COOKIES_MONGO_MONGOCLIENT_TZ_AWARE": False,
22 | "COOKIES_MONGO_MONGOCLIENT_CONNECT": True,
23 | "COOKIES_MONGO_MONGOCLIENT_KWARGS": {},
24 | "COOKIES_MONGO_DATABASE": "cookies",
25 | "COOKIES_MONGO_COLLECTION": "cookies",
26 | }
27 |
28 | def setUp(self):
29 | self.spider = Spider("foo")
30 | self.settings = Settings()
31 | self.settings.setmodule(default_settings)
32 | self.settings.setdict(self.local_settings)
33 | self.storage = MongoStorage(self.settings)
34 | self.storage.open_spider(self.spider)
35 |
36 | def tearDown(self):
37 | self.storage.close_spider(self.spider)
38 | self.storage.coll.delete_many({})
39 |
40 | def test_getitem(self):
41 | cookies = CookieJar()
42 | self.storage.coll.insert_one(
43 | {
44 | "key": "new_cookies",
45 | "cookiejar": pickle.dumps(cookies),
46 | "cookies": cookies._cookies,
47 | }
48 | )
49 |
50 | self.assertDictEqual(self.storage["new_cookies"]._cookies, cookies._cookies)
51 |
52 | def test_missing(self):
53 | self.assertDictEqual(
54 | self.storage["no_exist_cookies"]._cookies, CookieJar()._cookies
55 | )
56 |
57 | def test_setitem(self):
58 | cookies = CookieJar()
59 | self.storage["new_cookies"] = cookies
60 | self.assertDictEqual(
61 | self.storage.coll.find_one({"key": "new_cookies"}, {"_id": 0}),
62 | {
63 | "key": "new_cookies",
64 | "cookiejar": pickle.dumps(cookies),
65 | "cookies": cookies._cookies,
66 | },
67 | )
68 |
69 | def test_iter(self):
70 | self.assertIsInstance(self.storage, Iterable)
71 |
--------------------------------------------------------------------------------
/tests/test_storages/test_storage_redis.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from collections.abc import Iterable
3 | from unittest import TestCase
4 |
5 | import ujson
6 | from pytest import mark
7 | from scrapy import Spider
8 | from scrapy.http.cookies import CookieJar
9 | from scrapy.settings import Settings
10 |
11 | from scrapy_cookies.settings import default_settings
12 | from scrapy_cookies.storage.redis_ import RedisStorage
13 |
14 |
15 | @mark.usefixtures("class_scoped_container_getter")
16 | class RedisStorageTest(TestCase):
17 | maxDiff = None
18 | local_settings = {}
19 |
20 | def setUp(self):
21 | self.spider = Spider("foo")
22 | self.settings = Settings()
23 | self.settings.setmodule(default_settings)
24 | self.settings.setdict(self.local_settings)
25 | self.storage = RedisStorage(self.settings)
26 | self.storage.open_spider(self.spider)
27 |
28 | def tearDown(self):
29 | self.storage.close_spider(self.spider)
30 | self.storage.r.flushall()
31 |
32 | def test_getitem(self):
33 | cookies = CookieJar()
34 | self.storage.r.hmset(
35 | "new_cookies",
36 | {
37 | "cookiejar": pickle.dumps(cookies),
38 | "cookies": ujson.dumps(cookies._cookies),
39 | },
40 | )
41 | self.assertDictEqual(self.storage["new_cookies"]._cookies, cookies._cookies)
42 |
43 | def test_missing(self):
44 | self.assertDictEqual(
45 | self.storage["no_exist_cookies"]._cookies, CookieJar()._cookies
46 | )
47 |
48 | def test_setitem(self):
49 | cookies = CookieJar()
50 | self.storage["new_cookies"] = cookies
51 | _ = self.storage.r.hgetall("new_cookies")
52 | self.assertDictEqual(
53 | pickle.loads(self.storage.r.hgetall("new_cookies")[b"cookiejar"])._cookies,
54 | cookies._cookies,
55 | )
56 | self.assertDictEqual(
57 | self.storage.r.hgetall("new_cookies"),
58 | {
59 | b"cookiejar": pickle.dumps(cookies),
60 | b"cookies": ujson.dumps(cookies._cookies).encode(),
61 | },
62 | )
63 |
64 | def test_iter(self):
65 | self.assertIsInstance(self.storage, Iterable)
66 |
67 | def test_len(self):
68 | self.assertEqual(len(self.storage), 0)
69 | self.storage["new_cookies_1"] = CookieJar()
70 | self.assertEqual(len(self.storage), 1)
71 | self.storage["new_cookies_2"] = CookieJar()
72 | self.assertEqual(len(self.storage), 2)
73 |
74 | def test_delitem(self):
75 | self.storage["new_cookies"] = CookieJar()
76 | del self.storage["new_cookies"]
77 | self.assertFalse(self.storage.r.hgetall("new_cookies"))
78 |
--------------------------------------------------------------------------------
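
Note: test_setitem above pins down the stored layout: each cookiejar key becomes a Redis hash whose "cookiejar" field holds the pickled CookieJar and whose "cookies" field holds the jar's _cookies dict as ujson for readability. A minimal sketch of inspecting such a hash with a plain redis-py client, assuming the fixture's Redis on the default localhost port and a key named "new_cookies" already written by the storage:

    import pickle

    import redis
    import ujson

    r = redis.Redis()                      # defaults: localhost:6379, db 0
    stored = r.hgetall("new_cookies")      # one hash per cookiejar key

    jar = pickle.loads(stored[b"cookiejar"])          # the full CookieJar object
    raw = ujson.loads(stored[b"cookies"].decode())    # the same jar's _cookies dict, JSON-decoded

As a side note, r.hmset(), used in test_getitem, is deprecated in newer redis-py releases in favour of r.hset(name, mapping={...}); the assertions themselves are unaffected.
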
/tests/test_storages/test_storage_sqlite.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from copy import deepcopy
4 | from unittest import TestCase
5 |
6 | from scrapy import Spider
7 | from scrapy.http.cookies import CookieJar
8 | from scrapy.settings import Settings
9 |
10 | from scrapy_cookies.settings import default_settings
11 | from scrapy_cookies.storage.sqlite import SQLiteStorage
12 |
13 |
14 | class SQLiteStorageTest(TestCase):
15 | def setUp(self):
16 | self.spider = Spider("foo")
17 | self.settings = Settings()
18 | self.settings.setmodule(default_settings)
19 |
20 | def tearDown(self):
21 | pass
22 |
23 | def test_sqlite(self):
24 | tmpdir = tempfile.mkdtemp()
25 | local_settings = {
26 | "COOKIES_STORAGE": "scrapy_cookies.storage.sqlite.SQLiteStorage",
27 | "COOKIES_SQLITE_DATABASE": ":memory:",
28 | "COOKIES_PERSISTENCE": True,
29 | "COOKIES_PERSISTENCE_DIR": tmpdir + "/cookies",
30 | }
31 | settings = deepcopy(self.settings)
32 | settings.setdict(local_settings)
33 |
34 | storage = SQLiteStorage(settings)
35 | storage.open_spider(self.spider)
36 |
37 | cookie = storage["no_key"]
38 | self.assertIn("no_key", storage)
39 | self.assertIsInstance(cookie, CookieJar)
40 | self.assertEqual(cookie._cookies, CookieJar()._cookies)
41 |
42 | storage["key_1"] = CookieJar()
43 | self.assertIn("key_1", storage)
44 | self.assertEqual(storage["key_1"]._cookies, CookieJar()._cookies)
45 |
46 | self.assertNotIn("key_2", storage)
47 |
48 | self.assertEqual(len(storage), 2)
49 |
50 | _dict = {"no_key": CookieJar()._cookies, "key_1": CookieJar()._cookies}
51 | for k, v in storage:
52 | self.assertDictEqual(v._cookies, _dict[k])
53 |
54 | storage.close_spider(self.spider)
55 | self.assertTrue(os.path.isfile(tmpdir + "/cookies"))
56 |
57 | storage_2 = SQLiteStorage(settings)
58 | storage_2.open_spider(self.spider)
59 | self.assertIn("key_1", storage_2)
60 | self.assertDictEqual(storage_2["key_1"]._cookies, CookieJar()._cookies)
61 |
62 | storage_2.close_spider(self.spider)
63 | self.assertTrue(os.path.isfile(tmpdir + "/cookies"))
64 |
--------------------------------------------------------------------------------
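
Note: the SQLite test demonstrates the persistence round trip: with COOKIES_SQLITE_DATABASE set to ":memory:" and COOKIES_PERSISTENCE enabled, the working database lives in memory, is dumped to the COOKIES_PERSISTENCE_DIR path when the spider closes, and is loaded back the next time a storage is opened (which is why storage_2 still sees "key_1"). In a Scrapy project the same keys go into settings.py; a minimal sketch using the values from the test, where the persistence path is an arbitrary example:

    # settings.py fragment -- only the keys exercised by the test above
    COOKIES_STORAGE = "scrapy_cookies.storage.sqlite.SQLiteStorage"
    COOKIES_SQLITE_DATABASE = ":memory:"   # keep the live database in memory
    COOKIES_PERSISTENCE = True             # write it out on close_spider()
    COOKIES_PERSISTENCE_DIR = "cookies"    # file the dump is written to and reloaded from
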
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox (https://tox.readthedocs.io/) is a tool for running tests
2 | # in multiple virtualenvs. This configuration file will run the
3 | # test suite on all supported Python versions. To use it, "pip install tox"
4 | # and then run "tox" from this directory.
5 |
6 | [tox]
7 | envlist = py36,py37
8 |
9 | [testenv]
10 | commands =
11 | pytest
12 | deps =
13 | -r requirements.txt
14 | -r tests/requirements.txt
15 | passenv =
16 | PYTHONPATH
17 |
18 | [docs]
19 | changedir = docs
20 | deps =
21 | -r docs/requirements.txt
22 |
23 | [testenv:docs]
24 | changedir = {[docs]changedir}
25 | deps = {[docs]deps}
26 | commands =
27 | sphinx-build -W -b html . {envtmpdir}/html
28 |
29 | [testenv:docs-links]
30 | changedir = {[docs]changedir}
31 | deps = {[docs]deps}
32 | commands =
33 | sphinx-build -W -b linkcheck . {envtmpdir}/linkcheck
34 |
--------------------------------------------------------------------------------
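
Note: with the environments above, running "tox" executes the pytest suite under Python 3.6 and 3.7, while "tox -e docs" builds the HTML documentation and "tox -e docs-links" runs Sphinx's link checker, both with warnings treated as errors (-W).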