├── .gitignore
├── LICENCE
├── MANIFEST.in
├── README.md
├── requirements.txt
├── scrapy_cloudflare_middleware
│   ├── __init__.py
│   └── middlewares.py
├── setup.cfg
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
reports/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask instance folder
instance/

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
.tmpdocs/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv
venv/
venv-jenkins*/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# .idea is the directory for pycharm project files
.idea

# MACOS stuff
.DS_Store

--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2018 Clément Denoix

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include requirements.txt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![PyPI](https://img.shields.io/pypi/v/scrapy_cloudflare_middleware.svg)](https://pypi.python.org/pypi/scrapy_cloudflare_middleware)

## Scrapy "CloudFlare" middleware

A Scrapy middleware to bypass CloudFlare's anti-bot protection, based on [cloudflare-scrape](https://github.com/Anorov/cloudflare-scrape).
### Installation

```
pip install scrapy_cloudflare_middleware
```

### Usage

Add the middleware to your `DOWNLOADER_MIDDLEWARES` setting:

```python
DOWNLOADER_MIDDLEWARES = {
    # The priority of 560 is important: we want this middleware to kick in just before Scrapy's built-in `RetryMiddleware` (550).
    'scrapy_cloudflare_middleware.middlewares.CloudFlareMiddleware': 560
}
```
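
The middleware can also be enabled for a single spider through Scrapy's `custom_settings`. The sketch below is only an illustration: the spider name, the start URL and the parsed field are placeholders, not something provided by this package.

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    """Placeholder spider showing the middleware enabled per spider."""

    name = 'example'
    # Replace with a site you are actually allowed to crawl
    start_urls = ['https://example.com']

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            # Same priority as above: just before the built-in RetryMiddleware (550)
            'scrapy_cloudflare_middleware.middlewares.CloudFlareMiddleware': 560
        }
    }

    def parse(self, response):
        # By the time parse() runs, the CloudFlare challenge (if any) has been solved
        yield {'title': response.css('title::text').extract_first()}
```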
Done.
Happy scraping!

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Scrapy>=1.0
cfscrape>=1.9.4

--------------------------------------------------------------------------------
/scrapy_cloudflare_middleware/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clemfromspace/scrapy-cloudflare-middleware/03ccea9164418dabe8180053165b0da0bffa0741/scrapy_cloudflare_middleware/__init__.py

--------------------------------------------------------------------------------
/scrapy_cloudflare_middleware/middlewares.py:
--------------------------------------------------------------------------------
"""This module contains the ``CloudFlareMiddleware``"""

import logging

from cfscrape import get_tokens


class CloudFlareMiddleware:
    """Scrapy middleware to bypass CloudFlare's anti-bot protection"""

    @staticmethod
    def is_cloudflare_challenge(response):
        """Test if the given response contains the CloudFlare anti-bot challenge"""

        return (
            response.status == 503
            and response.headers.get('Server', b'').startswith(b'cloudflare')
            and 'jschl_vc' in response.text
            and 'jschl_answer' in response.text
        )

    def process_response(self, request, response, spider):
        """Handle a Scrapy response"""

        if not self.is_cloudflare_challenge(response):
            return response

        logger = logging.getLogger('cloudflaremiddleware')

        logger.debug(
            'Cloudflare protection detected on %s, trying to bypass...',
            response.url
        )

        # Let cfscrape solve the javascript challenge and return the clearance cookies
        cloudflare_tokens, __ = get_tokens(
            request.url,
            user_agent=spider.settings.get('USER_AGENT')
        )

        logger.debug(
            'Successfully bypassed the protection for %s, re-scheduling the request',
            response.url
        )

        # Re-schedule the original request with the clearance cookies attached,
        # using a high priority so it runs before the rest of the queue
        request.cookies.update(cloudflare_tokens)
        request.priority = 99999

        return request

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
name = scrapy_cloudflare_middleware
version = 0.0.1
url = https://github.com/clemfromspace/scrapy-cloudflare-middleware
license = DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
description = A Scrapy middleware to bypass CloudFlare's anti-bot protection
long_description = file:README.md

[options]
include_package_data = true

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""This module contains the packaging routine for the ``scrapy-cloudflare-middleware`` package"""

from setuptools import setup, find_packages


def get_requirements(source):
    """Get the requirements from the given ``source`` file

    Parameters
    ----------
    source: str
        The filename containing the requirements

    """

    # Read the requirements file directly instead of relying on pip internals
    # (``pip.req.parse_requirements`` is not part of pip's public API).
    with open(source) as requirements_file:
        return [
            line.strip()
            for line in requirements_file
            if line.strip() and not line.startswith('#')
        ]


setup(
    packages=find_packages(),
    install_requires=get_requirements('requirements.txt')
)

--------------------------------------------------------------------------------