├── .gitignore
├── LICENCE
├── MANIFEST.in
├── README.md
├── requirements.txt
├── scrapy_cloudflare_middleware
│   ├── __init__.py
│   └── middlewares.py
├── setup.cfg
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
reports/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask instance folder
instance/

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
.tmpdocs/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv
venv/
venv-jenkins*/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# .idea is the directory for pycharm project files
.idea

# MACOS stuff
.DS_Store

--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2018 Clément Denoix

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include requirements.txt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![PyPI](https://img.shields.io/pypi/v/scrapy_cloudflare_middleware.svg)](https://pypi.python.org/pypi/scrapy_cloudflare_middleware)

## Scrapy "CloudFlare" middleware

A Scrapy middleware to bypass CloudFlare's anti-bot protection, based on [cloudflare-scrape](https://github.com/Anorov/cloudflare-scrape).
### Installation

```
pip install scrapy_cloudflare_middleware
```

### Usage

Add the middleware to your `DOWNLOADER_MIDDLEWARES` setting:

```python
DOWNLOADER_MIDDLEWARES = {
    # The priority of 560 is important: we want this middleware to kick in just before Scrapy's built-in `RetryMiddleware` (550).
    'scrapy_cloudflare_middleware.middlewares.CloudFlareMiddleware': 560
}
```
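
The middleware can also be enabled for a single spider through Scrapy's `custom_settings`. The sketch below is only an illustration: the spider name, the start URL and the parsed field are placeholders, not something provided by this package.

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    """Placeholder spider showing the middleware enabled per spider."""

    name = 'example'
    # Replace with a site you are actually allowed to crawl
    start_urls = ['https://example.com']

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            # Same priority as above: just before the built-in RetryMiddleware (550)
            'scrapy_cloudflare_middleware.middlewares.CloudFlareMiddleware': 560
        }
    }

    def parse(self, response):
        # By the time parse() runs, the CloudFlare challenge (if any) has been solved
        yield {'title': response.css('title::text').extract_first()}
```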
Done.
Happy scraping!

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Scrapy>=1.0
cfscrape>=1.9.4

--------------------------------------------------------------------------------
/scrapy_cloudflare_middleware/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clemfromspace/scrapy-cloudflare-middleware/03ccea9164418dabe8180053165b0da0bffa0741/scrapy_cloudflare_middleware/__init__.py

--------------------------------------------------------------------------------
/scrapy_cloudflare_middleware/middlewares.py:
--------------------------------------------------------------------------------
"""This module contains the ``CloudFlareMiddleware``"""

import logging

from cfscrape import get_tokens


class CloudFlareMiddleware:
    """Scrapy middleware to bypass CloudFlare's anti-bot protection"""

    @staticmethod
    def is_cloudflare_challenge(response):
        """Test if the given response contains the CloudFlare anti-bot challenge"""

        return (
            response.status == 503
            and response.headers.get('Server', b'').startswith(b'cloudflare')
            and 'jschl_vc' in response.text
            and 'jschl_answer' in response.text
        )

    def process_response(self, request, response, spider):
        """Handle a Scrapy response"""

        if not self.is_cloudflare_challenge(response):
            return response

        logger = logging.getLogger('cloudflaremiddleware')

        logger.debug(
            'Cloudflare protection detected on %s, trying to bypass...',
            response.url
        )

        # Let cfscrape solve the javascript challenge and return the clearance cookies
        cloudflare_tokens, __ = get_tokens(
            request.url,
            user_agent=spider.settings.get('USER_AGENT')
        )

        logger.debug(
            'Successfully bypassed the protection for %s, re-scheduling the request',
            response.url
        )

        # Re-schedule the original request with the clearance cookies attached,
        # using a high priority so it runs before the rest of the queue
        request.cookies.update(cloudflare_tokens)
        request.priority = 99999

        return request

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
name = scrapy_cloudflare_middleware
version = 0.0.1
url = https://github.com/clemfromspace/scrapy-cloudflare-middleware
license = DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
description = A Scrapy middleware to bypass CloudFlare's anti-bot protection
long_description = file:README.md

[options]
include_package_data = true

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""This module contains the packaging routine for the ``scrapy-cloudflare-middleware`` package"""

from setuptools import setup, find_packages


def get_requirements(source):
    """Get the requirements from the given ``source`` file

    Parameters
    ----------
    source: str
        The filename containing the requirements

    """

    # Read the requirements file directly instead of relying on pip internals
    # (``pip.req.parse_requirements`` is not part of pip's public API).
    with open(source) as requirements_file:
        return [
            line.strip()
            for line in requirements_file
            if line.strip() and not line.startswith('#')
        ]


setup(
    packages=find_packages(),
    install_requires=get_requirements('requirements.txt')
)

--------------------------------------------------------------------------------