├── .coveragerc
├── .github
│   └── ISSUE_TEMPLATE
│       └── bug-report-template.md
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── Makefile
├── Pipfile
├── Pipfile.lock
├── README.md
├── appveyor.yml
├── cfscrape
│   ├── __init__.py
│   └── user_agents.py
├── pytest.ini
├── setup.py
├── tests
│   ├── __init__.py
│   ├── fixtures
│   │   ├── cf_recaptcha_15_04_2019.html
│   │   ├── js_challenge_03_12_2018.html
│   │   ├── js_challenge_09_06_2016.html
│   │   ├── js_challenge_10_04_2019.html
│   │   ├── js_challenge_13_03_2019.html
│   │   ├── js_challenge_21_03_2019.html
│   │   ├── js_challenge_21_05_2015.html
│   │   ├── js_challenge_30_11_2019.html
│   │   └── requested_page.html
│   ├── test_adapters.py
│   └── test_cfscrape.py
└── tox.ini

/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source = cfscrape
3 | omit =
4 |     *test*
5 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report-template.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report template
 3 | about: For reporting issues and errors
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | Before creating an issue, first upgrade cfscrape with `pip install -U cfscrape` and see if you're still experiencing the problem. Please also confirm your Node version (`node --version` or `nodejs --version`) is version 10 or higher.
11 | 
12 | Make sure the website you're having issues with is actually using anti-bot protection by Cloudflare and not a competitor like Imperva Incapsula or Sucuri. And if you're using an anonymizing proxy, a VPN, or Tor, Cloudflare often flags those IPs and may block you or present you with a captcha as a result.
13 | 
14 | Please **confirm the following statements and check the boxes** before creating an issue:
15 | 
16 | - [ ] I've upgraded cfscrape with `pip install -U cfscrape`
17 | - [ ] I'm using Node version 10 or higher
18 | - [ ] The site protection I'm having issues with is from Cloudflare
19 | - [ ] I'm not using Tor, a VPN, or an anonymizing proxy
20 | 
21 | ## Python version number
22 | 
23 | Run `python --version` and paste the output below:
24 | 
25 | ```
26 | 
27 | ```
28 | 
29 | ## cfscrape version number
30 | 
31 | Run `pip show cfscrape` and paste the output below:
32 | 
33 | ```
34 | 
35 | ```
36 | 
37 | ## Code snippet involved with the issue
38 | 
39 | ```
40 | 
41 | ```
42 | 
43 | ## Complete exception and traceback
44 | 
45 | (*If the problem doesn't involve an exception being raised, leave this blank*)
46 | 
47 | ```
48 | 
49 | ```
50 | 
51 | ## URL of the Cloudflare-protected page
52 | 
53 | [LINK GOES HERE]
54 | 
55 | ## URL of Pastebin/Gist with HTML source of protected page
56 | 
57 | [LINK GOES HERE]
58 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # Tests / Coverage
27 | report.xml
28 | coverage.xml
29 | .coverage
30 | .tox/
31 | .testmondata
32 | 
33 | # IDE
34 | .idea/
35 | 
36 | # Prevent unintended commits
37 | .env
38 | 
39 | # Temp files
40 | *.swp
41 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | dist: trusty 4 | sudo: false 5 | 6 | matrix: 7 | include: 8 | - python: '2.7' 9 | - python: '3.4' 10 | - python: '3.5' 11 | - python: '3.6' 12 | - python: '3.7' 13 | dist: xenial 14 | sudo: true 15 | - env: cfscrape_node='4.5' 16 | - env: cfscrape_node='node' 17 | - os: osx 18 | language: node_js 19 | node_js: node 20 | cache: 21 | directories: 22 | - $HOME/Library/Caches/Homebrew 23 | - /usr/local/Homebrew 24 | 25 | cache: pip 26 | 27 | before_install: 28 | - | 29 | if [ -n "${cfscrape_node}" ]; then 30 | source ~/.nvm/nvm.sh 31 | nvm install "${cfscrape_node}" 32 | nvm use "${cfscrape_node}" 33 | fi 34 | 35 | install: 36 | - node -p process.versions 37 | - make 38 | 39 | script: make ci 40 | 41 | after_success: make coverage 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Anorov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.md 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | pep8-rules := E501,W503,W504 2 | 3 | init: 4 | pip install pipenv -U 5 | pipenv install --dev 6 | 7 | requirements: 8 | pipenv lock -r > requirements.txt 9 | pipenv lock --dev -r > requirements-dev.txt 10 | 11 | test: 12 | # This runs all of the tests, on both Python 2 and Python 3. 13 | pipenv run tox --parallel auto 14 | 15 | watch: 16 | # This automatically selects and re-executes only tests affected by recent changes. 17 | pipenv run ptw -- --testmon 18 | 19 | retry: 20 | # This will retry failed tests on every file change. 
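	# (-n auto distributes tests across CPU cores via pytest-xdist, --forked runs each test in a forked subprocess, and --looponfail re-runs failing tests whenever a file changes.)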
21 | pipenv run py.test -n auto --forked --looponfail 22 | 23 | ci: 24 | pipenv run py.test tests 25 | 26 | lint: 27 | pipenv run flake8 --ignore $(pep8-rules) cfscrape tests setup.py 28 | 29 | format: 30 | # Automatic reformatting 31 | pipenv run autopep8 -aaa --ignore $(pep8-rules) --in-place --recursive cfscrape tests setup.py 32 | 33 | coverage: 34 | pipenv run py.test --cov-config .coveragerc --verbose --cov-report term --cov-report xml --cov=cfscrape tests 35 | pipenv run coveralls 36 | 37 | publish: 38 | pip install 'twine>=1.5.0' 39 | python setup.py sdist bdist_wheel 40 | twine upload dist/* 41 | rm -fr build dist .egg cfscrape.egg-info 42 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple/" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | requests = "*" 8 | 9 | [dev-packages] 10 | pytest = "*" 11 | # more_itertools is added to resolve a CI related issue with Pipenv + pytest 12 | more_itertools = { version = ">=4.0.0", markers = "python_version >= '2.7'" } 13 | # mock is added to resolve a CI related issue with Pipenv + pytest 14 | mock = "*" 15 | pytest-cov = "*" 16 | pytest-xdist = "*" 17 | pytest-forked = "*" 18 | pytest-testmon = "*" 19 | pytest-watch = "*" 20 | pytest-timeout = "*" 21 | responses = "*" 22 | sure = "*" 23 | "flake8" = "*" 24 | "autopep8" = "*" 25 | tox = "*" 26 | coverage = "*" 27 | coveralls = "*" 28 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "a998a4d3507e5aa42cff1a7d37b30bb937d36690dbb22a27cc50ba1b25166e80" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": {}, 8 | "sources": [ 9 | { 10 | "name": "pypi", 11 | "url": "https://pypi.org/simple/", 12 | "verify_ssl": true 13 | } 14 | ] 15 | }, 16 | "default": { 17 | "certifi": { 18 | "hashes": [ 19 | "sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5", 20 | "sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae" 21 | ], 22 | "version": "==2019.3.9" 23 | }, 24 | "chardet": { 25 | "hashes": [ 26 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 27 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 28 | ], 29 | "version": "==3.0.4" 30 | }, 31 | "idna": { 32 | "hashes": [ 33 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 34 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 35 | ], 36 | "version": "==2.8" 37 | }, 38 | "requests": { 39 | "hashes": [ 40 | "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", 41 | "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" 42 | ], 43 | "index": "pypi", 44 | "version": "==2.21.0" 45 | }, 46 | "urllib3": { 47 | "hashes": [ 48 | "sha256:2393a695cd12afedd0dcb26fe5d50d0cf248e5a66f75dbd89a3d4eb333a61af4", 49 | "sha256:a637e5fae88995b256e3409dc4d52c2e2e0ba32c42a6365fee8bbd2238de3cfb" 50 | ], 51 | "version": "==1.24.3" 52 | } 53 | }, 54 | "develop": { 55 | "apipkg": { 56 | "hashes": [ 57 | "sha256:37228cda29411948b422fae072f57e31d3396d2ee1c9783775980ee9c9990af6", 58 | "sha256:58587dd4dc3daefad0487f6d9ae32b4542b185e1c36db6993290e7c41ca2b47c" 59 | ], 60 | "version": "==1.5" 61 | }, 62 | "argh": { 63 | "hashes": [ 64 | 
"sha256:a9b3aaa1904eeb78e32394cd46c6f37ac0fb4af6dc488daa58971bdc7d7fcaf3", 65 | "sha256:e9535b8c84dc9571a48999094fda7f33e63c3f1b74f3e5f3ac0105a58405bb65" 66 | ], 67 | "version": "==0.26.2" 68 | }, 69 | "asn1crypto": { 70 | "hashes": [ 71 | "sha256:2f1adbb7546ed199e3c90ef23ec95c5cf3585bac7d11fb7eb562a3fe89c64e87", 72 | "sha256:9d5c20441baf0cb60a4ac34cc447c6c189024b6b4c6cd7877034f4965c464e49" 73 | ], 74 | "version": "==0.24.0" 75 | }, 76 | "atomicwrites": { 77 | "hashes": [ 78 | "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", 79 | "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" 80 | ], 81 | "version": "==1.3.0" 82 | }, 83 | "attrs": { 84 | "hashes": [ 85 | "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", 86 | "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" 87 | ], 88 | "version": "==19.1.0" 89 | }, 90 | "autopep8": { 91 | "hashes": [ 92 | "sha256:4d8eec30cc81bc5617dbf1218201d770dc35629363547f17577c61683ccfb3ee" 93 | ], 94 | "index": "pypi", 95 | "version": "==1.4.4" 96 | }, 97 | "certifi": { 98 | "hashes": [ 99 | "sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5", 100 | "sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae" 101 | ], 102 | "version": "==2019.3.9" 103 | }, 104 | "cffi": { 105 | "hashes": [ 106 | "sha256:041c81822e9f84b1d9c401182e174996f0bae9991f33725d059b771744290774", 107 | "sha256:046ef9a22f5d3eed06334d01b1e836977eeef500d9b78e9ef693f9380ad0b83d", 108 | "sha256:066bc4c7895c91812eff46f4b1c285220947d4aa46fa0a2651ff85f2afae9c90", 109 | "sha256:066c7ff148ae33040c01058662d6752fd73fbc8e64787229ea8498c7d7f4041b", 110 | "sha256:2444d0c61f03dcd26dbf7600cf64354376ee579acad77aef459e34efcb438c63", 111 | "sha256:300832850b8f7967e278870c5d51e3819b9aad8f0a2c8dbe39ab11f119237f45", 112 | "sha256:34c77afe85b6b9e967bd8154e3855e847b70ca42043db6ad17f26899a3df1b25", 113 | "sha256:46de5fa00f7ac09f020729148ff632819649b3e05a007d286242c4882f7b1dc3", 114 | "sha256:4aa8ee7ba27c472d429b980c51e714a24f47ca296d53f4d7868075b175866f4b", 115 | "sha256:4d0004eb4351e35ed950c14c11e734182591465a33e960a4ab5e8d4f04d72647", 116 | "sha256:4e3d3f31a1e202b0f5a35ba3bc4eb41e2fc2b11c1eff38b362de710bcffb5016", 117 | "sha256:50bec6d35e6b1aaeb17f7c4e2b9374ebf95a8975d57863546fa83e8d31bdb8c4", 118 | "sha256:55cad9a6df1e2a1d62063f79d0881a414a906a6962bc160ac968cc03ed3efcfb", 119 | "sha256:5662ad4e4e84f1eaa8efce5da695c5d2e229c563f9d5ce5b0113f71321bcf753", 120 | "sha256:59b4dc008f98fc6ee2bb4fd7fc786a8d70000d058c2bbe2698275bc53a8d3fa7", 121 | "sha256:73e1ffefe05e4ccd7bcea61af76f36077b914f92b76f95ccf00b0c1b9186f3f9", 122 | "sha256:a1f0fd46eba2d71ce1589f7e50a9e2ffaeb739fb2c11e8192aa2b45d5f6cc41f", 123 | "sha256:a2e85dc204556657661051ff4bab75a84e968669765c8a2cd425918699c3d0e8", 124 | "sha256:a5457d47dfff24882a21492e5815f891c0ca35fefae8aa742c6c263dac16ef1f", 125 | "sha256:a8dccd61d52a8dae4a825cdbb7735da530179fea472903eb871a5513b5abbfdc", 126 | "sha256:ae61af521ed676cf16ae94f30fe202781a38d7178b6b4ab622e4eec8cefaff42", 127 | "sha256:b012a5edb48288f77a63dba0840c92d0504aa215612da4541b7b42d849bc83a3", 128 | "sha256:d2c5cfa536227f57f97c92ac30c8109688ace8fa4ac086d19d0af47d134e2909", 129 | "sha256:d42b5796e20aacc9d15e66befb7a345454eef794fdb0737d1af593447c6c8f45", 130 | "sha256:dee54f5d30d775f525894d67b1495625dd9322945e7fee00731952e0368ff42d", 131 | "sha256:e070535507bd6aa07124258171be2ee8dfc19119c28ca94c9dfb7efd23564512", 132 | 
"sha256:e1ff2748c84d97b065cc95429814cdba39bcbd77c9c85c89344b317dc0d9cbff", 133 | "sha256:ed851c75d1e0e043cbf5ca9a8e1b13c4c90f3fbd863dacb01c0808e2b5204201" 134 | ], 135 | "version": "==1.12.3" 136 | }, 137 | "chardet": { 138 | "hashes": [ 139 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 140 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 141 | ], 142 | "version": "==3.0.4" 143 | }, 144 | "colorama": { 145 | "hashes": [ 146 | "sha256:05eed71e2e327246ad6b38c540c4a3117230b19679b875190486ddd2d721422d", 147 | "sha256:f8ac84de7840f5b9c4e3347b3c1eaa50f7e49c2b07596221daec5edaabbd7c48" 148 | ], 149 | "version": "==0.4.1" 150 | }, 151 | "configparser": { 152 | "hashes": [ 153 | "sha256:8be81d89d6e7b4c0d4e44bcc525845f6da25821de80cb5e06e7e0238a2899e32", 154 | "sha256:da60d0014fd8c55eb48c1c5354352e363e2d30bbf7057e5e171a468390184c75" 155 | ], 156 | "markers": "python_version == '2.7'", 157 | "version": "==3.7.4" 158 | }, 159 | "cookies": { 160 | "hashes": [ 161 | "sha256:15bee753002dff684987b8df8c235288eb8d45f8191ae056254812dfd42c81d3", 162 | "sha256:d6b698788cae4cfa4e62ef8643a9ca332b79bd96cb314294b864ae8d7eb3ee8e" 163 | ], 164 | "markers": "python_version < '3.4'", 165 | "version": "==2.2.1" 166 | }, 167 | "coverage": { 168 | "hashes": [ 169 | "sha256:3684fabf6b87a369017756b551cef29e505cb155ddb892a7a29277b978da88b9", 170 | "sha256:39e088da9b284f1bd17c750ac672103779f7954ce6125fd4382134ac8d152d74", 171 | "sha256:3c205bc11cc4fcc57b761c2da73b9b72a59f8d5ca89979afb0c1c6f9e53c7390", 172 | "sha256:465ce53a8c0f3a7950dfb836438442f833cf6663d407f37d8c52fe7b6e56d7e8", 173 | "sha256:48020e343fc40f72a442c8a1334284620f81295256a6b6ca6d8aa1350c763bbe", 174 | "sha256:5296fc86ab612ec12394565c500b412a43b328b3907c0d14358950d06fd83baf", 175 | "sha256:5f61bed2f7d9b6a9ab935150a6b23d7f84b8055524e7be7715b6513f3328138e", 176 | "sha256:68a43a9f9f83693ce0414d17e019daee7ab3f7113a70c79a3dd4c2f704e4d741", 177 | "sha256:6b8033d47fe22506856fe450470ccb1d8ba1ffb8463494a15cfc96392a288c09", 178 | "sha256:7ad7536066b28863e5835e8cfeaa794b7fe352d99a8cded9f43d1161be8e9fbd", 179 | "sha256:7bacb89ccf4bedb30b277e96e4cc68cd1369ca6841bde7b005191b54d3dd1034", 180 | "sha256:839dc7c36501254e14331bcb98b27002aa415e4af7ea039d9009409b9d2d5420", 181 | "sha256:8f9a95b66969cdea53ec992ecea5406c5bd99c9221f539bca1e8406b200ae98c", 182 | "sha256:932c03d2d565f75961ba1d3cec41ddde00e162c5b46d03f7423edcb807734eab", 183 | "sha256:988529edadc49039d205e0aa6ce049c5ccda4acb2d6c3c5c550c17e8c02c05ba", 184 | "sha256:998d7e73548fe395eeb294495a04d38942edb66d1fa61eb70418871bc621227e", 185 | "sha256:9de60893fb447d1e797f6bf08fdf0dbcda0c1e34c1b06c92bd3a363c0ea8c609", 186 | "sha256:9e80d45d0c7fcee54e22771db7f1b0b126fb4a6c0a2e5afa72f66827207ff2f2", 187 | "sha256:a545a3dfe5082dc8e8c3eb7f8a2cf4f2870902ff1860bd99b6198cfd1f9d1f49", 188 | "sha256:a5d8f29e5ec661143621a8f4de51adfb300d7a476224156a39a392254f70687b", 189 | "sha256:aca06bfba4759bbdb09bf52ebb15ae20268ee1f6747417837926fae990ebc41d", 190 | "sha256:bb23b7a6fd666e551a3094ab896a57809e010059540ad20acbeec03a154224ce", 191 | "sha256:bfd1d0ae7e292105f29d7deaa9d8f2916ed8553ab9d5f39ec65bcf5deadff3f9", 192 | "sha256:c62ca0a38958f541a73cf86acdab020c2091631c137bd359c4f5bddde7b75fd4", 193 | "sha256:c709d8bda72cf4cd348ccec2a4881f2c5848fd72903c185f363d361b2737f773", 194 | "sha256:c968a6aa7e0b56ecbd28531ddf439c2ec103610d3e2bf3b75b813304f8cb7723", 195 | "sha256:df785d8cb80539d0b55fd47183264b7002077859028dfe3070cf6359bf8b2d9c", 196 | 
"sha256:f406628ca51e0ae90ae76ea8398677a921b36f0bd71aab2099dfed08abd0322f", 197 | "sha256:f46087bbd95ebae244a0eda01a618aff11ec7a069b15a3ef8f6b520db523dcf1", 198 | "sha256:f8019c5279eb32360ca03e9fac40a12667715546eed5c5eb59eb381f2f501260", 199 | "sha256:fc5f4d209733750afd2714e9109816a29500718b32dd9a5db01c0cb3a019b96a" 200 | ], 201 | "index": "pypi", 202 | "version": "==4.5.3" 203 | }, 204 | "coveralls": { 205 | "hashes": [ 206 | "sha256:baa26648430d5c2225ab12d7e2067f75597a4b967034bba7e3d5ab7501d207a1", 207 | "sha256:ff9b7823b15070f26f654837bb02a201d006baaf2083e0514ffd3b34a3ffed81" 208 | ], 209 | "index": "pypi", 210 | "version": "==1.7.0" 211 | }, 212 | "cryptography": { 213 | "hashes": [ 214 | "sha256:066f815f1fe46020877c5983a7e747ae140f517f1b09030ec098503575265ce1", 215 | "sha256:210210d9df0afba9e000636e97810117dc55b7157c903a55716bb73e3ae07705", 216 | "sha256:26c821cbeb683facb966045e2064303029d572a87ee69ca5a1bf54bf55f93ca6", 217 | "sha256:2afb83308dc5c5255149ff7d3fb9964f7c9ee3d59b603ec18ccf5b0a8852e2b1", 218 | "sha256:2db34e5c45988f36f7a08a7ab2b69638994a8923853dec2d4af121f689c66dc8", 219 | "sha256:409c4653e0f719fa78febcb71ac417076ae5e20160aec7270c91d009837b9151", 220 | "sha256:45a4f4cf4f4e6a55c8128f8b76b4c057027b27d4c67e3fe157fa02f27e37830d", 221 | "sha256:48eab46ef38faf1031e58dfcc9c3e71756a1108f4c9c966150b605d4a1a7f659", 222 | "sha256:6b9e0ae298ab20d371fc26e2129fd683cfc0cfde4d157c6341722de645146537", 223 | "sha256:6c4778afe50f413707f604828c1ad1ff81fadf6c110cb669579dea7e2e98a75e", 224 | "sha256:8c33fb99025d353c9520141f8bc989c2134a1f76bac6369cea060812f5b5c2bb", 225 | "sha256:9873a1760a274b620a135054b756f9f218fa61ca030e42df31b409f0fb738b6c", 226 | "sha256:9b069768c627f3f5623b1cbd3248c5e7e92aec62f4c98827059eed7053138cc9", 227 | "sha256:9e4ce27a507e4886efbd3c32d120db5089b906979a4debf1d5939ec01b9dd6c5", 228 | "sha256:acb424eaca214cb08735f1a744eceb97d014de6530c1ea23beb86d9c6f13c2ad", 229 | "sha256:c8181c7d77388fe26ab8418bb088b1a1ef5fde058c6926790c8a0a3d94075a4a", 230 | "sha256:d4afbb0840f489b60f5a580a41a1b9c3622e08ecb5eec8614d4fb4cd914c4460", 231 | "sha256:d9ed28030797c00f4bc43c86bf819266c76a5ea61d006cd4078a93ebf7da6bfd", 232 | "sha256:e603aa7bb52e4e8ed4119a58a03b60323918467ef209e6ff9db3ac382e5cf2c6" 233 | ], 234 | "version": "==2.6.1" 235 | }, 236 | "docopt": { 237 | "hashes": [ 238 | "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491" 239 | ], 240 | "version": "==0.6.2" 241 | }, 242 | "entrypoints": { 243 | "hashes": [ 244 | "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", 245 | "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" 246 | ], 247 | "version": "==0.3" 248 | }, 249 | "enum34": { 250 | "hashes": [ 251 | "sha256:2d81cbbe0e73112bdfe6ef8576f2238f2ba27dd0d55752a776c41d38b7da2850", 252 | "sha256:644837f692e5f550741432dd3f223bbb9852018674981b1664e5dc339387588a", 253 | "sha256:6bd0f6ad48ec2aa117d3d141940d484deccda84d4fcd884f5c3d93c23ecd8c79", 254 | "sha256:8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1" 255 | ], 256 | "markers": "python_version < '3.4'", 257 | "version": "==1.1.6" 258 | }, 259 | "execnet": { 260 | "hashes": [ 261 | "sha256:027ee5d961afa01e97b90d6ccc34b4ed976702bc58e7f092b3c513ea288cb6d2", 262 | "sha256:752a3786f17416d491f833a29217dda3ea4a471fc5269c492eebcee8cc4772d3" 263 | ], 264 | "version": "==1.6.0" 265 | }, 266 | "filelock": { 267 | "hashes": [ 268 | "sha256:b8d5ca5ca1c815e1574aee746650ea7301de63d87935b3463d26368b76e31633", 269 | 
"sha256:d610c1bb404daf85976d7a82eb2ada120f04671007266b708606565dd03b5be6" 270 | ], 271 | "version": "==3.0.10" 272 | }, 273 | "flake8": { 274 | "hashes": [ 275 | "sha256:859996073f341f2670741b51ec1e67a01da142831aa1fdc6242dbf88dffbe661", 276 | "sha256:a796a115208f5c03b18f332f7c11729812c8c3ded6c46319c59b53efd3819da8" 277 | ], 278 | "index": "pypi", 279 | "version": "==3.7.7" 280 | }, 281 | "funcsigs": { 282 | "hashes": [ 283 | "sha256:330cc27ccbf7f1e992e69fef78261dc7c6569012cf397db8d3de0234e6c937ca", 284 | "sha256:a7bb0f2cf3a3fd1ab2732cb49eba4252c2af4240442415b4abce3b87022a8f50" 285 | ], 286 | "markers": "python_version < '3.3'", 287 | "version": "==1.0.2" 288 | }, 289 | "functools32": { 290 | "hashes": [ 291 | "sha256:89d824aa6c358c421a234d7f9ee0bd75933a67c29588ce50aaa3acdf4d403fa0", 292 | "sha256:f6253dfbe0538ad2e387bd8fdfd9293c925d63553f5813c4e587745416501e6d" 293 | ], 294 | "markers": "python_version < '3.2'", 295 | "version": "==3.2.3.post2" 296 | }, 297 | "idna": { 298 | "hashes": [ 299 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 300 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 301 | ], 302 | "version": "==2.8" 303 | }, 304 | "ipaddress": { 305 | "hashes": [ 306 | "sha256:64b28eec5e78e7510698f6d4da08800a5c575caa4a286c93d651c5d3ff7b6794", 307 | "sha256:b146c751ea45cad6188dd6cf2d9b757f6f4f8d6ffb96a023e6f2e26eea02a72c" 308 | ], 309 | "version": "==1.0.22" 310 | }, 311 | "mccabe": { 312 | "hashes": [ 313 | "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", 314 | "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" 315 | ], 316 | "version": "==0.6.1" 317 | }, 318 | "mock": { 319 | "hashes": [ 320 | "sha256:83657d894c90d5681d62155c82bda9c1187827525880eda8ff5df4ec813437c3", 321 | "sha256:d157e52d4e5b938c550f39eb2fd15610db062441a9c2747d3dbfa9298211d0f8" 322 | ], 323 | "markers": "python_version < '3.3'", 324 | "version": "==3.0.5" 325 | }, 326 | "more-itertools": { 327 | "hashes": [ 328 | "sha256:38a936c0a6d98a38bcc2d03fdaaedaba9f412879461dd2ceff8d37564d6522e4", 329 | "sha256:c0a5785b1109a6bd7fac76d6837fd1feca158e54e521ccd2ae8bfe393cc9d4fc", 330 | "sha256:fe7a7cae1ccb57d33952113ff4fa1bc5f879963600ed74918f1236e212ee50b9" 331 | ], 332 | "index": "pypi", 333 | "markers": "python_version >= '2.7'", 334 | "version": "==5.0.0" 335 | }, 336 | "pathlib2": { 337 | "hashes": [ 338 | "sha256:25199318e8cc3c25dcb45cbe084cc061051336d5a9ea2a12448d3d8cb748f742", 339 | "sha256:5887121d7f7df3603bca2f710e7219f3eca0eb69e0b7cc6e0a022e155ac931a7" 340 | ], 341 | "markers": "python_version < '3.6'", 342 | "version": "==2.3.3" 343 | }, 344 | "pathtools": { 345 | "hashes": [ 346 | "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0" 347 | ], 348 | "version": "==0.1.2" 349 | }, 350 | "pluggy": { 351 | "hashes": [ 352 | "sha256:25a1bc1d148c9a640211872b4ff859878d422bccb59c9965e04eed468a0aa180", 353 | "sha256:964cedd2b27c492fbf0b7f58b3284a09cf7f99b0f715941fb24a439b3af1bd1a" 354 | ], 355 | "version": "==0.11.0" 356 | }, 357 | "py": { 358 | "hashes": [ 359 | "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", 360 | "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" 361 | ], 362 | "version": "==1.8.0" 363 | }, 364 | "pycodestyle": { 365 | "hashes": [ 366 | "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", 367 | "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c" 368 | ], 369 | "version": "==2.5.0" 370 | }, 
371 | "pycparser": { 372 | "hashes": [ 373 | "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" 374 | ], 375 | "version": "==2.19" 376 | }, 377 | "pyflakes": { 378 | "hashes": [ 379 | "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0", 380 | "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2" 381 | ], 382 | "version": "==2.1.1" 383 | }, 384 | "pyopenssl": { 385 | "hashes": [ 386 | "sha256:aeca66338f6de19d1aa46ed634c3b9ae519a64b458f8468aec688e7e3c20f200", 387 | "sha256:c727930ad54b10fc157015014b666f2d8b41f70c0d03e83ab67624fd3dd5d1e6" 388 | ], 389 | "version": "==19.0.0" 390 | }, 391 | "pytest": { 392 | "hashes": [ 393 | "sha256:1a8aa4fa958f8f451ac5441f3ac130d9fc86ea38780dd2715e6d5c5882700b24", 394 | "sha256:b8bf138592384bd4e87338cb0f256bf5f615398a649d4bd83915f0e4047a5ca6" 395 | ], 396 | "index": "pypi", 397 | "version": "==4.5.0" 398 | }, 399 | "pytest-cov": { 400 | "hashes": [ 401 | "sha256:2b097cde81a302e1047331b48cadacf23577e431b61e9c6f49a1170bbe3d3da6", 402 | "sha256:e00ea4fdde970725482f1f35630d12f074e121a23801aabf2ae154ec6bdd343a" 403 | ], 404 | "index": "pypi", 405 | "version": "==2.7.1" 406 | }, 407 | "pytest-forked": { 408 | "hashes": [ 409 | "sha256:5fe33fbd07d7b1302c95310803a5e5726a4ff7f19d5a542b7ce57c76fed8135f", 410 | "sha256:d352aaced2ebd54d42a65825722cb433004b4446ab5d2044851d9cc7a00c9e38" 411 | ], 412 | "index": "pypi", 413 | "version": "==1.0.2" 414 | }, 415 | "pytest-testmon": { 416 | "hashes": [ 417 | "sha256:df00594e55f8f8f826e0e345dc23863ebac066eb749f8229c515a0373669c5bb" 418 | ], 419 | "index": "pypi", 420 | "version": "==0.9.16" 421 | }, 422 | "pytest-timeout": { 423 | "hashes": [ 424 | "sha256:4a30ba76837a32c7b7cd5c84ee9933fde4b9022b0cd20ea7d4a577c2a1649fb1", 425 | "sha256:d49f618c6448c14168773b6cdda022764c63ea80d42274e3156787e8088d04c6" 426 | ], 427 | "index": "pypi", 428 | "version": "==1.3.3" 429 | }, 430 | "pytest-watch": { 431 | "hashes": [ 432 | "sha256:06136f03d5b361718b8d0d234042f7b2f203910d8568f63df2f866b547b3d4b9" 433 | ], 434 | "index": "pypi", 435 | "version": "==4.2.0" 436 | }, 437 | "pytest-xdist": { 438 | "hashes": [ 439 | "sha256:b0bb4b0293ee8657b9eb3ff334a3b6aac4db74fd4a86b81e1982c879237a47eb", 440 | "sha256:f83a485293e81fd57c8a5a85a3f12473a532c5ca7dec518857cbb72766bb526c" 441 | ], 442 | "index": "pypi", 443 | "version": "==1.28.0" 444 | }, 445 | "pyyaml": { 446 | "hashes": [ 447 | "sha256:1adecc22f88d38052fb787d959f003811ca858b799590a5eaa70e63dca50308c", 448 | "sha256:436bc774ecf7c103814098159fbb84c2715d25980175292c648f2da143909f95", 449 | "sha256:460a5a4248763f6f37ea225d19d5c205677d8d525f6a83357ca622ed541830c2", 450 | "sha256:5a22a9c84653debfbf198d02fe592c176ea548cccce47553f35f466e15cf2fd4", 451 | "sha256:7a5d3f26b89d688db27822343dfa25c599627bc92093e788956372285c6298ad", 452 | "sha256:9372b04a02080752d9e6f990179a4ab840227c6e2ce15b95e1278456664cf2ba", 453 | "sha256:a5dcbebee834eaddf3fa7366316b880ff4062e4bcc9787b78c7fbb4a26ff2dd1", 454 | "sha256:aee5bab92a176e7cd034e57f46e9df9a9862a71f8f37cad167c6fc74c65f5b4e", 455 | "sha256:c51f642898c0bacd335fc119da60baae0824f2cde95b0330b56c0553439f0673", 456 | "sha256:c68ea4d3ba1705da1e0d85da6684ac657912679a649e8868bd850d2c299cce13", 457 | "sha256:e23d0cc5299223dcc37885dae624f382297717e459ea24053709675a976a3e19" 458 | ], 459 | "version": "==5.1" 460 | }, 461 | "requests": { 462 | "hashes": [ 463 | "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", 464 | "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" 
465 | ], 466 | "index": "pypi", 467 | "version": "==2.21.0" 468 | }, 469 | "responses": { 470 | "hashes": [ 471 | "sha256:502d9c0c8008439cfcdef7e251f507fcfdd503b56e8c0c87c3c3e3393953f790", 472 | "sha256:97193c0183d63fba8cd3a041c75464e4b09ea0aff6328800d1546598567dde0b" 473 | ], 474 | "index": "pypi", 475 | "version": "==0.10.6" 476 | }, 477 | "scandir": { 478 | "hashes": [ 479 | "sha256:2586c94e907d99617887daed6c1d102b5ca28f1085f90446554abf1faf73123e", 480 | "sha256:2ae41f43797ca0c11591c0c35f2f5875fa99f8797cb1a1fd440497ec0ae4b022", 481 | "sha256:2b8e3888b11abb2217a32af0766bc06b65cc4a928d8727828ee68af5a967fa6f", 482 | "sha256:2c712840c2e2ee8dfaf36034080108d30060d759c7b73a01a52251cc8989f11f", 483 | "sha256:4d4631f6062e658e9007ab3149a9b914f3548cb38bfb021c64f39a025ce578ae", 484 | "sha256:67f15b6f83e6507fdc6fca22fedf6ef8b334b399ca27c6b568cbfaa82a364173", 485 | "sha256:7d2d7a06a252764061a020407b997dd036f7bd6a175a5ba2b345f0a357f0b3f4", 486 | "sha256:8c5922863e44ffc00c5c693190648daa6d15e7c1207ed02d6f46a8dcc2869d32", 487 | "sha256:92c85ac42f41ffdc35b6da57ed991575bdbe69db895507af88b9f499b701c188", 488 | "sha256:b24086f2375c4a094a6b51e78b4cf7ca16c721dcee2eddd7aa6494b42d6d519d", 489 | "sha256:cb925555f43060a1745d0a321cca94bcea927c50114b623d73179189a4e100ac" 490 | ], 491 | "markers": "python_version < '3.5'", 492 | "version": "==1.10.0" 493 | }, 494 | "six": { 495 | "hashes": [ 496 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 497 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 498 | ], 499 | "version": "==1.12.0" 500 | }, 501 | "sure": { 502 | "hashes": [ 503 | "sha256:3c8d5271fb18e2c69e2613af1ad400d8df090f1456081635bd3171847303cdaa" 504 | ], 505 | "index": "pypi", 506 | "version": "==1.4.11" 507 | }, 508 | "toml": { 509 | "hashes": [ 510 | "sha256:229f81c57791a41d65e399fc06bf0848bab550a9dfd5ed66df18ce5f05e73d5c", 511 | "sha256:235682dd292d5899d361a811df37e04a8828a5b1da3115886b73cf81ebc9100e" 512 | ], 513 | "version": "==0.10.0" 514 | }, 515 | "tox": { 516 | "hashes": [ 517 | "sha256:5358eae59e3bdba5d9bd0794331854e36250e62fa3cf7d4059ae730bfe5ca432", 518 | "sha256:b7f8eb013c1c5fd758b272d4af97eeba168e6ea3fb604004582fce5aef0771c2" 519 | ], 520 | "index": "pypi", 521 | "version": "==3.11.0" 522 | }, 523 | "typing": { 524 | "hashes": [ 525 | "sha256:4027c5f6127a6267a435201981ba156de91ad0d1d98e9ddc2aa173453453492d", 526 | "sha256:57dcf675a99b74d64dacf6fba08fb17cf7e3d5fdff53d4a30ea2a5e7e52543d4", 527 | "sha256:a4c8473ce11a65999c8f59cb093e70686b6c84c98df58c1dae9b3b196089858a" 528 | ], 529 | "markers": "python_version < '3.5'", 530 | "version": "==3.6.6" 531 | }, 532 | "urllib3": { 533 | "hashes": [ 534 | "sha256:2393a695cd12afedd0dcb26fe5d50d0cf248e5a66f75dbd89a3d4eb333a61af4", 535 | "sha256:a637e5fae88995b256e3409dc4d52c2e2e0ba32c42a6365fee8bbd2238de3cfb" 536 | ], 537 | "version": "==1.24.3" 538 | }, 539 | "virtualenv": { 540 | "hashes": [ 541 | "sha256:99acaf1e35c7ccf9763db9ba2accbca2f4254d61d1912c5ee364f9cc4a8942a0", 542 | "sha256:fe51cdbf04e5d8152af06c075404745a7419de27495a83f0d72518ad50be3ce8" 543 | ], 544 | "version": "==16.6.0" 545 | }, 546 | "watchdog": { 547 | "hashes": [ 548 | "sha256:965f658d0732de3188211932aeb0bb457587f04f63ab4c1e33eab878e9de961d" 549 | ], 550 | "version": "==0.9.0" 551 | }, 552 | "wcwidth": { 553 | "hashes": [ 554 | "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", 555 | "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" 556 | ], 557 | "version": "==0.1.7" 558 | } 559 | } 560 
| }
561 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | cloudflare-scrape
 2 | =================
 3 | 
 4 | A simple Python module to bypass Cloudflare's anti-bot page (also known as "I'm Under Attack Mode", or IUAM), implemented with [Requests](https://github.com/kennethreitz/requests). Python versions 2.6 - 3.7 are supported. Cloudflare changes their techniques periodically, so I will update this repo frequently.
 5 | 
 6 | This can be useful if you wish to scrape or crawl a website protected with Cloudflare. Cloudflare's anti-bot page currently just checks if the client supports JavaScript, though they may add additional techniques in the future.
 7 | 
 8 | Due to Cloudflare continually changing and hardening their protection page, cloudflare-scrape requires Node.js to solve JavaScript challenges. This allows the script to easily impersonate a regular web browser without explicitly deobfuscating and parsing Cloudflare's JavaScript.
 9 | 
10 | Note: This only works when the regular Cloudflare anti-bot page is enabled (the "Checking your browser before accessing..." loading page). If there is a reCAPTCHA challenge, you're out of luck. Thankfully, the JavaScript check page is much more common.
11 | 
12 | For reference, this is the default message Cloudflare uses for these sorts of pages:
13 | 
14 |     Checking your browser before accessing website.com.
15 | 
16 |     This process is automatic. Your browser will redirect to your requested content shortly.
17 | 
18 |     Please allow up to 5 seconds...
19 | 
20 | Any script using cloudflare-scrape will sleep for 5 seconds for the first visit to any site with the Cloudflare anti-bot page enabled, though no delay will occur after the first request.
21 | 
22 | Installation
23 | ============
24 | 
25 | Simply run `pip install cfscrape`. You can upgrade with `pip install -U cfscrape`. The PyPI package is at https://pypi.python.org/pypi/cfscrape/
26 | 
27 | Alternatively, clone this repository and run `python setup.py install`.
28 | 
29 | Node.js dependency
30 | ==================
31 | 
32 | [Node.js](https://nodejs.org/) version 10 or above is required to interpret Cloudflare's obfuscated JavaScript challenge.
33 | 
34 | Your machine may already have Node installed (check with `node -v`). If not, you can install it with `apt-get install nodejs` on Ubuntu >= 18.04 and Debian >= 9, or with `brew install node` on macOS. Otherwise, you can get it from [Node's download page](https://nodejs.org/en/download/) or [their package manager installation page](https://nodejs.org/en/download/package-manager/).
35 | 
36 | 
37 | Updates
38 | =======
39 | 
40 | Cloudflare regularly modifies their anti-bot protection page and improves their bot detection capabilities.
41 | 
42 | If you notice that the anti-bot page has changed, or if this module suddenly stops working, please create a GitHub issue so that I can update the code accordingly.
43 | 
44 | * Many issues are a result of users not updating to the latest release of this project. Before filing an issue, please run the following command to update cloudflare-scrape to the latest version:
45 | 
46 | ```
47 | pip install -U cfscrape
48 | ```
49 | 
50 | If you are still encountering a problem, create a GitHub issue and please include:
51 | 
52 | * The version number from `pip show cfscrape`.
53 | * The relevant code snippet that's experiencing an issue or raising an exception.
54 | * The full exception and traceback, if applicable.
55 | * The URL of the Cloudflare-protected page which the script does not work on.
56 | * A Pastebin or Gist containing the HTML source of the protected page.
57 | 
58 | 
59 | If you've upgraded and are still experiencing problems, **[click here to create a GitHub issue and fill out the pertinent information](https://github.com/Anorov/cloudflare-scrape/issues/new?assignees=&labels=bug&template=bug-report-template.md&title=)**.
60 | 
61 | Usage
62 | =====
63 | 
64 | The simplest way to use cloudflare-scrape is by calling `create_scraper()`.
65 | 
66 | ```python
67 | import cfscrape
68 | 
69 | scraper = cfscrape.create_scraper()  # returns a CloudflareScraper instance
70 | # Or: scraper = cfscrape.CloudflareScraper()  # CloudflareScraper inherits from requests.Session
71 | print(scraper.get("http://somesite.com").content)  # => "..."
72 | ```
73 | 
74 | That's it. Any requests made from this session object to websites protected by Cloudflare anti-bot will be handled automatically. Websites not using Cloudflare will be treated normally. You don't need to configure or call anything further, and you can effectively treat all websites as if they're not protected with anything.
75 | 
76 | You use cloudflare-scrape exactly the same way you use Requests. (`CloudflareScraper` works identically to a Requests `Session` object.) Just instead of calling `requests.get()` or `requests.post()`, you call `scraper.get()` or `scraper.post()`. Consult [Requests' documentation](http://docs.python-requests.org/en/latest/user/quickstart/) for more information.
77 | 
78 | ## Options
79 | 
80 | ### Existing session
81 | 
82 | If you already have an existing Requests session, you can pass it to `create_scraper()` to continue using that session.
83 | 
84 | ```python
85 | session = requests.session()
86 | session.headers = ...
87 | scraper = cfscrape.create_scraper(sess=session)
88 | ```
89 | 
90 | Unfortunately, not all of Requests' session attributes are easily transferable, so if you run into problems with this, you should replace your initial `session = requests.session()` call with `session = cfscrape.create_scraper()`.
91 | 
92 | ### Delays
93 | 
94 | Normally, when a browser is faced with a Cloudflare IUAM challenge page, Cloudflare requires the browser to wait 5 seconds before submitting the challenge answer. If a website is under heavy load, sometimes this may fail. One solution is to increase the delay (perhaps to 10 or 15 seconds, depending on the website). If you would like to override this delay, pass the `delay` keyword argument to `create_scraper()` or `CloudflareScraper()`.
95 | 
96 | There is no need to override this delay unless cloudflare-scrape generates an error recommending you increase the delay.
97 | 
98 | ```python
99 | scraper = cfscrape.create_scraper(delay=10)
100 | ```
101 | 
102 | ## Integration
103 | 
104 | It's easy to integrate cloudflare-scrape with other applications and tools. Cloudflare uses two cookies as tokens: one to verify you made it past their challenge page and one to track your session. To bypass the challenge page, simply include both of these cookies (with the appropriate user-agent) in all HTTP requests you make.
105 | 
106 | To retrieve just the cookies (as a dictionary), use `cfscrape.get_tokens()`. To retrieve them as a full `Cookie` HTTP header, use `cfscrape.get_cookie_string()`.
107 | 
108 | `get_tokens` and `get_cookie_string` both accept Requests' usual keyword arguments (like `get_tokens(url, proxies={"http": "socks5://localhost:9050"})`).
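For illustration, here is a minimal sketch of feeding those tokens into a plain Requests session (the URL is a placeholder; the cookies and the matching user-agent are the two pieces Cloudflare checks):

```python
import requests
import cfscrape

url = "http://somesite.com/"  # placeholder for any Cloudflare-protected site
tokens, user_agent = cfscrape.get_tokens(url)

session = requests.Session()
session.headers["User-Agent"] = user_agent  # must match the UA the tokens were obtained with
session.cookies.update(tokens)

print(session.get(url).status_code)
```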
Please read [Requests' documentation on request arguments](http://docs.python-requests.org/en/master/api/#requests.Session.request) for more information.
109 | 
110 | *User-Agent Handling*
111 | 
112 | The two integration functions return a tuple of `(cookie, user_agent_string)`. **You must use the same user-agent string for obtaining tokens and for making requests with those tokens, otherwise Cloudflare will flag you as a bot.** That means you have to pass the returned `user_agent_string` to whatever script, tool, or service you are passing the tokens to (e.g. curl, or a specialized scraping tool), and it must use that passed user-agent when it makes HTTP requests.
113 | 
114 | If your tool already has a particular user-agent configured, you can make cloudflare-scrape use it with `cfscrape.get_tokens("http://somesite.com/", user_agent="User-Agent Here")` (also works for `get_cookie_string`). Otherwise, a randomly selected user-agent will be used.
115 | 
116 | --------------------------------------------------------------------------------
117 | 
118 | ### Integration examples
119 | 
120 | Remember, you must always use the same user-agent when retrieving or using these cookies. These functions all return a tuple of `(cookie_dict, user_agent_string)`.
121 | 
122 | **Retrieving a cookie dict through a proxy**
123 | 
124 | `get_tokens` is a convenience function for returning a Python dict containing Cloudflare's session cookies. For demonstration, we will configure this request to use a proxy. (Please note that if you request Cloudflare clearance tokens through a proxy, you must always use the same proxy when those tokens are passed to the server. Cloudflare requires that the challenge-solving IP and the visitor IP stay the same.)
125 | 
126 | If you do not wish to use a proxy, just don't pass the `proxies` keyword argument. These convenience functions support all of Requests' normal keyword arguments, like `params`, `data`, and `headers`.
127 | 
128 | ```python
129 | import cfscrape
130 | 
131 | proxies = {"http": "http://localhost:8080", "https": "http://localhost:8080"}
132 | tokens, user_agent = cfscrape.get_tokens("http://somesite.com", proxies=proxies)
133 | print(tokens)
134 | # => {'cf_clearance': 'c8f913c707b818b47aa328d81cab57c349b1eee5-1426733163-3600', '__cfduid': 'dd8ec03dfdbcb8c2ea63e920f1335c1001426733158'}
135 | ```
136 | 
137 | **Retrieving a cookie string**
138 | 
139 | `get_cookie_string` is a convenience function for returning the tokens as a string for use as a `Cookie` HTTP header value.
140 | 
141 | This is useful when crafting an HTTP request manually, or working with an external application or library that passes on raw cookie headers.
142 | 
143 | ```python
144 | import cfscrape
145 | request = "GET / HTTP/1.1\r\n"
146 | 
147 | cookie_value, user_agent = cfscrape.get_cookie_string("http://somesite.com")
148 | request += "Cookie: %s\r\nUser-Agent: %s\r\n" % (cookie_value, user_agent)
149 | 
150 | print(request)
151 | 
152 | # GET / HTTP/1.1\r\n
153 | # Cookie: cf_clearance=c8f913c707b818b47aa328d81cab57c349b1eee5-1426733163-3600; __cfduid=dd8ec03dfdbcb8c2ea63e920f1335c1001426733158
154 | # User-Agent: Some/User-Agent String
155 | ```
156 | 
157 | **curl example**
158 | 
159 | Here is an example of integrating cloudflare-scrape with curl. As you can see, all you have to do is pass the cookies and user-agent to curl.
160 | 161 | ```python 162 | import subprocess 163 | import cfscrape 164 | 165 | # With get_tokens() cookie dict: 166 | 167 | # tokens, user_agent = cfscrape.get_tokens("http://somesite.com") 168 | # cookie_arg = "cf_clearance=%s; __cfduid=%s" % (tokens["cf_clearance"], tokens["__cfduid"]) 169 | 170 | # With get_cookie_string() cookie header; recommended for curl and similar external applications: 171 | 172 | cookie_arg, user_agent = cfscrape.get_cookie_string("http://somesite.com") 173 | 174 | # With a custom user-agent string you can optionally provide: 175 | 176 | # ua = "Scraping Bot" 177 | # cookie_arg, user_agent = cfscrape.get_cookie_string("http://somesite.com", user_agent=ua) 178 | 179 | result = subprocess.check_output(["curl", "--cookie", cookie_arg, "-A", user_agent, "http://somesite.com"]) 180 | ``` 181 | 182 | Trimmed down version. Prints page contents of any site protected with Cloudflare, via curl. (Warning: `shell=True` can be dangerous to use with `subprocess` in real code.) 183 | 184 | ```python 185 | url = "http://somesite.com" 186 | cookie_arg, user_agent = cfscrape.get_cookie_string(url) 187 | cmd = "curl --cookie {cookie_arg} -A {user_agent} {url}" 188 | print(subprocess.check_output(cmd.format(cookie_arg=cookie_arg, user_agent=user_agent, url=url), shell=True)) 189 | ``` 190 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | os: Visual Studio 2015 2 | 3 | cache: 4 | - '%LOCALAPPDATA%\pip\Cache' 5 | 6 | environment: 7 | matrix: 8 | - PYTHON: 'C:/Python27' 9 | - PYTHON: 'C:/Python27-x64' 10 | - PYTHON: 'C:/Python34' 11 | - PYTHON: 'C:/Python34-x64' 12 | - PYTHON: 'C:/Python35' 13 | - PYTHON: 'C:/Python35-x64' 14 | - PYTHON: 'C:/Python36' 15 | - PYTHON: 'C:/Python36-x64' 16 | - PYTHON: 'C:/Python37' 17 | - PYTHON: 'C:/Python37-x64' 18 | cfscrape_node: '4.5' 19 | - PYTHON: 'C:/Python37-x64' 20 | cfscrape_node: 'node' 21 | 22 | install: 23 | - ps: >- 24 | If ($env:cfscrape_node -ne $null) { 25 | If ($env:cfscrape_node -Match "node") { 26 | Install-Product node "" 27 | } 28 | Else { 29 | Install-Product node $env:cfscrape_node 30 | } 31 | } 32 | - 'set PATH=%PYTHON%;%PYTHON%/Scripts;%PATH%' 33 | - 'python --version' 34 | - 'node -p process.versions' 35 | - 'pip -V' 36 | - 'pip install pipenv' 37 | - 'pipenv install --dev' 38 | 39 | # Not a C# project, build stuff at the install step instead. 
40 | build: false 41 | 42 | test_script: 'pipenv run pytest tests' 43 | -------------------------------------------------------------------------------- /cfscrape/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import random 5 | import re 6 | import ssl 7 | import subprocess 8 | import copy 9 | import time 10 | import os 11 | from base64 import b64encode 12 | from collections import OrderedDict 13 | 14 | from requests.sessions import Session 15 | from requests.adapters import HTTPAdapter 16 | from requests.compat import urlparse, urlunparse 17 | from requests.exceptions import RequestException 18 | 19 | from urllib3.util.ssl_ import create_urllib3_context, DEFAULT_CIPHERS 20 | 21 | from .user_agents import USER_AGENTS 22 | 23 | __version__ = "2.1.1" 24 | 25 | DEFAULT_USER_AGENT = random.choice(USER_AGENTS) 26 | 27 | DEFAULT_HEADERS = OrderedDict( 28 | ( 29 | ("Host", None), 30 | ("Connection", "keep-alive"), 31 | ("Upgrade-Insecure-Requests", "1"), 32 | ("User-Agent", DEFAULT_USER_AGENT), 33 | ( 34 | "Accept", 35 | "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 36 | ), 37 | ("Accept-Language", "en-US,en;q=0.9"), 38 | ("Accept-Encoding", "gzip, deflate"), 39 | ) 40 | ) 41 | 42 | BUG_REPORT = """\ 43 | Cloudflare may have changed their technique, or there may be a bug in the script. 44 | 45 | Please read https://github.com/Anorov/cloudflare-scrape#updates, then file a \ 46 | bug report at https://github.com/Anorov/cloudflare-scrape/issues."\ 47 | """ 48 | 49 | ANSWER_ACCEPT_ERROR = """\ 50 | The challenge answer was not properly accepted by Cloudflare. This can occur if \ 51 | the target website is under heavy load, or if Cloudflare is experiencing issues. You can 52 | potentially resolve this by increasing the challenge answer delay (default: 8 seconds). 
\ 53 | For example: cfscrape.create_scraper(delay=15) 54 | 55 | If increasing the delay does not help, please open a GitHub issue at \ 56 | https://github.com/Anorov/cloudflare-scrape/issues\ 57 | """ 58 | 59 | # Remove a few problematic TLSv1.0 ciphers from the defaults 60 | DEFAULT_CIPHERS += ":!ECDHE+SHA:!AES128-SHA:!AESCCM:!DHE:!ARIA" 61 | 62 | 63 | class CloudflareAdapter(HTTPAdapter): 64 | """ HTTPS adapter that creates a SSL context with custom ciphers """ 65 | 66 | def get_connection(self, *args, **kwargs): 67 | conn = super(CloudflareAdapter, self).get_connection(*args, **kwargs) 68 | 69 | if conn.conn_kw.get("ssl_context"): 70 | conn.conn_kw["ssl_context"].set_ciphers(DEFAULT_CIPHERS) 71 | else: 72 | context = create_urllib3_context(ciphers=DEFAULT_CIPHERS) 73 | conn.conn_kw["ssl_context"] = context 74 | 75 | return conn 76 | 77 | 78 | class CloudflareError(RequestException): 79 | pass 80 | 81 | 82 | class CloudflareCaptchaError(CloudflareError): 83 | pass 84 | 85 | 86 | class CloudflareScraper(Session): 87 | def __init__(self, *args, **kwargs): 88 | self.delay = kwargs.pop("delay", None) 89 | # Use headers with a random User-Agent if no custom headers have been set 90 | headers = OrderedDict(kwargs.pop("headers", DEFAULT_HEADERS)) 91 | 92 | # Set the User-Agent header if it was not provided 93 | headers.setdefault("User-Agent", DEFAULT_USER_AGENT) 94 | 95 | super(CloudflareScraper, self).__init__(*args, **kwargs) 96 | 97 | # Define headers to force using an OrderedDict and preserve header order 98 | self.headers = headers 99 | self.org_method = None 100 | 101 | self.mount("https://", CloudflareAdapter()) 102 | 103 | @staticmethod 104 | def is_cloudflare_iuam_challenge(resp): 105 | return ( 106 | resp.status_code in (503, 429) 107 | and resp.headers.get("Server", "").startswith("cloudflare") 108 | and b"jschl_vc" in resp.content 109 | and b"jschl_answer" in resp.content 110 | ) 111 | 112 | @staticmethod 113 | def is_cloudflare_captcha_challenge(resp): 114 | return ( 115 | resp.status_code == 403 116 | and resp.headers.get("Server", "").startswith("cloudflare") 117 | and b"/cdn-cgi/l/chk_captcha" in resp.content 118 | ) 119 | 120 | def request(self, method, url, *args, **kwargs): 121 | resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs) 122 | 123 | # Check if Cloudflare captcha challenge is presented 124 | if self.is_cloudflare_captcha_challenge(resp): 125 | self.handle_captcha_challenge(resp, url) 126 | 127 | # Check if Cloudflare anti-bot "I'm Under Attack Mode" is enabled 128 | if self.is_cloudflare_iuam_challenge(resp): 129 | resp = self.solve_cf_challenge(resp, **kwargs) 130 | 131 | return resp 132 | 133 | def cloudflare_is_bypassed(self, url, resp=None): 134 | cookie_domain = ".{}".format(urlparse(url).netloc) 135 | return ( 136 | self.cookies.get("cf_clearance", None, domain=cookie_domain) or 137 | (resp and resp.cookies.get("cf_clearance", None, domain=cookie_domain)) 138 | ) 139 | 140 | def handle_captcha_challenge(self, resp, url): 141 | error = ( 142 | "Cloudflare captcha challenge presented for %s (cfscrape cannot solve captchas)" 143 | % urlparse(url).netloc 144 | ) 145 | if ssl.OPENSSL_VERSION_NUMBER < 0x10101000: 146 | error += ". Your OpenSSL version is lower than 1.1.1. Please upgrade your OpenSSL library and recompile Python." 
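            # (OpenSSL 1.1.1 is the first release to support TLS 1.3; clients on older TLS stacks appear more likely to be served a captcha, hence the upgrade hint above.)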
147 | 148 | raise CloudflareCaptchaError(error, response=resp) 149 | 150 | def solve_cf_challenge(self, resp, **original_kwargs): 151 | start_time = time.time() 152 | 153 | body = resp.text 154 | parsed_url = urlparse(resp.url) 155 | domain = parsed_url.netloc 156 | challenge_form = re.search(r'\',body, flags=re.S).group(0) # find challenge form 157 | method = re.search(r'method=\"(.*?)\"', challenge_form, flags=re.S).group(1) 158 | if self.org_method is None: 159 | self.org_method = resp.request.method 160 | submit_url = "%s://%s%s" % (parsed_url.scheme, 161 | domain, 162 | re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')[0]) 163 | 164 | cloudflare_kwargs = copy.deepcopy(original_kwargs) 165 | 166 | headers = cloudflare_kwargs.setdefault("headers", {}) 167 | headers["Referer"] = resp.url 168 | 169 | try: 170 | cloudflare_kwargs["params"] = dict() 171 | cloudflare_kwargs["data"] = dict() 172 | if len(re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')) != 1: 173 | for param in re.search(r'action=\"(.*?)\"', challenge_form, flags=re.S).group(1).split('?')[1].split('&'): 174 | cloudflare_kwargs["params"].update({param.split('=')[0]:param.split('=')[1]}) 175 | 176 | for input_ in re.findall(r'\|\<\/input\>)', challenge_form, flags=re.S): 177 | if re.search(r'name=\"(.*?)\"',input_, flags=re.S).group(1) != 'jschl_answer': 178 | if method == 'POST': 179 | cloudflare_kwargs["data"].update({re.search(r'name=\"(.*?)\"',input_, flags=re.S).group(1): 180 | re.search(r'value=\"(.*?)\"',input_, flags=re.S).group(1)}) 181 | elif method == 'GET': 182 | cloudflare_kwargs["params"].update({re.search(r'name=\"(.*?)\"',input_, flags=re.S).group(1): 183 | re.search(r'value=\"(.*?)\"',input_, flags=re.S).group(1)}) 184 | if method == 'POST': 185 | for k in ("jschl_vc", "pass"): 186 | if k not in cloudflare_kwargs["data"]: 187 | raise ValueError("%s is missing from challenge form" % k) 188 | elif method == 'GET': 189 | for k in ("jschl_vc", "pass"): 190 | if k not in cloudflare_kwargs["params"]: 191 | raise ValueError("%s is missing from challenge form" % k) 192 | 193 | except Exception as e: 194 | # Something is wrong with the page. 195 | # This may indicate Cloudflare has changed their anti-bot 196 | # technique. If you see this and are running the latest version, 197 | # please open a GitHub issue so I can update the code accordingly. 198 | raise ValueError( 199 | "Unable to parse Cloudflare anti-bot IUAM page: %s %s" 200 | % (e, BUG_REPORT) 201 | ) 202 | 203 | # Solve the Javascript challenge 204 | answer, delay = self.solve_challenge(body, domain) 205 | if method == 'POST': 206 | cloudflare_kwargs["data"]["jschl_answer"] = answer 207 | elif method == 'GET': 208 | cloudflare_kwargs["params"]["jschl_answer"] = answer 209 | 210 | # Requests transforms any request into a GET after a redirect, 211 | # so the redirect has to be handled manually here to allow for 212 | # performing other types of requests even as the first request. 
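        # (For example, a POST that triggered the challenge must be re-sent as a POST once the challenge is solved; self.org_method preserves that original verb.)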
213 | cloudflare_kwargs["allow_redirects"] = False 214 | 215 | # Cloudflare requires a delay before solving the challenge 216 | time.sleep(max(delay - (time.time() - start_time), 0)) 217 | 218 | # Send the challenge response and handle the redirect manually 219 | redirect = self.request(method, submit_url, **cloudflare_kwargs) 220 | if "Location" in redirect.headers: 221 | redirect_location = urlparse(redirect.headers["Location"]) 222 | 223 | if not redirect_location.netloc: 224 | redirect_url = urlunparse( 225 | ( 226 | parsed_url.scheme, 227 | domain, 228 | redirect_location.path, 229 | redirect_location.params, 230 | redirect_location.query, 231 | redirect_location.fragment, 232 | ) 233 | ) 234 | return self.request(method, redirect_url, **original_kwargs) 235 | return self.request(method, redirect.headers["Location"], **original_kwargs) 236 | elif "Set-Cookie" in redirect.headers: 237 | if 'cf_clearance' in redirect.headers['Set-Cookie']: 238 | resp = self.request(self.org_method, submit_url, cookies = redirect.cookies) 239 | return resp 240 | else: 241 | return self.request(method, submit_url, **original_kwargs) 242 | else: 243 | resp = self.request(self.org_method, submit_url, **cloudflare_kwargs) 244 | return resp 245 | 246 | 247 | def solve_challenge(self, body, domain): 248 | try: 249 | all_scripts = re.findall(r'\ 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 |
-------------------------------------------------------------------------------- /tests/fixtures/cf_recaptcha_15_04_2019.html: --------------------------------------------------------------------------------
[Cloudflare reCAPTCHA interstitial fixture; markup and scripts are not preserved in this listing. Surviving page text: "One more step", "Please complete the security check to access example-site.dev", "Why do I have to complete a CAPTCHA?", "Completing the CAPTCHA proves you are a human and gives you temporary access to the web property.", "What can I do to prevent this in the future?", "If you are on a personal connection, like at home, you can run an anti-virus scan on your device to make sure it is not infected with malware.", "If you are at an office or shared network, you can ask the network administrator to run a scan across the network looking for misconfigured or infected devices."]
-------------------------------------------------------------------------------- /tests/fixtures/js_challenge_03_12_2018.html: --------------------------------------------------------------------------------
[Cloudflare IUAM JavaScript challenge fixture; markup and the challenge script are not preserved. Surviving page text: "Just a moment...", "DDoS protection by Cloudflare", "Ray ID: 4834ce407815974a"]
-------------------------------------------------------------------------------- /tests/fixtures/js_challenge_09_06_2016.html: --------------------------------------------------------------------------------
[Cloudflare IUAM JavaScript challenge fixture; markup and the challenge script are not preserved. Surviving page text: "Just a moment...", "DDoS protection by CloudFlare", "Ray ID: 2b05d3393e872d77"]
-------------------------------------------------------------------------------- /tests/fixtures/js_challenge_10_04_2019.html: --------------------------------------------------------------------------------
[Cloudflare IUAM JavaScript challenge fixture; markup and the challenge script are not preserved. Surviving page text: "Just a moment..."]
-------------------------------------------------------------------------------- /tests/fixtures/js_challenge_13_03_2019.html: --------------------------------------------------------------------------------
[Cloudflare IUAM JavaScript challenge fixture; markup and the challenge script are not preserved. Surviving page text: "Just a moment..."]
-------------------------------------------------------------------------------- /tests/fixtures/js_challenge_21_03_2019.html: --------------------------------------------------------------------------------
[Cloudflare IUAM JavaScript challenge fixture; markup and the challenge script are not preserved. Surviving page text: "Just a moment..."]
-------------------------------------------------------------------------------- /tests/fixtures/js_challenge_30_11_2019.html: --------------------------------------------------------------------------------
[Cloudflare IUAM JavaScript challenge fixture; markup and the challenge script are not preserved. Surviving page text: "Just a moment...", "DDoS protection by Cloudflare", "Ray ID: 53d393f93ae1c82f"]
-------------------------------------------------------------------------------- /tests/fixtures/requested_page.html: --------------------------------------------------------------------------------
[Target page returned once the challenge is solved; markup is not preserved. Surviving page text: title "Success", body "The challenge was bypassed successfully."]
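The fixture pages above are served to the scraper through helpers defined in tests/__init__.py (fixtures, challenge_responses, recaptcha_responses, and friends, imported by test_cfscrape.py below). That module is not reproduced in this listing; the tests call fixtures('js_challenge_10_04_2019.html') and run bytes.replace on the result, so the loader plausibly looks like the sketch below. This is an assumption, not the module's actual code, and the FIXTURES_DIR name is illustrative:

    import os

    # Hypothetical constant; the real helper lives in tests/__init__.py,
    # which is not included in this listing.
    FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")

    def fixtures(filename):
        # Return the raw bytes of a saved fixture page by file name.
        with open(os.path.join(FIXTURES_DIR, filename), "rb") as fp:
            return fp.read()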
-------------------------------------------------------------------------------- /tests/test_adapters.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import ssl
4 | import sure  # noqa
5 | import urllib3
6 | 
7 | from cfscrape import CloudflareAdapter
8 | 
9 | 
10 | class TestCloudflareAdapter:
11 | 
12 |     def test_create_adapter(self):
13 |         adapter = CloudflareAdapter()
14 |         adapter.should.be.a("requests.adapters.HTTPAdapter")
15 |         adapter.close()
16 | 
17 |     def test_get_connection(self):
18 |         adapter = CloudflareAdapter()
19 | 
20 |         conn = adapter.get_connection("https://127.0.0.1", None)
21 | 
22 |         conn.conn_kw.should.be.a("dict")
23 |         conn.conn_kw.should.have.key("ssl_context")
24 |         ssl_context = conn.conn_kw["ssl_context"]
25 | 
26 |         # This should be ssl.SSLContext unless pyOpenSSL is installed.
27 |         # If pyOpenSSL is injected into urllib3, this should still work.
28 |         try:
29 |             assert isinstance(ssl_context, urllib3.contrib.pyopenssl.PyOpenSSLContext)
30 |         except BaseException:
31 |             assert isinstance(ssl_context, ssl.SSLContext)
32 | 
33 |         adapter.close()
34 | 
35 |     def test_set_ciphers(self):
36 |         adapter = CloudflareAdapter()
37 | 
38 |         # Reinitialize the pool manager with a different context
39 |         ctx = ssl.create_default_context()
40 |         adapter.init_poolmanager(1, 1, ssl_context=ctx)
41 |         # Check to see if the context remains the same without error
42 |         conn = adapter.get_connection("https://127.0.0.1", None)
43 |         conn.conn_kw.should.be.a("dict")
44 |         assert conn.conn_kw["ssl_context"] is ctx
45 | 
46 |         adapter.close()
47 | 
-------------------------------------------------------------------------------- /tests/test_cfscrape.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import pytest
4 | import cfscrape
5 | import requests
6 | import re
7 | import os
8 | import ssl
9 | import responses
10 | import subprocess
11 | 
12 | from sure import expect
13 | from .
import challenge_responses, recaptcha_responses, requested_page, url, \ 14 | cloudflare_cookies, DefaultResponse, ChallengeResponse, fixtures, \ 15 | cfscrape_kwargs 16 | 17 | 18 | class TestCloudflareScraper: 19 | 20 | @challenge_responses(filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031', redirect_to=url) 21 | def test_js_challenge_10_04_2019(self, **kwargs): 22 | scraper = cfscrape.CloudflareScraper(**kwargs) 23 | expect(scraper.get(url).content).to.equal(requested_page) 24 | 25 | @challenge_responses(filename='js_challenge_21_03_2019.html', jschl_answer='13.0802397598') 26 | def test_js_challenge_21_03_2019(self, **kwargs): 27 | scraper = cfscrape.CloudflareScraper(**kwargs) 28 | expect(scraper.get(url).content).to.equal(requested_page) 29 | 30 | @challenge_responses(filename='js_challenge_13_03_2019.html', jschl_answer='38.5879578333') 31 | def test_js_challenge_13_03_2019(self, **kwargs): 32 | scraper = cfscrape.CloudflareScraper(**kwargs) 33 | expect(scraper.get(url).content).to.equal(requested_page) 34 | 35 | @challenge_responses(filename='js_challenge_03_12_2018.html', jschl_answer='10.66734594') 36 | def test_js_challenge_03_12_2018(self, **kwargs): 37 | scraper = cfscrape.CloudflareScraper(**kwargs) 38 | expect(scraper.get(url).content).to.equal(requested_page) 39 | 40 | @challenge_responses(filename='js_challenge_09_06_2016.html', jschl_answer='6648') 41 | def test_js_challenge_09_06_2016(self, **kwargs): 42 | scraper = cfscrape.CloudflareScraper(**kwargs) 43 | expect(scraper.get(url).content).to.equal(requested_page) 44 | 45 | @pytest.mark.skip(reason='Unable to identify Cloudflare IUAM Javascript on website.') 46 | @challenge_responses(filename='js_challenge_21_05_2015.html', jschl_answer='649') 47 | def test_js_challenge_21_05_2015(self, **kwargs): 48 | scraper = cfscrape.CloudflareScraper(**kwargs) 49 | expect(scraper.get(url).content).to.equal(requested_page) 50 | 51 | @recaptcha_responses(filename='cf_recaptcha_15_04_2019.html') 52 | def test_cf_recaptcha_15_04_2019(self, **kwargs): 53 | scraper = cfscrape.CloudflareScraper(**kwargs) 54 | message = re.compile(r'captcha challenge presented') 55 | scraper.get.when.called_with(url) \ 56 | .should.have.raised(cfscrape.CloudflareCaptchaError, message) 57 | 58 | v = ssl.OPENSSL_VERSION_NUMBER 59 | ssl.OPENSSL_VERSION_NUMBER = 0x0090581f 60 | try: 61 | scraper = cfscrape.CloudflareScraper(**kwargs) 62 | message = re.compile(r'OpenSSL version is lower than 1.1.1') 63 | scraper.get.when.called_with(url) \ 64 | .should.have.raised(cfscrape.CloudflareCaptchaError, message) 65 | finally: 66 | ssl.OPENSSL_VERSION_NUMBER = v 67 | 68 | @responses.activate 69 | def test_js_challenge_unable_to_identify(self): 70 | body = fixtures('js_challenge_10_04_2019.html') 71 | body = body.replace(b'setTimeout', b'') 72 | 73 | responses.add(ChallengeResponse(url=url, body=body)) 74 | 75 | scraper = cfscrape.create_scraper(**cfscrape_kwargs) 76 | message = re.compile(r'Unable to identify Cloudflare IUAM Javascript') 77 | scraper.get.when.called_with(url) \ 78 | .should.have.raised(ValueError, message) 79 | 80 | @responses.activate 81 | def test_js_challenge_unexpected_answer(self): 82 | body = fixtures('js_challenge_10_04_2019.html') 83 | body = body.replace(b'\'; 121\'', b'a.value = "foobar"') 84 | 85 | responses.add(ChallengeResponse(url=url, body=body)) 86 | 87 | scraper = cfscrape.create_scraper(**cfscrape_kwargs) 88 | message = re.compile(r'Cloudflare IUAM challenge returned unexpected answer') 89 | 
scraper.get.when.called_with(url) \ 90 | .should.have.raised(ValueError, message) 91 | 92 | @responses.activate 93 | def test_js_challenge_missing_pass(self): 94 | body = fixtures('js_challenge_10_04_2019.html') 95 | body = body.replace(b'name="pass"', b'') 96 | 97 | responses.add(ChallengeResponse(url=url, body=body)) 98 | 99 | scraper = cfscrape.create_scraper(**cfscrape_kwargs) 100 | message = re.compile(r'Unable to parse .* pass is missing from challenge form') 101 | scraper.get.when.called_with(url) \ 102 | .should.have.raised(ValueError, message) 103 | 104 | def test_js_challenge_subprocess_unknown_error(self, caplog): 105 | def test(self, **kwargs): 106 | __Popen = subprocess.Popen 107 | 108 | # Temporarily disable this method to generate an exception 109 | subprocess.Popen = None 110 | 111 | try: 112 | scraper = cfscrape.CloudflareScraper(**kwargs) 113 | scraper.get.when.called_with(url) \ 114 | .should.have.raised(TypeError) 115 | caplog.text.should.match(re.compile(r'Error executing Cloudflare IUAM Javascript')) 116 | finally: 117 | subprocess.Popen = __Popen 118 | 119 | challenge_responses( 120 | filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031' 121 | )(test)(self) 122 | 123 | def test_js_challenge_subprocess_system_error(self, caplog): 124 | def test(self, **kwargs): 125 | __Popen = subprocess.Popen 126 | 127 | # Temporarily Mock subprocess method to raise an OSError 128 | def mock(*args, **kwargs): 129 | raise OSError('System Error') 130 | 131 | subprocess.Popen = mock 132 | 133 | try: 134 | scraper = cfscrape.CloudflareScraper(**kwargs) 135 | scraper.get.when.called_with(url) \ 136 | .should.have.raised(OSError, re.compile(r'System Error')) 137 | caplog.text.should.equal('') 138 | finally: 139 | subprocess.Popen = __Popen 140 | 141 | challenge_responses( 142 | filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031' 143 | )(test)(self) 144 | 145 | def test_js_challenge_subprocess_non_zero(self, caplog): 146 | def test(self, **kwargs): 147 | __Popen = subprocess.Popen 148 | 149 | # Temporarily Mock subprocess method to return non-zero exit code 150 | def mock(*args, **kwargs): 151 | def node(): pass 152 | node.communicate = lambda: ('stdout', 'stderr') 153 | node.returncode = 1 154 | return node 155 | 156 | subprocess.Popen = mock 157 | 158 | try: 159 | scraper = cfscrape.CloudflareScraper(**kwargs) 160 | message = re.compile(r'non-zero exit status') 161 | scraper.get.when.called_with(url) \ 162 | .should.have.raised(subprocess.CalledProcessError, message) 163 | caplog.text.should.match(re.compile(r'Error executing Cloudflare IUAM Javascript')) 164 | caplog.text.should_not.match(re.compile(r'Outdated Node.js detected')) 165 | finally: 166 | subprocess.Popen = __Popen 167 | 168 | challenge_responses( 169 | filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031' 170 | )(test)(self) 171 | 172 | def test_js_challenge_outdated_node(self, caplog): 173 | def test(self, **kwargs): 174 | __Popen = subprocess.Popen 175 | 176 | # Temporarily Mock subprocess method to return non-zero exit code 177 | def mock(*args, **kwargs): 178 | def node(): pass 179 | node.communicate = lambda: ('stdout', 'Outdated Node.js detected') 180 | node.returncode = 1 181 | return node 182 | 183 | subprocess.Popen = mock 184 | 185 | try: 186 | scraper = cfscrape.CloudflareScraper(**kwargs) 187 | message = re.compile(r'non-zero exit status') 188 | scraper.get.when.called_with(url) \ 189 | .should.have.raised(subprocess.CalledProcessError, message) 190 | 
caplog.text.should_not.match(re.compile(r'Error executing Cloudflare IUAM Javascript')) 191 | caplog.text.should.match(re.compile(r'Outdated Node.js detected')) 192 | finally: 193 | subprocess.Popen = __Popen 194 | 195 | challenge_responses( 196 | filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031' 197 | )(test)(self) 198 | 199 | @challenge_responses(filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031') 200 | def test_js_challenge_environment_error(self, **kwargs): 201 | __path = os.environ['PATH'] 202 | # Temporarily unset PATH to hide Node.js 203 | os.environ['PATH'] = '' 204 | try: 205 | scraper = cfscrape.CloudflareScraper(**kwargs) 206 | message = re.compile(r'Missing Node.js runtime') 207 | scraper.get.when.called_with(url) \ 208 | .should.have.raised(EnvironmentError, message) 209 | finally: 210 | os.environ['PATH'] = __path 211 | 212 | @challenge_responses(filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031') 213 | def test_get_cookie_string(self, **kwargs): 214 | # get_cookie_string doesn't accept the delay kwarg. 215 | # Set the delay in the Test class to speed up this test. 216 | delay = kwargs.pop('delay', 0.1) 217 | expected_ua = kwargs.setdefault('user_agent', 'custom-ua') 218 | 219 | cfduid, cf_clearance = cloudflare_cookies() 220 | 221 | # Use a class to workaround a `responses` bug where 222 | # cookies aren't mocked correctly. 223 | class Test(cfscrape.CloudflareScraper): 224 | def __init__(self, *args, **kwargs): 225 | kwargs.setdefault('delay', delay) 226 | super(Test, self).__init__(*args, **kwargs) 227 | 228 | self.cookies.set('__cfduid', cfduid) 229 | self.cookies.set('cf_clearance', cf_clearance) 230 | 231 | result = Test.get_cookie_string(url, **kwargs) 232 | result.should.be.a('tuple') 233 | result.should.have.length_of(2) 234 | 235 | cookie_arg, user_agent = result 236 | 237 | cookie_arg.should.be.a('str') 238 | cookie_arg.should.contain('cf_clearance=%s' % cf_clearance.value) 239 | cookie_arg.should.contain('__cfduid=%s' % cfduid.value) 240 | 241 | user_agent.should.equal(expected_ua) 242 | 243 | @challenge_responses(filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031') 244 | def test_get_tokens(self, **kwargs): 245 | # get_tokens doesn't accept the delay kwarg. 246 | # Set the delay in the Test class to speed up this test. 247 | delay = kwargs.pop('delay', 0.1) 248 | expected_ua = kwargs.setdefault('user_agent', 'custom-ua') 249 | 250 | cfduid, cf_clearance = cloudflare_cookies() 251 | 252 | # Use a class to workaround a `responses` bug where 253 | # cookies aren't mocked correctly. 254 | class Test(cfscrape.CloudflareScraper): 255 | def __init__(self, *args, **kwargs): 256 | kwargs.setdefault('delay', delay) 257 | super(Test, self).__init__(*args, **kwargs) 258 | 259 | self.cookies.set('__cfduid', cfduid) 260 | self.cookies.set('cf_clearance', cf_clearance) 261 | 262 | tokens = Test.get_tokens(url, **kwargs) 263 | tokens.should.be.a('tuple') 264 | tokens.should.have.length_of(2) 265 | 266 | cookies, user_agent = tokens 267 | 268 | cookies.should.be.a('dict') 269 | cookies.should.equal({ 270 | 'cf_clearance': cf_clearance.value, 271 | '__cfduid': cfduid.value 272 | }) 273 | 274 | user_agent.should.equal(expected_ua) 275 | 276 | @challenge_responses(filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031') 277 | def test_get_tokens_missing_cookie(self, **kwargs): 278 | # get_tokens doesn't accept the delay kwarg. 
279 | delay = kwargs.pop('delay', 0.1) 280 | 281 | # Use derived class to set delay and test without cookies 282 | class Test(cfscrape.CloudflareScraper): 283 | def __init__(self, *args, **kwargs): 284 | kwargs.setdefault('delay', delay) 285 | super(Test, self).__init__(*args, **kwargs) 286 | 287 | message = re.compile(r'Unable to find Cloudflare cookies') 288 | Test.get_tokens.when.called_with(url, **kwargs) \ 289 | .should.have.raised(ValueError, message) 290 | 291 | @responses.activate 292 | def test_get_tokens_request_error(self, caplog): 293 | # get_tokens doesn't accept the delay kwarg. 294 | kwargs = cfscrape_kwargs.copy() 295 | kwargs.pop('delay', None) 296 | 297 | responses.add(DefaultResponse(url=url, status=500)) 298 | cfscrape.get_tokens.when.called_with(url, **kwargs) \ 299 | .should.have.raised(requests.HTTPError) 300 | caplog.text.should.match(re.compile(r'Could not collect tokens')) 301 | 302 | @challenge_responses(filename='js_challenge_10_04_2019.html', jschl_answer='18.8766915031') 303 | def test_cloudflare_is_bypassed(self, **kwargs): 304 | # Use a class to workaround a `responses` bug where 305 | # cookies aren't mocked correctly. 306 | class Test(cfscrape.CloudflareScraper): 307 | def __init__(self, *args, **kwargs): 308 | super(Test, self).__init__(*args, **kwargs) 309 | 310 | cf_clearance = cloudflare_cookies()[1] 311 | self.cookies.set('cf_clearance', cf_clearance) 312 | 313 | scraper = Test(**kwargs) 314 | scraper.cloudflare_is_bypassed(url).should.be.ok 315 | 316 | def test_create_scraper_with_session(self): 317 | session = requests.session() 318 | session.headers = {'foo': 'bar'} 319 | session.data = None 320 | 321 | scraper = cfscrape.create_scraper(sess=session) 322 | scraper.headers.should.equal(session.headers) 323 | scraper.should_not.have.property('data') 324 | 325 | session.data = {'bar': 'foo'} 326 | scraper = cfscrape.create_scraper(sess=session) 327 | scraper.data.should.equal(session.data) 328 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py33, py34, py35, py36, py37, pypy 3 | skip_missing_interpreters = true 4 | 5 | [testenv] 6 | deps = pipenv 7 | commands= 8 | pipenv install --dev 9 | pipenv run py.test tests 10 | --------------------------------------------------------------------------------
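For reference, the public API exercised by test_cfscrape.py can be driven as in the sketch below, mirroring the calls the tests make (create_scraper, get_tokens, and CloudflareScraper.get_cookie_string). The URL is a stand-in for a real Cloudflare-protected page, and a Node.js executable must be on PATH, since the scraper raises EnvironmentError without one, as tested above:

    import cfscrape

    # Drop-in replacement for a requests.Session; solves IUAM challenges on the fly.
    scraper = cfscrape.create_scraper()  # optionally wraps an existing session: sess=...
    html = scraper.get("https://example-site.dev/").content

    # Or harvest the clearance cookies for use in another HTTP client:
    cookies, user_agent = cfscrape.get_tokens("https://example-site.dev/")
    cookie_string, user_agent = cfscrape.CloudflareScraper.get_cookie_string("https://example-site.dev/")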