├── .gitignore ├── .travis.yml ├── LICENCE ├── MANIFEST.in ├── Pipfile ├── Pipfile.lock ├── README.md ├── requirements ├── requirements-test.txt └── requirements.txt ├── scrapy_puppeteer ├── __init__.py ├── cli.py ├── http.py └── middlewares.py ├── setup.cfg ├── setup.py └── tests ├── __init__.py └── test_middlewares.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | reports/ 50 | .pytest_cache 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Flask instance folder 60 | instance/ 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | .tmpdocs/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # IPython Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | .venv 86 | venv/ 87 | venv-jenkins*/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | # .idea is the directory for pycharm project files 97 | .idea 98 | 99 | # MACOS stuff 100 | .DS_Store 101 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | env: 2 | global: 3 | - CODECLIMATE_REPO_TOKEN=5a52609c6ab7d746d6f6a33583d9b40a7d1b78ae16734b5fd269e3027a27bf0a 4 | 5 | language: python 6 | 7 | python: 8 | - 3.6 9 | 10 | install: 11 | - pip install -r requirements/requirements-test.txt 12 | 13 | script: 14 | - pytest --cov-config .coveragerc --cov=scrapy_puppeteer tests/ 15 | - codeclimate-test-reporter 16 | 17 | notifications: 18 | email: false 19 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Clément Denoix 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated 
documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements/requirements.txt 2 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | pyppeteer = "*" 8 | Scrapy = ">=1.0.0" 9 | 10 | [dev-packages] 11 | ipdb = "*" 12 | asynctest = "*" 13 | pytest = "*" 14 | twine = "*" 15 | 16 | [requires] 17 | python_version = "3.6" 18 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "fd6c0593728a8d5539c1a30f1e4967e13062c30cd8e7de2e870c4990a6923de8" 5 | }, 6 | 
"pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.python.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "appdirs": { 20 | "hashes": [ 21 | "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92", 22 | "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e" 23 | ], 24 | "version": "==1.4.3" 25 | }, 26 | "attrs": { 27 | "hashes": [ 28 | "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", 29 | "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" 30 | ], 31 | "version": "==19.3.0" 32 | }, 33 | "automat": { 34 | "hashes": [ 35 | "sha256:269a09dfb063a3b078983f4976d83f0a0d3e6e7aaf8e27d8df1095e09dc4a484", 36 | "sha256:81c93c55d2742c55e74e6497a48e048a859fa01d7aa0b91a032be432229837e2" 37 | ], 38 | "version": "==0.8.0" 39 | }, 40 | "cffi": { 41 | "hashes": [ 42 | "sha256:00d890313797d9fe4420506613384b43099ad7d2b905c0752dbcc3a6f14d80fa", 43 | "sha256:0cf9e550ac6c5e57b713437e2f4ac2d7fd0cd10336525a27224f5fc1ec2ee59a", 44 | "sha256:0ea23c9c0cdd6778146a50d867d6405693ac3b80a68829966c98dd5e1bbae400", 45 | "sha256:193697c2918ecdb3865acf6557cddf5076bb39f1f654975e087b67efdff83365", 46 | "sha256:1ae14b542bf3b35e5229439c35653d2ef7d8316c1fffb980f9b7647e544baa98", 47 | "sha256:1e389e069450609c6ffa37f21f40cce36f9be7643bbe5051ab1de99d5a779526", 48 | "sha256:263242b6ace7f9cd4ea401428d2d45066b49a700852334fd55311bde36dcda14", 49 | "sha256:33142ae9807665fa6511cfa9857132b2c3ee6ddffb012b3f0933fc11e1e830d5", 50 | "sha256:364f8404034ae1b232335d8c7f7b57deac566f148f7222cef78cf8ae28ef764e", 51 | "sha256:47368f69fe6529f8f49a5d146ddee713fc9057e31d61e8b6dc86a6a5e38cecc1", 52 | "sha256:4895640844f17bec32943995dc8c96989226974dfeb9dd121cc45d36e0d0c434", 53 | "sha256:558b3afef987cf4b17abd849e7bedf64ee12b28175d564d05b628a0f9355599b", 54 | 
"sha256:5ba86e1d80d458b338bda676fd9f9d68cb4e7a03819632969cf6d46b01a26730", 55 | "sha256:63424daa6955e6b4c70dc2755897f5be1d719eabe71b2625948b222775ed5c43", 56 | "sha256:6381a7d8b1ebd0bc27c3bc85bc1bfadbb6e6f756b4d4db0aa1425c3719ba26b4", 57 | "sha256:6381ab708158c4e1639da1f2a7679a9bbe3e5a776fc6d1fd808076f0e3145331", 58 | "sha256:6fd58366747debfa5e6163ada468a90788411f10c92597d3b0a912d07e580c36", 59 | "sha256:728ec653964655d65408949b07f9b2219df78badd601d6c49e28d604efe40599", 60 | "sha256:7cfcfda59ef1f95b9f729c56fe8a4041899f96b72685d36ef16a3440a0f85da8", 61 | "sha256:819f8d5197c2684524637f940445c06e003c4a541f9983fd30d6deaa2a5487d8", 62 | "sha256:825ecffd9574557590e3225560a8a9d751f6ffe4a49e3c40918c9969b93395fa", 63 | "sha256:9009e917d8f5ef780c2626e29b6bc126f4cb2a4d43ca67aa2b40f2a5d6385e78", 64 | "sha256:9c77564a51d4d914ed5af096cd9843d90c45b784b511723bd46a8a9d09cf16fc", 65 | "sha256:a19089fa74ed19c4fe96502a291cfdb89223a9705b1d73b3005df4256976142e", 66 | "sha256:a40ed527bffa2b7ebe07acc5a3f782da072e262ca994b4f2085100b5a444bbb2", 67 | "sha256:bb75ba21d5716abc41af16eac1145ab2e471deedde1f22c6f99bd9f995504df0", 68 | "sha256:e22a00c0c81ffcecaf07c2bfb3672fa372c50e2bd1024ffee0da191c1b27fc71", 69 | "sha256:e55b5a746fb77f10c83e8af081979351722f6ea48facea79d470b3731c7b2891", 70 | "sha256:ec2fa3ee81707a5232bf2dfbd6623fdb278e070d596effc7e2d788f2ada71a05", 71 | "sha256:fd82eb4694be712fcae03c717ca2e0fc720657ac226b80bbb597e971fc6928c2" 72 | ], 73 | "version": "==1.13.1" 74 | }, 75 | "constantly": { 76 | "hashes": [ 77 | "sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35", 78 | "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d" 79 | ], 80 | "version": "==15.1.0" 81 | }, 82 | "cryptography": { 83 | "hashes": [ 84 | "sha256:02079a6addc7b5140ba0825f542c0869ff4df9a69c360e339ecead5baefa843c", 85 | "sha256:1df22371fbf2004c6f64e927668734070a8953362cd8370ddd336774d6743595", 86 | "sha256:369d2346db5934345787451504853ad9d342d7f721ae82d098083e1f49a582ad", 87 
| "sha256:3cda1f0ed8747339bbdf71b9f38ca74c7b592f24f65cdb3ab3765e4b02871651", 88 | "sha256:44ff04138935882fef7c686878e1c8fd80a723161ad6a98da31e14b7553170c2", 89 | "sha256:4b1030728872c59687badcca1e225a9103440e467c17d6d1730ab3d2d64bfeff", 90 | "sha256:58363dbd966afb4f89b3b11dfb8ff200058fbc3b947507675c19ceb46104b48d", 91 | "sha256:6ec280fb24d27e3d97aa731e16207d58bd8ae94ef6eab97249a2afe4ba643d42", 92 | "sha256:7270a6c29199adc1297776937a05b59720e8a782531f1f122f2eb8467f9aab4d", 93 | "sha256:73fd30c57fa2d0a1d7a49c561c40c2f79c7d6c374cc7750e9ac7c99176f6428e", 94 | "sha256:7f09806ed4fbea8f51585231ba742b58cbcfbfe823ea197d8c89a5e433c7e912", 95 | "sha256:90df0cc93e1f8d2fba8365fb59a858f51a11a394d64dbf3ef844f783844cc793", 96 | "sha256:971221ed40f058f5662a604bd1ae6e4521d84e6cad0b7b170564cc34169c8f13", 97 | "sha256:a518c153a2b5ed6b8cc03f7ae79d5ffad7315ad4569b2d5333a13c38d64bd8d7", 98 | "sha256:b0de590a8b0979649ebeef8bb9f54394d3a41f66c5584fff4220901739b6b2f0", 99 | "sha256:b43f53f29816ba1db8525f006fa6f49292e9b029554b3eb56a189a70f2a40879", 100 | "sha256:d31402aad60ed889c7e57934a03477b572a03af7794fa8fb1780f21ea8f6551f", 101 | "sha256:de96157ec73458a7f14e3d26f17f8128c959084931e8997b9e655a39c8fde9f9", 102 | "sha256:df6b4dca2e11865e6cfbfb708e800efb18370f5a46fd601d3755bc7f85b3a8a2", 103 | "sha256:ecadccc7ba52193963c0475ac9f6fa28ac01e01349a2ca48509667ef41ffd2cf", 104 | "sha256:fb81c17e0ebe3358486cd8cc3ad78adbae58af12fc2bf2bc0bb84e8090fa5ce8" 105 | ], 106 | "version": "==2.8" 107 | }, 108 | "cssselect": { 109 | "hashes": [ 110 | "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf", 111 | "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc" 112 | ], 113 | "version": "==1.1.0" 114 | }, 115 | "hyperlink": { 116 | "hashes": [ 117 | "sha256:4288e34705da077fada1111a24a0aa08bb1e76699c9ce49876af722441845654", 118 | "sha256:ab4a308feb039b04f855a020a6eda3b18ca5a68e6d8f8c899cbe9e653721d04f" 119 | ], 120 | "version": "==19.0.0" 121 | }, 122 | "idna": { 123 | 
"hashes": [ 124 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 125 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 126 | ], 127 | "version": "==2.8" 128 | }, 129 | "incremental": { 130 | "hashes": [ 131 | "sha256:717e12246dddf231a349175f48d74d93e2897244939173b01974ab6661406b9f", 132 | "sha256:7b751696aaf36eebfab537e458929e194460051ccad279c72b755a167eebd4b3" 133 | ], 134 | "version": "==17.5.0" 135 | }, 136 | "lxml": { 137 | "hashes": [ 138 | "sha256:02ca7bf899da57084041bb0f6095333e4d239948ad3169443f454add9f4e9cb4", 139 | "sha256:096b82c5e0ea27ce9138bcbb205313343ee66a6e132f25c5ed67e2c8d960a1bc", 140 | "sha256:0a920ff98cf1aac310470c644bc23b326402d3ef667ddafecb024e1713d485f1", 141 | "sha256:17cae1730a782858a6e2758fd20dd0ef7567916c47757b694a06ffafdec20046", 142 | "sha256:17e3950add54c882e032527795c625929613adbd2ce5162b94667334458b5a36", 143 | "sha256:1f4f214337f6ee5825bf90a65d04d70aab05526c08191ab888cb5149501923c5", 144 | "sha256:2e8f77db25b0a96af679e64ff9bf9dddb27d379c9900c3272f3041c4d1327c9d", 145 | "sha256:4dffd405390a45ecb95ab5ab1c1b847553c18b0ef8ed01e10c1c8b1a76452916", 146 | "sha256:6b899931a5648862c7b88c795eddff7588fb585e81cecce20f8d9da16eff96e0", 147 | "sha256:726c17f3e0d7a7200718c9a890ccfeab391c9133e363a577a44717c85c71db27", 148 | "sha256:760c12276fee05c36f95f8040180abc7fbebb9e5011447a97cdc289b5d6ab6fc", 149 | "sha256:796685d3969815a633827c818863ee199440696b0961e200b011d79b9394bbe7", 150 | "sha256:891fe897b49abb7db470c55664b198b1095e4943b9f82b7dcab317a19116cd38", 151 | "sha256:a471628e20f03dcdfde00770eeaf9c77811f0c331c8805219ca7b87ac17576c5", 152 | "sha256:a63b4fd3e2cabdcc9d918ed280bdde3e8e9641e04f3c59a2a3109644a07b9832", 153 | "sha256:b0b84408d4eabc6de9dd1e1e0bc63e7731e890c0b378a62443e5741cfd0ae90a", 154 | "sha256:be78485e5d5f3684e875dab60f40cddace2f5b2a8f7fede412358ab3214c3a6f", 155 | "sha256:c27eaed872185f047bb7f7da2d21a7d8913457678c9a100a50db6da890bc28b9", 156 | 
"sha256:c81cb40bff373ab7a7446d6bbca0190bccc5be3448b47b51d729e37799bb5692", 157 | "sha256:d11874b3c33ee441059464711cd365b89fa1a9cf19ae75b0c189b01fbf735b84", 158 | "sha256:e9c028b5897901361d81a4718d1db217b716424a0283afe9d6735fe0caf70f79", 159 | "sha256:fe489d486cd00b739be826e8c1be188ddb74c7a1ca784d93d06fda882a6a1681" 160 | ], 161 | "markers": "python_version != '3.4'", 162 | "version": "==4.4.1" 163 | }, 164 | "parsel": { 165 | "hashes": [ 166 | "sha256:4da4262ba4605573b6b72a5f557616a2fc9dee7a47a1efad562752a28d366723", 167 | "sha256:74f8e9d3b345b14cb1416bd777a03982cde33a74d8b32e0c71e651d07d41d40a" 168 | ], 169 | "version": "==1.5.2" 170 | }, 171 | "pyasn1": { 172 | "hashes": [ 173 | "sha256:62cdade8b5530f0b185e09855dd422bc05c0bbff6b72ff61381c09dac7befd8c", 174 | "sha256:a9495356ca1d66ed197a0f72b41eb1823cf7ea8b5bd07191673e8147aecf8604" 175 | ], 176 | "version": "==0.4.7" 177 | }, 178 | "pyasn1-modules": { 179 | "hashes": [ 180 | "sha256:0c35a52e00b672f832e5846826f1fb7507907f7d52fba6faa9e3c4cbe874fe4b", 181 | "sha256:b6ada4f840fe51abf5a6bd545b45bf537bea62221fa0dde2e8a553ed9f06a4e3" 182 | ], 183 | "version": "==0.2.7" 184 | }, 185 | "pycparser": { 186 | "hashes": [ 187 | "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" 188 | ], 189 | "version": "==2.19" 190 | }, 191 | "pydispatcher": { 192 | "hashes": [ 193 | "sha256:5570069e1b1769af1fe481de6dd1d3a388492acddd2cdad7a3bde145615d5caf", 194 | "sha256:5be4a8be12805ef7d712dd9a93284fb8bc53f309867e573f653a72e5fd10e433" 195 | ], 196 | "version": "==2.0.5" 197 | }, 198 | "pyee": { 199 | "hashes": [ 200 | "sha256:a9c9b60e8693a260dd942ef5a71358cfcbba15792d5e72caf0e3c891c4e91c3b", 201 | "sha256:dbe44f61c40a995d2bdfd83d9fcb87ae025882d2c7f366513325e3daa09d7ede" 202 | ], 203 | "version": "==6.0.0" 204 | }, 205 | "pyhamcrest": { 206 | "hashes": [ 207 | "sha256:6b672c02fdf7470df9674ab82263841ce8333fb143f32f021f6cb26f0e512420", 208 | "sha256:8ffaa0a53da57e89de14ced7185ac746227a8894dbd5a3c718bf05ddbd1d56cd" 209 | 
], 210 | "version": "==1.9.0" 211 | }, 212 | "pyopenssl": { 213 | "hashes": [ 214 | "sha256:aeca66338f6de19d1aa46ed634c3b9ae519a64b458f8468aec688e7e3c20f200", 215 | "sha256:c727930ad54b10fc157015014b666f2d8b41f70c0d03e83ab67624fd3dd5d1e6" 216 | ], 217 | "version": "==19.0.0" 218 | }, 219 | "pyppeteer": { 220 | "hashes": [ 221 | "sha256:51fe769b722a1718043b74d12c20420f29e0dd9eeea2b66652b7f93a9ad465dd" 222 | ], 223 | "index": "pypi", 224 | "version": "==0.0.25" 225 | }, 226 | "queuelib": { 227 | "hashes": [ 228 | "sha256:42b413295551bdc24ed9376c1a2cd7d0b1b0fa4746b77b27ca2b797a276a1a17", 229 | "sha256:ff43b5b74b9266f8df4232a8f768dc4d67281a271905e2ed4a3689d4d304cd02" 230 | ], 231 | "version": "==1.5.0" 232 | }, 233 | "scrapy": { 234 | "hashes": [ 235 | "sha256:5a398bf6818f87dcc817c919408a195f19ba46414ae12f259119336cfa862bb6", 236 | "sha256:5b9621731e26b0d195ca3e25ab34d559f45b0b906c0a0cc359199f1b6b612184" 237 | ], 238 | "index": "pypi", 239 | "version": "==1.5.1" 240 | }, 241 | "service-identity": { 242 | "hashes": [ 243 | "sha256:001c0707759cb3de7e49c078a7c0c9cd12594161d3bf06b9c254fdcb1a60dc36", 244 | "sha256:0858a54aabc5b459d1aafa8a518ed2081a285087f349fe3e55197989232e2e2d" 245 | ], 246 | "version": "==18.1.0" 247 | }, 248 | "six": { 249 | "hashes": [ 250 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 251 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 252 | ], 253 | "version": "==1.12.0" 254 | }, 255 | "tqdm": { 256 | "hashes": [ 257 | "sha256:abc25d0ce2397d070ef07d8c7e706aede7920da163c64997585d42d3537ece3d", 258 | "sha256:dd3fcca8488bb1d416aa7469d2f277902f26260c45aa86b667b074cd44b3b115" 259 | ], 260 | "version": "==4.36.1" 261 | }, 262 | "twisted": { 263 | "hashes": [ 264 | "sha256:02214ef6f125804969aedd55daccea57060b98dae6a2aa0a4cb60c4d0acb8a2c", 265 | "sha256:15b51047ab116ee61d791cf9fe6f037f35e909a6d344ccb437d1691627c4d8a1", 266 | "sha256:17704d98d58c9c52d97e88570732e4c094a93fe5df937d01b759bab593345eec", 267 
| "sha256:222e0cfd60b0c867dd303bce6355a3ffac46574079dff11ae7a1775235ad12c8", 268 | "sha256:23090c9fcec01ce4e102912a39eb4645b2bf916abe459804f87853d977ced6e3", 269 | "sha256:5102fc2bf0d870c1e217aa09ed7a48b633cc579950a31ecae9cecc556ebffdf2", 270 | "sha256:6bc71d5a2320576a3ac7f2dac7802c290fcf9f1972c59f9ef5c5b85b8bac1e1e", 271 | "sha256:6c7703b62de08fd5873d60e6ed30478cdb39e3a37b1ead3a5d2fed10deb6e112", 272 | "sha256:6ca398abd58730070e9bc34e8a01d1198438b2ff130e95492090a2fec5fb683b", 273 | "sha256:98840f28c44894f44dc597747b4cddc740197dc6f6f18ba4dd810422094e35cb", 274 | "sha256:998e3baf509c7cf7973b8174c1050ac10f6a8bc1aaf0178ad6a7c422c75a0c68", 275 | "sha256:a5f2de00c6630c8f5ad32fca64fc4c853536c21e9ea8d0d2ae54804ef5836b9c", 276 | "sha256:aad65a24b27253eb94f2749131a872487b093c599c5873c03d90a65cc9b8a2fc", 277 | "sha256:ab788465701f553f764f4442d22b850f39a6a6abd4861e70c05b4c27119c9b50", 278 | "sha256:c7244e24fcb72f838be57d3e117ad7df135ff5af4c9d4c565417d671cd1e68c9", 279 | "sha256:d5db93026568f60cacdc0615fcd21d46f694a6bfad0ef3ff53cde2b4bb85a39d", 280 | "sha256:da92426002703b02d8fccff3acfea2d8baf76a9052e8c55ea76d0407eeaa06ce", 281 | "sha256:f4f0af14d288140ecb00861a3bd1e0b94ffdc63057cc1abe8b9dc84f6b6dcf18", 282 | "sha256:f985f31e3244d18610816b55becf8fbf445c8e30fe0731500cadaf19f296baf0" 283 | ], 284 | "index": "pypi", 285 | "version": "==19.7.0" 286 | }, 287 | "urllib3": { 288 | "hashes": [ 289 | "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398", 290 | "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86" 291 | ], 292 | "version": "==1.25.6" 293 | }, 294 | "w3lib": { 295 | "hashes": [ 296 | "sha256:847704b837b2b973cddef6938325d466628e6078266bc2e1f7ac49ba85c34823", 297 | "sha256:8b1854fef570b5a5fc84d960e025debd110485d73fd283580376104762774315" 298 | ], 299 | "version": "==1.21.0" 300 | }, 301 | "websockets": { 302 | "hashes": [ 303 | "sha256:049e694abe33f8a1d99969fee7bfc0ae6761f7fd5f297c58ea933b27dd6805f2", 304 | 
"sha256:73ce69217e4655783ec72ce11c151053fcbd5b837cc39de7999e19605182e28a", 305 | "sha256:83e63aa73331b9ca21af61df8f115fb5fbcba3f281bee650a4ad16a40cd1ef15", 306 | "sha256:882a7266fa867a2ebb2c0baaa0f9159cabf131cf18c1b4270d79ad42f9208dc5", 307 | "sha256:8c77f7d182a6ea2a9d09c2612059f3ad859a90243e899617137ee3f6b7f2b584", 308 | "sha256:8d7a20a2f97f1e98c765651d9fb9437201a9ccc2c70e94b0270f1c5ef29667a3", 309 | "sha256:a7affaeffbc5d55681934c16bb6b8fc82bb75b175e7fd4dcca798c938bde8dda", 310 | "sha256:c82e286555f839846ef4f0fdd6910769a577952e1e26aa8ee7a6f45f040e3c2b", 311 | "sha256:e906128532a14b9d264a43eb48f9b3080d53a9bda819ab45bf56b8039dc606ac", 312 | "sha256:e9102043a81cdc8b7c8032ff4bce39f6229e4ac39cb2010946c912eeb84e2cb6", 313 | "sha256:f5cb2683367e32da6a256b60929a3af9c29c212b5091cf5bace9358d03011bf5" 314 | ], 315 | "version": "==8.0.2" 316 | }, 317 | "zope.interface": { 318 | "hashes": [ 319 | "sha256:086707e0f413ff8800d9c4bc26e174f7ee4c9c8b0302fbad68d083071822316c", 320 | "sha256:1157b1ec2a1f5bf45668421e3955c60c610e31913cc695b407a574efdbae1f7b", 321 | "sha256:11ebddf765bff3bbe8dbce10c86884d87f90ed66ee410a7e6c392086e2c63d02", 322 | "sha256:14b242d53f6f35c2d07aa2c0e13ccb710392bcd203e1b82a1828d216f6f6b11f", 323 | "sha256:1b3d0dcabc7c90b470e59e38a9acaa361be43b3a6ea644c0063951964717f0e5", 324 | "sha256:20a12ab46a7e72b89ce0671e7d7a6c3c1ca2c2766ac98112f78c5bddaa6e4375", 325 | "sha256:298f82c0ab1b182bd1f34f347ea97dde0fffb9ecf850ecf7f8904b8442a07487", 326 | "sha256:2f6175722da6f23dbfc76c26c241b67b020e1e83ec7fe93c9e5d3dd18667ada2", 327 | "sha256:3b877de633a0f6d81b600624ff9137312d8b1d0f517064dfc39999352ab659f0", 328 | "sha256:4265681e77f5ac5bac0905812b828c9fe1ce80c6f3e3f8574acfb5643aeabc5b", 329 | "sha256:550695c4e7313555549aa1cdb978dc9413d61307531f123558e438871a883d63", 330 | "sha256:5f4d42baed3a14c290a078e2696c5f565501abde1b2f3f1a1c0a94fbf6fbcc39", 331 | "sha256:62dd71dbed8cc6a18379700701d959307823b3b2451bdc018594c48956ace745", 332 | 
"sha256:7040547e5b882349c0a2cc9b50674b1745db551f330746af434aad4f09fba2cc", 333 | "sha256:7e099fde2cce8b29434684f82977db4e24f0efa8b0508179fce1602d103296a2", 334 | "sha256:7e5c9a5012b2b33e87980cee7d1c82412b2ebabcb5862d53413ba1a2cfde23aa", 335 | "sha256:81295629128f929e73be4ccfdd943a0906e5fe3cdb0d43ff1e5144d16fbb52b1", 336 | "sha256:95cc574b0b83b85be9917d37cd2fad0ce5a0d21b024e1a5804d044aabea636fc", 337 | "sha256:968d5c5702da15c5bf8e4a6e4b67a4d92164e334e9c0b6acf080106678230b98", 338 | "sha256:9e998ba87df77a85c7bed53240a7257afe51a07ee6bc3445a0bf841886da0b97", 339 | "sha256:a0c39e2535a7e9c195af956610dba5a1073071d2d85e9d2e5d789463f63e52ab", 340 | "sha256:a15e75d284178afe529a536b0e8b28b7e107ef39626a7809b4ee64ff3abc9127", 341 | "sha256:a6a6ff82f5f9b9702478035d8f6fb6903885653bff7ec3a1e011edc9b1a7168d", 342 | "sha256:b639f72b95389620c1f881d94739c614d385406ab1d6926a9ffe1c8abbea23fe", 343 | "sha256:bad44274b151d46619a7567010f7cde23a908c6faa84b97598fd2f474a0c6891", 344 | "sha256:bbcef00d09a30948756c5968863316c949d9cedbc7aabac5e8f0ffbdb632e5f1", 345 | "sha256:d788a3999014ddf416f2dc454efa4a5dbeda657c6aba031cf363741273804c6b", 346 | "sha256:eed88ae03e1ef3a75a0e96a55a99d7937ed03e53d0cffc2451c208db445a2966", 347 | "sha256:f99451f3a579e73b5dd58b1b08d1179791d49084371d9a47baad3b22417f0317" 348 | ], 349 | "version": "==4.6.0" 350 | } 351 | }, 352 | "develop": { 353 | "asynctest": { 354 | "hashes": [ 355 | "sha256:56bd75b03df55956d57437db26700503d1013616314db5d1ea1a73be1186fd71", 356 | "sha256:77520850ae21620ec31738f4a7b467acaa44de6d3752d8ac7a9f4dcf55d77853" 357 | ], 358 | "index": "pypi", 359 | "version": "==0.12.2" 360 | }, 361 | "atomicwrites": { 362 | "hashes": [ 363 | "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", 364 | "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" 365 | ], 366 | "version": "==1.3.0" 367 | }, 368 | "attrs": { 369 | "hashes": [ 370 | "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", 371 | 
"sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" 372 | ], 373 | "version": "==19.3.0" 374 | }, 375 | "backcall": { 376 | "hashes": [ 377 | "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", 378 | "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" 379 | ], 380 | "version": "==0.1.0" 381 | }, 382 | "bleach": { 383 | "hashes": [ 384 | "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16", 385 | "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa" 386 | ], 387 | "version": "==3.1.0" 388 | }, 389 | "certifi": { 390 | "hashes": [ 391 | "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", 392 | "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" 393 | ], 394 | "version": "==2019.9.11" 395 | }, 396 | "chardet": { 397 | "hashes": [ 398 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 399 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 400 | ], 401 | "version": "==3.0.4" 402 | }, 403 | "decorator": { 404 | "hashes": [ 405 | "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", 406 | "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" 407 | ], 408 | "version": "==4.4.0" 409 | }, 410 | "docutils": { 411 | "hashes": [ 412 | "sha256:6c4f696463b79f1fb8ba0c594b63840ebd41f059e92b31957c46b74a4599b6d0", 413 | "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827", 414 | "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99" 415 | ], 416 | "version": "==0.15.2" 417 | }, 418 | "idna": { 419 | "hashes": [ 420 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 421 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 422 | ], 423 | "version": "==2.8" 424 | }, 425 | "importlib-metadata": { 426 | "hashes": [ 427 | 
"sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26", 428 | "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af" 429 | ], 430 | "markers": "python_version < '3.8'", 431 | "version": "==0.23" 432 | }, 433 | "ipdb": { 434 | "hashes": [ 435 | "sha256:7081c65ed7bfe7737f83fa4213ca8afd9617b42ff6b3f1daf9a3419839a2a00a" 436 | ], 437 | "index": "pypi", 438 | "version": "==0.11" 439 | }, 440 | "ipython": { 441 | "hashes": [ 442 | "sha256:c4ab005921641e40a68e405e286e7a1fcc464497e14d81b6914b4fd95e5dee9b", 443 | "sha256:dd76831f065f17bddd7eaa5c781f5ea32de5ef217592cf019e34043b56895aa1" 444 | ], 445 | "version": "==7.8.0" 446 | }, 447 | "ipython-genutils": { 448 | "hashes": [ 449 | "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", 450 | "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" 451 | ], 452 | "version": "==0.2.0" 453 | }, 454 | "jedi": { 455 | "hashes": [ 456 | "sha256:786b6c3d80e2f06fd77162a07fed81b8baa22dde5d62896a790a331d6ac21a27", 457 | "sha256:ba859c74fa3c966a22f2aeebe1b74ee27e2a462f56d3f5f7ca4a59af61bfe42e" 458 | ], 459 | "version": "==0.15.1" 460 | }, 461 | "more-itertools": { 462 | "hashes": [ 463 | "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", 464 | "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4" 465 | ], 466 | "version": "==7.2.0" 467 | }, 468 | "parso": { 469 | "hashes": [ 470 | "sha256:63854233e1fadb5da97f2744b6b24346d2750b85965e7e399bec1620232797dc", 471 | "sha256:666b0ee4a7a1220f65d367617f2cd3ffddff3e205f3f16a0284df30e774c2a9c" 472 | ], 473 | "version": "==0.5.1" 474 | }, 475 | "pexpect": { 476 | "hashes": [ 477 | "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", 478 | "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" 479 | ], 480 | "markers": "sys_platform != 'win32'", 481 | "version": "==4.7.0" 482 | }, 483 | "pickleshare": { 484 | "hashes": [ 485 | 
"sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", 486 | "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" 487 | ], 488 | "version": "==0.7.5" 489 | }, 490 | "pkginfo": { 491 | "hashes": [ 492 | "sha256:7424f2c8511c186cd5424bbf31045b77435b37a8d604990b79d4e70d741148bb", 493 | "sha256:a6d9e40ca61ad3ebd0b72fbadd4fba16e4c0e4df0428c041e01e06eb6ee71f32" 494 | ], 495 | "version": "==1.5.0.1" 496 | }, 497 | "pluggy": { 498 | "hashes": [ 499 | "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6", 500 | "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34" 501 | ], 502 | "version": "==0.13.0" 503 | }, 504 | "prompt-toolkit": { 505 | "hashes": [ 506 | "sha256:46642344ce457641f28fc9d1c9ca939b63dadf8df128b86f1b9860e59c73a5e4", 507 | "sha256:e7f8af9e3d70f514373bf41aa51bc33af12a6db3f71461ea47fea985defb2c31", 508 | "sha256:f15af68f66e664eaa559d4ac8a928111eebd5feda0c11738b5998045224829db" 509 | ], 510 | "version": "==2.0.10" 511 | }, 512 | "ptyprocess": { 513 | "hashes": [ 514 | "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", 515 | "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" 516 | ], 517 | "version": "==0.6.0" 518 | }, 519 | "py": { 520 | "hashes": [ 521 | "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", 522 | "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" 523 | ], 524 | "version": "==1.8.0" 525 | }, 526 | "pygments": { 527 | "hashes": [ 528 | "sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127", 529 | "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297" 530 | ], 531 | "version": "==2.4.2" 532 | }, 533 | "pytest": { 534 | "hashes": [ 535 | "sha256:1d131cc532be0023ef8ae265e2a779938d0619bb6c2510f52987ffcba7fa1ee4", 536 | "sha256:ca4761407f1acc85ffd1609f464ca20bb71a767803505bd4127d0e45c5a50e23" 537 | ], 538 | "index": "pypi", 539 | "version": 
"==4.0.1" 540 | }, 541 | "readme-renderer": { 542 | "hashes": [ 543 | "sha256:bb16f55b259f27f75f640acf5e00cf897845a8b3e4731b5c1a436e4b8529202f", 544 | "sha256:c8532b79afc0375a85f10433eca157d6b50f7d6990f337fa498c96cd4bfc203d" 545 | ], 546 | "version": "==24.0" 547 | }, 548 | "requests": { 549 | "hashes": [ 550 | "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", 551 | "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" 552 | ], 553 | "version": "==2.22.0" 554 | }, 555 | "requests-toolbelt": { 556 | "hashes": [ 557 | "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f", 558 | "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0" 559 | ], 560 | "version": "==0.9.1" 561 | }, 562 | "six": { 563 | "hashes": [ 564 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 565 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 566 | ], 567 | "version": "==1.12.0" 568 | }, 569 | "tqdm": { 570 | "hashes": [ 571 | "sha256:abc25d0ce2397d070ef07d8c7e706aede7920da163c64997585d42d3537ece3d", 572 | "sha256:dd3fcca8488bb1d416aa7469d2f277902f26260c45aa86b667b074cd44b3b115" 573 | ], 574 | "version": "==4.36.1" 575 | }, 576 | "traitlets": { 577 | "hashes": [ 578 | "sha256:70b4c6a1d9019d7b4f6846832288f86998aa3b9207c6821f3578a6a6a467fe44", 579 | "sha256:d023ee369ddd2763310e4c3eae1ff649689440d4ae59d7485eb4cfbbe3e359f7" 580 | ], 581 | "version": "==4.3.3" 582 | }, 583 | "twine": { 584 | "hashes": [ 585 | "sha256:5319dd3e02ac73fcddcd94f035b9631589ab5d23e1f4699d57365199d85261e1", 586 | "sha256:9fe7091715c7576df166df8ef6654e61bada39571783f2fd415bdcba867c6993" 587 | ], 588 | "index": "pypi", 589 | "version": "==2.0.0" 590 | }, 591 | "urllib3": { 592 | "hashes": [ 593 | "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398", 594 | "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86" 595 | ], 596 | "version": "==1.25.6" 597 | 
}, 598 | "wcwidth": { 599 | "hashes": [ 600 | "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", 601 | "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" 602 | ], 603 | "version": "==0.1.7" 604 | }, 605 | "webencodings": { 606 | "hashes": [ 607 | "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", 608 | "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" 609 | ], 610 | "version": "==0.5.1" 611 | }, 612 | "zipp": { 613 | "hashes": [ 614 | "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", 615 | "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" 616 | ], 617 | "version": "==0.6.0" 618 | } 619 | } 620 | } 621 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapy with Puppeteer 2 | [![PyPI](https://img.shields.io/pypi/v/scrapy-puppeteer.svg)](https://pypi.python.org/pypi/scrapy-puppeteer) [![Build Status](https://travis-ci.org/clemfromspace/scrapy-puppeteer.svg?branch=master)](https://travis-ci.org/clemfromspace/scrapy-puppeteer) [![Test Coverage](https://api.codeclimate.com/v1/badges/86603b736e684dd4f8c9/test_coverage)](https://codeclimate.com/github/clemfromspace/scrapy-puppeteer/test_coverage) [![Maintainability](https://api.codeclimate.com/v1/badges/86603b736e684dd4f8c9/maintainability)](https://codeclimate.com/github/clemfromspace/scrapy-puppeteer/maintainability) 3 | 4 | Scrapy middleware to handle javascript pages using [puppeteer](https://github.com/GoogleChrome/puppeteer). 5 | 6 | ## ⚠ IN ACTIVE DEVELOPMENT - READ BEFORE USING ⚠ 7 | 8 | This is an attempt to make Scrapy and Puppeteer work together to handle Javascript-rendered pages. 9 | The design is strongly inspired of the Scrapy [Splash plugin](https://github.com/scrapy-plugins/scrapy-splash). 
10 | 11 | **Scrapy and Puppeteer** 12 | 13 | The main issue when running Scrapy and Puppeteer together is that Scrapy is using [Twisted](https://twistedmatrix.com/trac/) and that [Pyppeteer](https://miyakogi.github.io/pyppeteer/) (the python port of puppeteer we are using) is using [asyncio](https://docs.python.org/3/library/asyncio.html) for async stuff. 14 | 15 | Luckily, we can use Twisted's [asyncio reactor](https://twistedmatrix.com/documents/18.4.0/api/twisted.internet.asyncioreactor.html) to make the two talk to each other. 16 | 17 | That's why you **cannot** use the built-in `scrapy` command line (which installs the default reactor); you will have to use the `scrapyp` one, provided by this module. 18 | 19 | If you are running your spiders from a script, you will have to make sure you install the asyncio reactor before importing scrapy or doing anything else: 20 | 21 | ```python 22 | import asyncio 23 | from twisted.internet import asyncioreactor 24 | 25 | asyncioreactor.install(asyncio.get_event_loop()) 26 | ``` 27 | 28 | 29 | ## Installation 30 | ``` 31 | $ pip install scrapy-puppeteer 32 | ``` 33 | 34 | ## Configuration 35 | Add the `PuppeteerMiddleware` to the downloader middlewares: 36 | ```python 37 | DOWNLOADER_MIDDLEWARES = { 38 | 'scrapy_puppeteer.PuppeteerMiddleware': 800 39 | } 40 | ``` 41 | 42 | 43 | ## Usage 44 | Use the `scrapy_puppeteer.PuppeteerRequest` instead of the Scrapy built-in `Request` like below: 45 | ```python 46 | from scrapy_puppeteer import PuppeteerRequest 47 | 48 | def your_parse_method(self, response): 49 | # Your code... 50 | yield PuppeteerRequest('http://httpbin.org', self.parse_result) 51 | ``` 52 | The request will then be handled by puppeteer. 53 | 54 | The `selector` response attribute works as usual (but contains the html processed by puppeteer).
55 | 56 | ```python 57 | def parse_result(self, response): 58 | print(response.selector.xpath('//title/text()')) 59 | ``` 60 | 61 | ### Additional arguments 62 | The `scrapy_puppeteer.PuppeteerRequest` accepts 3 additional arguments: 63 | 64 | #### `wait_until` 65 | 66 | Will be passed to the [`waitUntil`](https://miyakogi.github.io/pyppeteer/_modules/pyppeteer/page.html#Page.goto) parameter of puppeteer. 67 | Defaults to `domcontentloaded`. 68 | 69 | #### `wait_for` 70 | Will be passed to the [`waitFor`](https://miyakogi.github.io/pyppeteer/reference.html?highlight=image#pyppeteer.page.Page.waitFor) method of puppeteer. 71 | 72 | #### `screenshot` 73 | When used, puppeteer will take a [screenshot](https://miyakogi.github.io/pyppeteer/reference.html?highlight=headers#pyppeteer.page.Page.screenshot) of the page and the binary data of the .png captured will be added to the response `meta`: 74 | ```python 75 | yield PuppeteerRequest( 76 | url, 77 | self.parse_result, 78 | screenshot=True 79 | ) 80 | 81 | def parse_result(self, response): 82 | with open('image.png', 'wb') as image_file: 83 | image_file.write(response.meta['screenshot']) 84 | ``` 85 | 86 | -------------------------------------------------------------------------------- /requirements/requirements-test.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | pytest==3.4.0 4 | coverage<4.4 5 | pytest-cov==2.4.0 6 | codeclimate-test-reporter==0.2.3 7 | attrs>=17.4.0 8 | asynctest 9 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy>=1.0.0 2 | pyppeteer 3 | -------------------------------------------------------------------------------- /scrapy_puppeteer/__init__.py: -------------------------------------------------------------------------------- 1 | from .http import PuppeteerRequest 2 | from
def __main__():
    """Console entry point of the ``scrapyp`` command

    Forwards the arguments of the current process to the Scrapy command line.
    """

    cli_arguments = sys.argv
    execute(argv=cli_arguments)
def as_deferred(f):
    """Schedule an asyncio awaitable and expose it as a Twisted ``Deferred``

    Parameters
    ----------
    f: coroutine or asyncio.Future
        The awaitable to schedule with ``asyncio.ensure_future``.

    Returns
    -------
    twisted.internet.defer.Deferred
        The deferred built from the scheduled future with ``Deferred.fromFuture``.

    """

    return Deferred.fromFuture(asyncio.ensure_future(f))


class PuppeteerMiddleware:
    """Downloader middleware handling the requests with Puppeteer"""

    @classmethod
    async def _from_crawler(cls, crawler):
        """Start the browser

        The browser is launched with the crawler ``LOG_LEVEL`` setting as its log level,
        and the middleware is registered on the ``spider_closed`` signal so the browser
        gets shut down with the spider.
        """

        middleware = cls()
        middleware.browser = await launch({'logLevel': crawler.settings.get('LOG_LEVEL')})
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)

        return middleware

    @classmethod
    def from_crawler(cls, crawler):
        """Initialize the middleware

        Runs ``_from_crawler`` to completion on the current asyncio event loop, so this
        call only returns once the browser has been launched.
        """

        loop = asyncio.get_event_loop()
        middleware = loop.run_until_complete(
            asyncio.ensure_future(cls._from_crawler(crawler))
        )

        return middleware

    @staticmethod
    def _cookie_params(cookies, url):
        """Convert the cookies of a request to the list of dicts given to ``page.setCookie``

        Parameters
        ----------
        cookies: dict or list of dict or None
            Either a ``{name: value}`` mapping or a list of cookie dicts.
        url: str
            The url of the request, attached to every cookie defining neither a ``url``
            nor a ``domain``.

        Returns
        -------
        list of dict
            One new dict per cookie (the given cookies are not modified), empty when
            there is no cookie to set.

        """

        if not cookies:
            return []

        if isinstance(cookies, dict):
            cookies = [{'name': k, 'value': v} for k, v in cookies.items()]

        params = []
        for cookie in cookies:
            param = dict(cookie)
            # The cookies are set before any navigation, while the page is still blank:
            # without a url or a domain, the browser has nothing to attach them to.
            if 'url' not in param and 'domain' not in param:
                param['url'] = url
            params.append(param)

        return params

    @staticmethod
    def _header_overrides(headers):
        """Flatten the request headers to a ``{str: str}`` dict

        Parameters
        ----------
        headers: mapping of bytes to list of bytes
            The headers of the request, each name being mapped to a list of values.

        Returns
        -------
        dict
            The decoded names mapped to their decoded values, joined with a comma.

        """

        return {
            name.decode(): ','.join(value.decode() for value in values)
            for name, values in headers.items()
        }

    async def _process_request(self, request, spider):
        """Handle the request using Puppeteer

        Parameters
        ----------
        request: PuppeteerRequest
            The request to download; its ``wait_until``, ``wait_for`` and ``screenshot``
            attributes drive the navigation.
        spider: scrapy.Spider
            The spider the request belongs to (not used).

        Returns
        -------
        scrapy.http.HtmlResponse
            A response holding the html of the page once processed by the browser.

        """

        page = await self.browser.newPage()

        # Whatever happens during the navigation, the page must be closed:
        # otherwise every failing request would leave a tab open in the browser.
        try:
            # Cookies (both the dict and the list forms are passed unpacked)
            cookies = self._cookie_params(request.cookies, request.url)
            if cookies:
                await page.setCookie(*cookies)

            # The headers must be set using request interception
            await page.setRequestInterception(True)
            overrides = {'headers': self._header_overrides(request.headers)}

            @page.on('request')
            async def _handle_headers(pu_request):
                await pu_request.continue_(overrides=overrides)

            response = await page.goto(
                request.url,
                {
                    'waitUntil': request.wait_until
                },
            )

            if request.wait_for:
                await page.waitFor(request.wait_for)

            if request.screenshot:
                request.meta['screenshot'] = await page.screenshot()

            content = await page.content()
            # Read the final url while the page is still open
            url = page.url
        finally:
            await page.close()

        # NOTE(review): ``page.goto`` is documented as possibly returning no response
        # (e.g. same url with a different hash); a 200 without headers is assumed then.
        if response is None:
            status, headers = 200, {}
        else:
            # Work on a copy so the headers of the puppeteer response are left untouched
            status, headers = response.status, dict(response.headers)

        # Necessary to bypass the compression middleware (?)
        headers.pop('content-encoding', None)
        headers.pop('Content-Encoding', None)

        return HtmlResponse(
            url,
            status=status,
            headers=headers,
            body=content.encode('utf-8'),
            encoding='utf-8',
            request=request
        )

    def process_request(self, request, spider):
        """Check if the Request should be handled by Puppeteer

        Returns ``None`` for any request which is not a ``PuppeteerRequest``, and a
        ``Deferred`` wrapping the Puppeteer download otherwise.
        """

        if not isinstance(request, PuppeteerRequest):
            return None

        return as_deferred(self._process_request(request, spider))

    async def _spider_closed(self):
        """Close the browser"""

        await self.browser.close()

    def spider_closed(self):
        """Shutdown the browser when spider is closed"""

        return as_deferred(self._spider_closed())
def get_requirements(source):
    """Get the requirements from the given ``source``

    The file is parsed by hand rather than through pip: the ``parse_requirements``
    function of pip is a private API which keeps moving between pip releases.

    Parameters
    ----------
    source: str
        The filename containing the requirements

    Returns
    -------
    list of str
        The requirement specifiers, in the order of the file. Blank lines and comments
        are skipped, ``-r``/``--requirement`` includes are followed (relative to the
        including file) and the other option lines are ignored.

    """

    import os

    requirements = []
    base_dir = os.path.dirname(source)

    with open(source, encoding='utf-8') as requirements_file:
        for raw_line in requirements_file:
            # Drop the inline comments (" # ...") before looking at the line
            line = raw_line.split(' #', 1)[0].strip()

            if not line or line.startswith('#'):
                continue

            if line.startswith('-'):
                # Handle both the "-r file" and the "--requirement=file" forms
                option, _, value = line.replace('=', ' ', 1).partition(' ')
                if option in ('-r', '--requirement'):
                    requirements.extend(
                        get_requirements(os.path.join(base_dir, value.strip()))
                    )
                continue

            requirements.append(line)

    return requirements
class ScrapyPuppeteerTestCase(TestCase):
    """Test case for the ``scrapy-puppeteer`` package"""

    class PuppeteerSpider(scrapy.Spider):
        """Spider collecting the titles of a search page through Puppeteer"""

        name = 'puppeteer_crawl_spider'
        allowed_domains = ['codesandbox.io']
        custom_settings = {
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy_puppeteer.PuppeteerMiddleware': 800
            }
        }

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

            # One list per spider instance: a class-level list would be shared by
            # every instance and keep growing from one crawl to the next.
            self.items = []

        def start_requests(self):
            """Request the search page, waiting for the network to be idle"""

            yield scrapy_puppeteer.PuppeteerRequest(
                'https://codesandbox.io/search?page=1',
                wait_until='networkidle2'
            )

        def parse(self, response):
            """Store the ``h2`` of each search hit found in the response"""

            for selector_item in response.selector.xpath('//li[@class="ais-Hits-item"]'):
                self.items.append(selector_item.xpath('.//h2').extract_first())

    def setUp(self):
        """Store the Scrapy runner to use in the tests"""

        self.runner = CrawlerRunner()

    @defer.inlineCallbacks
    def test_items_number(self):
        """Crawl the search page and check the number of hits collected

        NOTE(review): this test requests the live ``codesandbox.io`` website, the
        expected count of 12 presumably being its page size - to be confirmed.
        """

        crawler = self.runner.create_crawler(self.PuppeteerSpider)
        yield crawler.crawl()
        self.assertEqual(len(crawler.spider.items), 12)