├── .gitignore ├── MANIFEST.in ├── Pipfile ├── Pipfile.lock ├── README.md ├── examples ├── Pipfile ├── README.md └── example │ ├── example │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── ccidcom.py │ └── scrapy.cfg ├── setup.py ├── src └── scrapy_rabbitmq_scheduler │ ├── __init__.py │ ├── connection.py │ ├── middleware.py │ ├── picklecompat.py │ ├── pipelines.py │ ├── queue.py │ ├── scheduler.py │ └── spiders.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "http://mirrors.aliyun.com/pypi/simple/" 4 | verify_ssl = false 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | scrapy = "*" 10 | pika = "*" 11 | 12 | [requires] 13 | python_version = "3.6" 14 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "939d7ff7700ba26b0ac79e50d1e904a9b39f0f42216cb7b0b88b1ded2a92a422" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "http://mirrors.aliyun.com/pypi/simple/", 14 | "verify_ssl": false 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "attrs": { 20 | "hashes": [ 21 | 
"sha256:26b54ddbbb9ee1d34d5d3668dd37d6cf74990ab23c828c2888dccdceee395594", 22 | "sha256:fce7fc47dfc976152e82d53ff92fa0407700c21acd20886a13777a0d20e655dc" 23 | ], 24 | "version": "==20.2.0" 25 | }, 26 | "automat": { 27 | "hashes": [ 28 | "sha256:7979803c74610e11ef0c0d68a2942b152df52da55336e0c9d58daf1831cbdf33", 29 | "sha256:b6feb6455337df834f6c9962d6ccf771515b7d939bca142b29c20c2376bc6111" 30 | ], 31 | "version": "==20.2.0" 32 | }, 33 | "cffi": { 34 | "hashes": [ 35 | "sha256:005f2bfe11b6745d726dbb07ace4d53f057de66e336ff92d61b8c7e9c8f4777d", 36 | "sha256:09e96138280241bd355cd585148dec04dbbedb4f46128f340d696eaafc82dd7b", 37 | "sha256:0b1ad452cc824665ddc682400b62c9e4f5b64736a2ba99110712fdee5f2505c4", 38 | "sha256:0ef488305fdce2580c8b2708f22d7785ae222d9825d3094ab073e22e93dfe51f", 39 | "sha256:15f351bed09897fbda218e4db5a3d5c06328862f6198d4fb385f3e14e19decb3", 40 | "sha256:22399ff4870fb4c7ef19fff6eeb20a8bbf15571913c181c78cb361024d574579", 41 | "sha256:23e5d2040367322824605bc29ae8ee9175200b92cb5483ac7d466927a9b3d537", 42 | "sha256:2791f68edc5749024b4722500e86303a10d342527e1e3bcac47f35fbd25b764e", 43 | "sha256:2f9674623ca39c9ebe38afa3da402e9326c245f0f5ceff0623dccdac15023e05", 44 | "sha256:3363e77a6176afb8823b6e06db78c46dbc4c7813b00a41300a4873b6ba63b171", 45 | "sha256:33c6cdc071ba5cd6d96769c8969a0531be2d08c2628a0143a10a7dcffa9719ca", 46 | "sha256:3b8eaf915ddc0709779889c472e553f0d3e8b7bdf62dab764c8921b09bf94522", 47 | "sha256:3cb3e1b9ec43256c4e0f8d2837267a70b0e1ca8c4f456685508ae6106b1f504c", 48 | "sha256:3eeeb0405fd145e714f7633a5173318bd88d8bbfc3dd0a5751f8c4f70ae629bc", 49 | "sha256:44f60519595eaca110f248e5017363d751b12782a6f2bd6a7041cba275215f5d", 50 | "sha256:4d7c26bfc1ea9f92084a1d75e11999e97b62d63128bcc90c3624d07813c52808", 51 | "sha256:529c4ed2e10437c205f38f3691a68be66c39197d01062618c55f74294a4a4828", 52 | "sha256:6642f15ad963b5092d65aed022d033c77763515fdc07095208f15d3563003869", 53 | "sha256:85ba797e1de5b48aa5a8427b6ba62cf69607c18c5d4eb747604b7302f1ec382d", 54 | "sha256:8f0f1e499e4000c4c347a124fa6a27d37608ced4fe9f7d45070563b7c4c370c9", 55 | "sha256:a624fae282e81ad2e4871bdb767e2c914d0539708c0f078b5b355258293c98b0", 56 | "sha256:b0358e6fefc74a16f745afa366acc89f979040e0cbc4eec55ab26ad1f6a9bfbc", 57 | "sha256:bbd2f4dfee1079f76943767fce837ade3087b578aeb9f69aec7857d5bf25db15", 58 | "sha256:bf39a9e19ce7298f1bd6a9758fa99707e9e5b1ebe5e90f2c3913a47bc548747c", 59 | "sha256:c11579638288e53fc94ad60022ff1b67865363e730ee41ad5e6f0a17188b327a", 60 | "sha256:c150eaa3dadbb2b5339675b88d4573c1be3cb6f2c33a6c83387e10cc0bf05bd3", 61 | "sha256:c53af463f4a40de78c58b8b2710ade243c81cbca641e34debf3396a9640d6ec1", 62 | "sha256:cb763ceceae04803adcc4e2d80d611ef201c73da32d8f2722e9d0ab0c7f10768", 63 | "sha256:cc75f58cdaf043fe6a7a6c04b3b5a0e694c6a9e24050967747251fb80d7bce0d", 64 | "sha256:d80998ed59176e8cba74028762fbd9b9153b9afc71ea118e63bbf5d4d0f9552b", 65 | "sha256:de31b5164d44ef4943db155b3e8e17929707cac1e5bd2f363e67a56e3af4af6e", 66 | "sha256:e66399cf0fc07de4dce4f588fc25bfe84a6d1285cc544e67987d22663393926d", 67 | "sha256:f0620511387790860b249b9241c2f13c3a80e21a73e0b861a2df24e9d6f56730", 68 | "sha256:f4eae045e6ab2bb54ca279733fe4eb85f1effda392666308250714e01907f394", 69 | "sha256:f92cdecb618e5fa4658aeb97d5eb3d2f47aa94ac6477c6daf0f306c5a3b9e6b1", 70 | "sha256:f92f789e4f9241cd262ad7a555ca2c648a98178a953af117ef7fad46aa1d5591" 71 | ], 72 | "version": "==1.14.3" 73 | }, 74 | "constantly": { 75 | "hashes": [ 76 | "sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35", 77 | 
"sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d" 78 | ], 79 | "version": "==15.1.0" 80 | }, 81 | "cryptography": { 82 | "hashes": [ 83 | "sha256:22f8251f68953553af4f9c11ec5f191198bc96cff9f0ac5dd5ff94daede0ee6d", 84 | "sha256:284e275e3c099a80831f9898fb5c9559120d27675c3521278faba54e584a7832", 85 | "sha256:3e17d02941c0f169c5b877597ca8be895fca0e5e3eb882526a74aa4804380a98", 86 | "sha256:52a47e60953679eea0b4d490ca3c241fb1b166a7b161847ef4667dfd49e7699d", 87 | "sha256:57b8c1ed13b8aa386cabbfde3be175d7b155682470b0e259fecfe53850967f8a", 88 | "sha256:6a8f64ed096d13f92d1f601a92d9fd1f1025dc73a2ca1ced46dcf5e0d4930943", 89 | "sha256:6e8a3c7c45101a7eeee93102500e1b08f2307c717ff553fcb3c1127efc9b6917", 90 | "sha256:7ef41304bf978f33cfb6f43ca13bb0faac0c99cda33693aa20ad4f5e34e8cb8f", 91 | "sha256:87c2fffd61e934bc0e2c927c3764c20b22d7f5f7f812ee1a477de4c89b044ca6", 92 | "sha256:88069392cd9a1e68d2cfd5c3a2b0d72a44ef3b24b8977a4f7956e9e3c4c9477a", 93 | "sha256:8a0866891326d3badb17c5fd3e02c926b635e8923fa271b4813cd4d972a57ff3", 94 | "sha256:8f0fd8b0751d75c4483c534b209e39e918f0d14232c0d8a2a76e687f64ced831", 95 | "sha256:9a07e6d255053674506091d63ab4270a119e9fc83462c7ab1dbcb495b76307af", 96 | "sha256:9a8580c9afcdcddabbd064c0a74f337af74ff4529cdf3a12fa2e9782d677a2e5", 97 | "sha256:bd80bc156d3729b38cb227a5a76532aef693b7ac9e395eea8063ee50ceed46a5", 98 | "sha256:d1cbc3426e6150583b22b517ef3720036d7e3152d428c864ff0f3fcad2b97591", 99 | "sha256:e15ac84dcdb89f92424cbaca4b0b34e211e7ce3ee7b0ec0e4f3c55cee65fae5a", 100 | "sha256:e4789b84f8dedf190148441f7c5bfe7244782d9cbb194a36e17b91e7d3e1cca9", 101 | "sha256:f01c9116bfb3ad2831e125a73dcd957d173d6ddca7701528eff1e7d97972872c", 102 | "sha256:f0e3986f6cce007216b23c490f093f35ce2068f3c244051e559f647f6731b7ae", 103 | "sha256:f2aa3f8ba9e2e3fd49bd3de743b976ab192fbf0eb0348cebde5d2a9de0090a9f", 104 | "sha256:fb70a4cedd69dc52396ee114416a3656e011fb0311fca55eb55c7be6ed9c8aef" 105 | ], 106 | "index": "pypi", 107 | "version": "==3.2" 108 | }, 109 | "cssselect": { 110 | "hashes": [ 111 | "sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf", 112 | "sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc" 113 | ], 114 | "version": "==1.1.0" 115 | }, 116 | "hyperlink": { 117 | "hashes": [ 118 | "sha256:47fcc7cd339c6cb2444463ec3277bdcfe142c8b1daf2160bdd52248deec815af", 119 | "sha256:c528d405766f15a2c536230de7e160b65a08e20264d8891b3eb03307b0df3c63" 120 | ], 121 | "version": "==20.0.1" 122 | }, 123 | "idna": { 124 | "hashes": [ 125 | "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", 126 | "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" 127 | ], 128 | "version": "==2.10" 129 | }, 130 | "incremental": { 131 | "hashes": [ 132 | "sha256:717e12246dddf231a349175f48d74d93e2897244939173b01974ab6661406b9f", 133 | "sha256:7b751696aaf36eebfab537e458929e194460051ccad279c72b755a167eebd4b3" 134 | ], 135 | "version": "==17.5.0" 136 | }, 137 | "lxml": { 138 | "hashes": [ 139 | "sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b", 140 | "sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5", 141 | "sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301", 142 | "sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b", 143 | "sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d", 144 | "sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b", 145 | 
"sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9", 146 | "sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b", 147 | "sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311", 148 | "sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891", 149 | "sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a", 150 | "sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1", 151 | "sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856", 152 | "sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810", 153 | "sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51", 154 | "sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360", 155 | "sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4", 156 | "sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f", 157 | "sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230", 158 | "sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a", 159 | "sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f", 160 | "sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174", 161 | "sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf", 162 | "sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd", 163 | "sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3", 164 | "sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a", 165 | "sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5", 166 | "sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367", 167 | "sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c", 168 | "sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1", 169 | "sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8", 170 | "sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f", 171 | "sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc", 172 | "sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d", 173 | "sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9", 174 | "sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f" 175 | ], 176 | "version": "==4.6.1" 177 | }, 178 | "parsel": { 179 | "hashes": [ 180 | "sha256:70efef0b651a996cceebc69e55a85eb2233be0890959203ba7c3a03c72725c79", 181 | "sha256:9e1fa8db1c0b4a878bf34b35c043d89c9d1cbebc23b4d34dbc3c0ec33f2e087d" 182 | ], 183 | "version": "==1.6.0" 184 | }, 185 | "pika": { 186 | "hashes": [ 187 | "sha256:4e1a1a6585a41b2341992ec32aadb7a919d649eb82904fd8e4a4e0871c8cf3af", 188 | "sha256:9fa76ba4b65034b878b2b8de90ff8660a59d925b087c5bb88f8fdbb4b64a1dbf" 189 | ], 190 | "index": "pypi", 191 | "version": "==1.1.0" 192 | }, 193 | "protego": { 194 | "hashes": [ 195 | "sha256:a682771bc7b51b2ff41466460896c1a5a653f9a1e71639ef365a72e66d8734b4" 196 | ], 197 | "version": "==0.1.16" 198 | }, 199 | "pyasn1": { 200 | "hashes": [ 201 | "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d", 202 | "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba" 203 | ], 204 | "version": "==0.4.8" 205 | }, 206 | "pyasn1-modules": { 207 | "hashes": [ 208 | "sha256:905f84c712230b2c592c19470d3ca8d552de726050d1d1716282a1f6146be65e", 209 | 
"sha256:a50b808ffeb97cb3601dd25981f6b016cbb3d31fbf57a8b8a87428e6158d0c74" 210 | ], 211 | "version": "==0.2.8" 212 | }, 213 | "pycparser": { 214 | "hashes": [ 215 | "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", 216 | "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" 217 | ], 218 | "version": "==2.20" 219 | }, 220 | "pydispatcher": { 221 | "hashes": [ 222 | "sha256:5570069e1b1769af1fe481de6dd1d3a388492acddd2cdad7a3bde145615d5caf", 223 | "sha256:5be4a8be12805ef7d712dd9a93284fb8bc53f309867e573f653a72e5fd10e433" 224 | ], 225 | "version": "==2.0.5" 226 | }, 227 | "pyhamcrest": { 228 | "hashes": [ 229 | "sha256:412e00137858f04bde0729913874a48485665f2d36fe9ee449f26be864af9316", 230 | "sha256:7ead136e03655af85069b6f47b23eb7c3e5c221aa9f022a4fbb499f5b7308f29" 231 | ], 232 | "version": "==2.0.2" 233 | }, 234 | "pyopenssl": { 235 | "hashes": [ 236 | "sha256:621880965a720b8ece2f1b2f54ea2071966ab00e2970ad2ce11d596102063504", 237 | "sha256:9a24494b2602aaf402be5c9e30a0b82d4a5c67528fe8fb475e3f3bc00dd69507" 238 | ], 239 | "version": "==19.1.0" 240 | }, 241 | "queuelib": { 242 | "hashes": [ 243 | "sha256:42b413295551bdc24ed9376c1a2cd7d0b1b0fa4746b77b27ca2b797a276a1a17", 244 | "sha256:ff43b5b74b9266f8df4232a8f768dc4d67281a271905e2ed4a3689d4d304cd02" 245 | ], 246 | "version": "==1.5.0" 247 | }, 248 | "scrapy": { 249 | "hashes": [ 250 | "sha256:4352c64c7ffc70148a7988db837bb25bccafb3350ab9c978c1f9a8930521959b", 251 | "sha256:fe06576f9a4971de9dc0175c60fd92561e8275f2bad585c1cb5d65c5181b2db0" 252 | ], 253 | "index": "pypi", 254 | "version": "==1.8.0" 255 | }, 256 | "service-identity": { 257 | "hashes": [ 258 | "sha256:001c0707759cb3de7e49c078a7c0c9cd12594161d3bf06b9c254fdcb1a60dc36", 259 | "sha256:0858a54aabc5b459d1aafa8a518ed2081a285087f349fe3e55197989232e2e2d" 260 | ], 261 | "version": "==18.1.0" 262 | }, 263 | "six": { 264 | "hashes": [ 265 | "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", 266 | "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" 267 | ], 268 | "version": "==1.15.0" 269 | }, 270 | "twisted": { 271 | "hashes": [ 272 | "sha256:040eb6641125d2a9a09cf198ec7b83dd8858c6f51f6770325ed9959c00f5098f", 273 | "sha256:147780b8caf21ba2aef3688628eaf13d7e7fe02a86747cd54bfaf2140538f042", 274 | "sha256:158ddb80719a4813d292293ac44ba41d8b56555ed009d90994a278237ee63d2c", 275 | "sha256:2182000d6ffc05d269e6c03bfcec8b57e20259ca1086180edaedec3f1e689292", 276 | "sha256:25ffcf37944bdad4a99981bc74006d735a678d2b5c193781254fbbb6d69e3b22", 277 | "sha256:3281d9ce889f7b21bdb73658e887141aa45a102baf3b2320eafcfba954fcefec", 278 | "sha256:356e8d8dd3590e790e3dba4db139eb8a17aca64b46629c622e1b1597a4a92478", 279 | "sha256:70952c56e4965b9f53b180daecf20a9595cf22b8d0935cd3bd664c90273c3ab2", 280 | "sha256:7408c6635ee1b96587289283ebe90ee15dbf9614b05857b446055116bc822d29", 281 | "sha256:7c547fd0215db9da8a1bc23182b309e84a232364cc26d829e9ee196ce840b114", 282 | "sha256:894f6f3cfa57a15ea0d0714e4283913a5f2511dbd18653dd148eba53b3919797", 283 | "sha256:94ac3d55a58c90e2075c5fe1853f2aa3892b73e3bf56395f743aefde8605eeaa", 284 | "sha256:a58e61a2a01e5bcbe3b575c0099a2bcb8d70a75b1a087338e0c48dd6e01a5f15", 285 | "sha256:c09c47ff9750a8e3aa60ad169c4b95006d455a29b80ad0901f031a103b2991cd", 286 | "sha256:ca3a0b8c9110800e576d89b5337373e52018b41069bc879f12fa42b7eb2d0274", 287 | "sha256:cd1dc5c85b58494138a3917752b54bb1daa0045d234b7c132c37a61d5483ebad", 288 | "sha256:cdbc4c7f0cd7a2218b575844e970f05a1be1861c607b0e048c9bceca0c4d42f7", 289 | 
"sha256:d267125cc0f1e8a0eed6319ba4ac7477da9b78a535601c49ecd20c875576433a", 290 | "sha256:d72c55b5d56e176563b91d11952d13b01af8725c623e498db5507b6614fc1e10", 291 | "sha256:d95803193561a243cb0401b0567c6b7987d3f2a67046770e1dccd1c9e49a9780", 292 | "sha256:e92703bed0cc21d6cb5c61d66922b3b1564015ca8a51325bd164a5e33798d504", 293 | "sha256:f058bd0168271de4dcdc39845b52dd0a4a2fecf5f1246335f13f5e96eaebb467", 294 | "sha256:f3c19e5bd42bbe4bf345704ad7c326c74d3fd7a1b3844987853bef180be638d4" 295 | ], 296 | "markers": "python_version >= '3.5'", 297 | "version": "==20.3.0" 298 | }, 299 | "w3lib": { 300 | "hashes": [ 301 | "sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53", 302 | "sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df" 303 | ], 304 | "version": "==1.22.0" 305 | }, 306 | "zope.interface": { 307 | "hashes": [ 308 | "sha256:040f833694496065147e76581c0bf32b229a8b8c5eda120a0293afb008222387", 309 | "sha256:11198b44e4a3d8c7a80cc20bbdd65522258a4d82fe467cd310c9fcce8ffe2ed2", 310 | "sha256:121a9dccfe0c34be9c33b2c28225f0284f9b8e090580ffdff26c38fa16c7ffe1", 311 | "sha256:15f3082575e7e19581a80b866664f843719b647a7f7189c811ba7f9ab3309f83", 312 | "sha256:1d73d8986f948525536956ddd902e8a587a6846ebf4492117db16daba2865ddf", 313 | "sha256:208e82f73b242275b8566ac07a25158e7b21fa2f14e642a7881048430612d1a6", 314 | "sha256:2557833df892558123d791d6ff80ac4a2a0351f69c7421c7d5f0c07db72c8865", 315 | "sha256:25ea6906f9987d42546329d06f9750e69f0ee62307a2e7092955ed0758e64f09", 316 | "sha256:2c867914f7608674a555ac8daf20265644ac7be709e1da7d818089eebdfe544e", 317 | "sha256:2eadac20711a795d3bb7a2bfc87c04091cb5274d9c3281b43088a1227099b662", 318 | "sha256:37999d5ebd5d7bcd32438b725ca3470df05a7de8b1e9c0395bef24296b31ca99", 319 | "sha256:3ae8946d51789779f76e4fa326fd6676d8c19c1c3b4c4c5e9342807185264875", 320 | "sha256:5636cd7e60583b1608044ae4405e91575399430e66a5e1812f4bf30bcc55864e", 321 | "sha256:570e637cb6509998555f7e4af13006d89fad6c09cfc5c4795855385391063e4b", 322 | "sha256:590a40447ff3803c44050ce3c17c3958f11ca028dae3eacdd7b96775184394fa", 323 | "sha256:5aab51b9c1af1b8a84f40aa49ffe1684d41810b18d6c3e94aa50194e0a563f01", 324 | "sha256:5ffe4e0753393bcbcfc9a58133ed3d3a584634cc7cc2e667f8e3e6fbcbb2155d", 325 | "sha256:663982381bd428a275a841009e52983cc69c471a4979ce01344fadbf72cf353d", 326 | "sha256:6d06bf8e24dd6c473c4fbd8e16a83bd2e6d74add6ba25169043deb46d497b211", 327 | "sha256:6e5b9a4bf133cf1887b4a04c21c10ca9f548114f19c83957b2820d5c84254940", 328 | "sha256:70a2aed9615645bbe9d82c0f52bc7e676d2c0f8a63933d68418e0cb307f30536", 329 | "sha256:7750746421c4395e3d2cc3d805919f4f57bb9f2a9a0ccd955566a9341050a1b4", 330 | "sha256:7fc8708bc996e50fc7a9a2ad394e1f015348e389da26789fa6916630237143d7", 331 | "sha256:91abd2f080065a7c007540f6bbd93ef7bdbbffa6df4a4cfab3892d8623b83c98", 332 | "sha256:988f8b2281f3d95c66c01bdb141cefef1cc97db0d473c25c3fe2927ef00293b9", 333 | "sha256:9f56121d8a676802044584e6cc41250bbcde069d8adf725b9b817a6b0fd87f09", 334 | "sha256:a0f51536ce6e817a7aa25b0dca8b62feb210d4dc22cabfe8d1a92d47979372cd", 335 | "sha256:a1cdd7390d7f66ddcebf545203ca3728c4890d605f9f2697bc8e31437906e8e7", 336 | "sha256:b10eb4d0a77609679bf5f23708e20b1cd461a1643bd8ea42b1ca4149b1a5406c", 337 | "sha256:b274ac8e511b55ffb62e8292316bd2baa80c10e9fe811b1aa5ce81da6b6697d8", 338 | "sha256:c75b502af2c83fcfa2ee9c2257c1ba5806634a91a50db6129ff70e67c42c7e7b", 339 | "sha256:c9c8e53a5472b77f6a391b515c771105011f4b40740ce53af8428d1c8ca20004", 340 | "sha256:d867998a56c5133b9d31992beb699892e33b72150a8bf40f86cb52b8c606c83f", 341 | 
"sha256:eb566cab630ec176b2d6115ed08b2cf4d921b47caa7f02cca1b4a9525223ee94", 342 | "sha256:f61e6b95b414431ffe9dc460928fe9f351095fde074e2c2f5c6dda7b67a2192d", 343 | "sha256:f718675fd071bcce4f7cbf9250cbaaf64e2e91ef1b0b32a1af596e7412647556", 344 | "sha256:f9d4bfbd015e4b80dbad11c97049975f94592a6a0440e903ee647309f6252a1f", 345 | "sha256:fae50fc12a5e8541f6f1cc4ed744ca8f76a9543876cf63f618fb0e6aca8f8375", 346 | "sha256:fcf9c8edda7f7b2fd78069e97f4197815df5e871ec47b0f22580d330c6dec561", 347 | "sha256:fdedce3bc5360bd29d4bb90396e8d4d3c09af49bc0383909fe84c7233c5ee675" 348 | ], 349 | "version": "==5.1.2" 350 | } 351 | }, 352 | "develop": {} 353 | } 354 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Scrapy分布式RabbitMQ调度器 2 | ## 安装 3 | 4 | 使用pip安装 5 | 6 | ``` 7 | pip install scrapy-rabbitmq-scheduler 8 | ``` 9 | 或者克隆这个项目并且执行setup.py安装 10 | ``` 11 | python setup.py install 12 | ``` 13 | 14 | ## 使用 15 | ### 第一步: 在你的项目中的settings.py中添加配置项 16 | ``` 17 | # 指定项目的调度器 18 | SCHEDULER = "scrapy_rabbitmq_scheduler.scheduler.SaaS" 19 | 20 | # 指定rabbitmq的连接DSN 21 | RABBITMQ_CONNECTION_PARAMETERS = 'amqp://guest:guest@localhost:5672/' 22 | 23 | # 指定重试的http状态码(重新加回队列重试) 24 | SCHEDULER_REQUEUE_ON_STATUS = [500] 25 | 26 | # 指定下载器中间件, 确认任务是否成功 27 | DOWNLOADER_MIDDLEWARES = { 28 | 'scrapy_rabbitmq_scheduler.middleware.RabbitMQMiddleware': 999 29 | } 30 | # 指定item处理方式, item会加入到rabbitmq中 31 | ITEM_PIPELINES = { 32 | 'scrapy_rabbitmq_scheduler.pipelines.RabbitmqPipeline': 300, 33 | } 34 | ``` 35 | 36 | ### 第二步: 修改Spider的继承类 37 | ``` 38 | import scrapy 39 | from scrapy_rabbitmq_scheduler.spiders import RabbitSpider 40 | 41 | class CustomSpider(RabbitSpider): 42 | name = 'custom_spider' 43 | queue_name = 'test_urls' # 指定任务队列的名称 44 | items_key = 'test_item' # 指定item队列名称 45 | 46 | def parse(self, response): 47 | item = ... # parse item 48 | yield item 49 | ``` 50 | 51 | ### 第三步: 将任务写入到RabbitMQ队列 52 | ``` 53 | #!/usr/bin/env python 54 | import pika 55 | import settings 56 | 57 | connection = pika.BlockingConnection(pika.URLParameters(settings.RABBITMQ_CONNECTION_PARAMETERS)) 58 | channel = connection.channel() 59 | 60 | queue_key = 'test_urls' 61 | 62 | # 读取文件中的链接并写入到队列中 63 | with open('urls.txt') as f: 64 | for url in f: 65 | url = url.strip(' \n\r') 66 | channel.basic_publish(exchange='', 67 | routing_key=queue_key, 68 | body=url, 69 | properties=pika.BasicProperties( 70 | content_type='text/plain', 71 | delivery_mode=2 72 | )) 73 | 74 | connection.close() 75 | ``` 76 | urls.txt 77 | ```text 78 | http://www.baidu.com 79 | ``` 80 | ## 高级特色 81 | ### 1. 支持消息优先级 82 | 1. 消息优先级的范围为0~255, 数字越大, 优先级越高 83 | ```python 84 | yield scrapy.Request(url, priority=优先级) 85 | ``` 86 | 则可以直接指定消息的优先级 87 | 88 | ### 2. 队列持久化 89 | ```python 90 | # settings.py 91 | RABBITMQ_DURABLE = True # 是否持久化队列, True为持久化 False为非持久化, 默认True 92 | ``` 93 | 94 | ### 3. 消息确认 95 | ```python 96 | # settings.py 97 | RABBITMQ_CONFIRM_DELIVERY = True # 消息是否需要确认, True为需要, False为不需要, 默认是True 98 | ``` 99 | 100 | ### 4. 
## Advanced features
### 1. Message priority
Message priorities range from 0 to 255; the larger the number, the higher the priority.
```python
yield scrapy.Request(url, priority=priority_value)
```
sets the priority of the message directly.

### 2. Queue durability
```python
# settings.py
RABBITMQ_DURABLE = True # whether queues are durable; True = durable, False = non-durable, default True
```

### 3. Message acknowledgement
```python
# settings.py
RABBITMQ_CONFIRM_DELIVERY = True # whether messages require acknowledgement; True = required, False = not required, default True
```

### 4. Delayed messages
Message delays in scrapy-rabbitmq-scheduler are implemented with the `rabbitmq-delayed-message-exchange` plugin, so the plugin has to be installed and enabled on the broker (typically `rabbitmq-plugins enable rabbitmq_delayed_message_exchange`) before this feature can be used.
`rabbitmq-delayed-message-exchange`: https://github.com/rabbitmq/rabbitmq-delayed-message-exchange

**Enable the delay queue in the spider**
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy_rabbitmq_scheduler.spiders import RabbitSpider
from example.items import ArticleItem


class CcidcomSpider(RabbitSpider):
    ....
    # queue name
    queue_name = 'ccidcom'
    # whether this is a delay queue
    is_delay_queue = True
    ...
```
When `is_delay_queue` is set to True, delayed delivery is enabled automatically.

**Using a delay**
```python
yield scrapy.Request('http://www.ccidcom.com/', callback=self.parse, meta={'_delay_time': 10000})
```
Add `_delay_time` to the request meta to specify the delay in milliseconds; it takes effect automatically.

## TODO
- [x] Support delayed requests
- [x] Add queue durability configuration
-------------------------------------------------------------------------------- /examples/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.mirrors.ustc.edu.cn/simple/" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | scrapy = "*" 10 | scrapy-rabbitmq-scheduler = "*" 11 | 12 | [requires] 13 | python_version = "3.6" 14 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aox-lei/scrapy-rabbitmq-scheduler/4d5615a29be7db48ad7549b5f1607de6d7d866c9/examples/README.md -------------------------------------------------------------------------------- /examples/example/example/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import sys 4 | sys.path.append(os.path.abspath( 5 | os.path.dirname( 6 | os.path.dirname( 7 | os.path.dirname( 8 | os.path.dirname(__file__) 9 | ) 10 | )))+'/src') 11 | -------------------------------------------------------------------------------- /examples/example/example/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TestspiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | 16 | 17 | 18 | class ArticleItem(scrapy.Item): 19 | title = scrapy.Field() 20 | url = scrapy.Field() 21 | -------------------------------------------------------------------------------- /examples/example/example/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ExampleSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders.
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Request, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ExampleDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /examples/example/example/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ExamplePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /examples/example/example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for example project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'example' 13 | 14 | SPIDER_MODULES = ['example.spiders'] 15 | NEWSPIDER_MODULE = 'example.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'example (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'example.middlewares.ExampleSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'example.middlewares.ExampleDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'example.pipelines.ExamplePipeline': 300, 69 | #} 70 
| 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | # 指定调度器 93 | SCHEDULER = "scrapy_rabbitmq_scheduler.scheduler.SaaS" 94 | RABBITMQ_CONNECTION_PARAMETERS = 'amqp://guest:guest@localhost:5672/?heartbeat=0' 95 | SCHEDULER_REQUEUE_ON_STATUS = [500] 96 | DOWNLOADER_MIDDLEWARES = { 97 | 'scrapy_rabbitmq_scheduler.middleware.RabbitMQMiddleware': 999 98 | } 99 | ITEM_PIPELINES = { 100 | 'scrapy_rabbitmq_scheduler.pipelines.RabbitmqPipeline': 300, 101 | } 102 | LOG_LEVEL = 'INFO' 103 | -------------------------------------------------------------------------------- /examples/example/example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /examples/example/example/spiders/ccidcom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy_rabbitmq_scheduler.spiders import RabbitSpider 4 | from example.items import ArticleItem 5 | 6 | 7 | class CcidcomSpider(RabbitSpider): 8 | name = 'ccidcom' 9 | allowed_domains = ['ccidcom.com'] 10 | # 队列名称 11 | queue_name = 'ccidcom' 12 | # 是否是延迟队列 13 | is_delay_queue = True 14 | # item队列名称 15 | items_key = 'item_ccidcom' 16 | 17 | def start_requests(self): 18 | yield scrapy.Request('http://www.ccidcom.com/', callback=self.parse, meta={'_delay_time': 0}) 19 | 20 | def parse(self, response): 21 | navigation_list = response.css( 22 | '#nav > div.nav-main.clearfix > ul > li > div > a::attr("href")') 23 | for _index, _link in enumerate(navigation_list): 24 | yield response.follow(_link, 25 | dont_filter=True, 26 | callback=self.parse_list, meta={'_delay_time': 0}) 27 | 28 | def parse_list(self, response): 29 | article_list = response.css('div.article-item') 30 | for info in article_list: 31 | item = ArticleItem() 32 | item['title'] = info.css('div.title a>font::text').get() 33 | item['url'] = info.css('div.title a::attr("href")').get() 34 | yield item 35 | 36 | yield scrapy.Request('http://www.ccidcom.com/', callback=self.parse, meta={'_delay_time': 0}) 37 | -------------------------------------------------------------------------------- /examples/example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = example 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-*- 2 | from setuptools import setup, find_packages 3 | setup( 4 | # 以下为必需参数 5 | name='scrapy-rabbitmq-scheduler', # 模块名 6 | version='1.0.9', # 当前版本 7 | description='Rabbitmq for Distributed scraping', # 简短描述 8 | author='aox lei', 9 | author_email='2387813033@qq.com', 10 | license='MIT', 11 | url='https://github.com/aox-lei/scrapy-rabbitmq-scheduler', 12 | install_requires=[ 13 | 'pika', 14 | 'Scrapy>=0.14' 15 | ], 16 | packages=['scrapy_rabbitmq_scheduler'], 17 | package_dir={'': 'src'} 18 | ) 19 | -------------------------------------------------------------------------------- /src/scrapy_rabbitmq_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __title__ = 'scrapy-rabbitmq-ds' 3 | __version__ = '1.0.3' 4 | __author__ = 'aox.lei' 5 | -------------------------------------------------------------------------------- /src/scrapy_rabbitmq_scheduler/connection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pika 3 | 4 | 5 | def get_channel(connection, queue_name, durable=True, confirm_delivery=True, is_delay=False): 6 | """ Init method to return a prepared channel for consuming 7 | """ 8 | channel = connection.channel() 9 | channel.queue_declare(queue=queue_name, durable=durable, arguments={ 10 | 'x-max-priority': 255 11 | }) 12 | if 
confirm_delivery: 13 | channel.confirm_delivery() 14 | 15 | if is_delay is True: 16 | exchange_name = "{}-delay".format(queue_name) 17 | channel.exchange_declare(exchange_name, 18 | exchange_type="x-delayed-message", 19 | arguments={"x-delayed-type": "direct"}) 20 | channel.queue_bind( 21 | queue=queue_name, exchange=exchange_name, routing_key=queue_name) 22 | return channel 23 | 24 | 25 | def connect(connection_url): 26 | """ Create and return a fresh connection 27 | """ 28 | return pika.BlockingConnection(pika.URLParameters(connection_url)) 29 | -------------------------------------------------------------------------------- /src/scrapy_rabbitmq_scheduler/middleware.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import logging 3 | 4 | from scrapy.exceptions import IgnoreRequest 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class RabbitMQMiddleware(object): 10 | """ Middleware used to close message from current queue or 11 | send unsuccessful messages to be rescheduled. 12 | """ 13 | def __init__(self, settings): 14 | self.requeue_list = settings.get('SCHEDULER_REQUEUE_ON_STATUS', []) 15 | self.init = True 16 | 17 | @classmethod 18 | def from_settings(self, settings): 19 | return RabbitMQMiddleware(settings) 20 | 21 | @classmethod 22 | def from_crawler(self, crawler): 23 | return RabbitMQMiddleware(crawler.settings) 24 | 25 | def ensure_init(self, spider): 26 | if self.init: 27 | self.spider = spider 28 | self.scheduler = spider.crawler.engine.slot.scheduler 29 | self.stats = spider.crawler.stats 30 | self.init = False 31 | 32 | def process_response(self, request, response, spider): 33 | self.ensure_init(spider) 34 | if not is_a_picture(response): 35 | if response.status in self.requeue_list: 36 | self.requeue(response) 37 | self.ack(request, response) 38 | request.meta['requeued'] = True 39 | raise IgnoreRequest 40 | else: 41 | self.ack(request, response) 42 | else: 43 | self.process_picture(response) 44 | return response 45 | 46 | def has_delivery_tag(self, request): 47 | if self.spider.settings.get('RABBITMQ_CONFIRM_DELIVERY', True) is not True: 48 | return False 49 | if 'delivery_tag' not in request.meta: 50 | logger.error('Request %(request)s does not have a deliver tag.' 
% 51 | {'request': request}) 52 | return False 53 | return True 54 | 55 | def process_picture(self, response): 56 | logger.info('Picture (%(status)d): %(url)s', { 57 | 'url': response.url, 58 | 'status': response.status 59 | }) 60 | self.inc_stat('picture') 61 | 62 | def requeue(self, response): 63 | self.scheduler.requeue_message(response.url) 64 | logger.info('Requeued (%(status)d): %(url)s', { 65 | 'url': response.url, 66 | 'status': response.status 67 | }) 68 | self.inc_stat('requeued') 69 | 70 | def ack(self, request, response): 71 | if self.has_delivery_tag(request): 72 | delivery_tag = request.meta.get('delivery_tag') 73 | self.scheduler.ack_message(delivery_tag) 74 | logger.info('Acked (%(status)d): %(url)s' % { 75 | 'url': response.url, 76 | 'status': response.status 77 | }) 78 | self.inc_stat('acked') 79 | 80 | def inc_stat(self, stat): 81 | self.stats.inc_value('scheduler/acking/%(stat)s/rabbitmq' % 82 | {'stat': stat}, 83 | spider=self.spider) 84 | 85 | 86 | def is_a_picture(response): 87 | picture_exts = ['.png', '.jpg'] 88 | return any([response.url.endswith(ext) for ext in picture_exts]) 89 | -------------------------------------------------------------------------------- /src/scrapy_rabbitmq_scheduler/picklecompat.py: -------------------------------------------------------------------------------- 1 | """A pickle wrapper module with protocol=-1 by default.""" 2 | 3 | try: 4 | import cPickle as pickle # PY2 5 | except ImportError: 6 | import pickle 7 | 8 | 9 | def loads(s): 10 | return pickle.loads(s) 11 | 12 | 13 | def dumps(obj): 14 | return pickle.dumps(obj, protocol=-1) -------------------------------------------------------------------------------- /src/scrapy_rabbitmq_scheduler/pipelines.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from scrapy.utils.misc import load_object 3 | from scrapy.utils.serialize import ScrapyJSONEncoder 4 | from twisted.internet.threads import deferToThread 5 | 6 | from . import connection 7 | 8 | default_serialize = ScrapyJSONEncoder().encode 9 | 10 | logger = logging.getLogger('scrapy_rabbitmq_scheduler.pipeline.RabbitmqPipeline') 11 | 12 | class RabbitmqPipeline(object): 13 | def __init__(self, item_key, connection_url): 14 | self.server = connection.connect(connection_url) 15 | self.item_key = item_key 16 | self.serialize = default_serialize 17 | self.channel = connection.get_channel(self.server, self.item_key) 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | if hasattr(crawler.spider, 'items_key'): 22 | item_key = crawler.spider.items_key 23 | else: 24 | item_key = 'items_{spider_name}'.format( 25 | spider_name=crawler.spider.name) 26 | return cls(item_key=item_key, 27 | connection_url=crawler.settings.get( 28 | 'RABBITMQ_CONNECTION_PARAMETERS')) 29 | 30 | def process_item(self, item, spider): 31 | data = self.serialize(item) 32 | try_time = 1 33 | while try_time <= 10: 34 | try: 35 | self.channel.basic_publish(exchange='', 36 | routing_key=self.item_key, 37 | body=data) 38 | return 39 | except Exception as e: 40 | logger.exception(e) 41 | logger.error('process item failed! 
try_time:{}'.format(try_time)) 42 | try_time += 1 43 | self.channel = connection.get_channel(self.server, self.item_key) 44 | return item 45 | 46 | def close(self): 47 | """Close channel""" 48 | logger.error('pipeline channel is closed!!!!!!!!!!!') 49 | self.channel.close() 50 | -------------------------------------------------------------------------------- /src/scrapy_rabbitmq_scheduler/queue.py: -------------------------------------------------------------------------------- 1 | # system packages 2 | import sys 3 | import time 4 | import pika 5 | import logging 6 | from scrapy.utils.reqser import request_to_dict 7 | # module packages 8 | from . import connection 9 | from . import picklecompat 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class IQueue(object): 15 | """Per-spider queue/stack base class""" 16 | 17 | def __init__(self): 18 | """Init method""" 19 | raise NotImplementedError 20 | 21 | def __len__(self): 22 | """Return the length of the queue""" 23 | raise NotImplementedError 24 | 25 | def push(self, url): 26 | """Push an url""" 27 | raise NotImplementedError 28 | 29 | def pop(self, timeout=0): 30 | """Pop an url""" 31 | raise NotImplementedError 32 | 33 | def clear(self): 34 | """Clear queue/stack""" 35 | raise NotImplementedError 36 | 37 | 38 | class RabbitMQQueue(IQueue): 39 | """Per-spider FIFO queue""" 40 | 41 | def __init__(self, connection_url, key, exchange=None, spider=None): 42 | """Initialize per-spider RabbitMQ queue. 43 | 44 | Parameters: 45 | connection_url -- rabbitmq connection url 46 | key -- rabbitmq routing key 47 | """ 48 | self.key = key 49 | self.connection_url = connection_url 50 | self.server = None 51 | self.serializer = picklecompat 52 | self.spider = spider 53 | self.connect() 54 | 55 | def __len__(self): 56 | """Return the length of the queue""" 57 | declared = self.channel.queue_declare(self.key, passive=True) 58 | return declared.method.message_count 59 | 60 | def _try_operation(function): 61 | """Wrap unary method by reconnect procedure""" 62 | 63 | def wrapper(self, *args, **kwargs): 64 | try: 65 | return function(self, *args, **kwargs) 66 | except Exception as e: 67 | msg = 'Function %s failed. ErrorMsg... 
(%s)' %\ 68 | (str(function), e) 69 | logger.info(msg) 70 | 71 | return wrapper 72 | 73 | def _encode_request(self, request): 74 | """Encode a request object""" 75 | obj = request_to_dict(request, self.spider) 76 | return self.serializer.dumps(obj) 77 | 78 | @_try_operation 79 | def pop(self, no_ack=False): 80 | try_time = 1 81 | while try_time <= 10: 82 | """Pop a message""" 83 | try: 84 | return self.channel.basic_get(queue=self.key, auto_ack=no_ack) 85 | except Exception as e: 86 | try_time += 1 87 | logger.exception(e) 88 | logger.error( 89 | 'pop a message failed, trying: {}...'.format(try_time)) 90 | self.connect() 91 | 92 | @_try_operation 93 | def ack(self, delivery_tag): 94 | try_time = 1 95 | while try_time <= 3: 96 | try: 97 | """Ack a message""" 98 | self.channel.basic_ack(delivery_tag=delivery_tag) 99 | return 100 | except Exception as e: 101 | try_time += 1 102 | # logger.exception(e) 103 | logger.error( 104 | 'ask a message failed, trying: {}...'.format(try_time)) 105 | self.connect() 106 | 107 | @_try_operation 108 | def push(self, body, headers={}): 109 | """Push a message""" 110 | properties = pika.BasicProperties() 111 | properties.priority = body.priority 112 | if body.priority < 0 or body.priority > 255: 113 | properties.priority = 0 114 | 115 | # 处理延时消息 116 | if '_delay_time' in body.meta: 117 | headers['x-delay'] = body.meta.get('_delay_time') 118 | 119 | if hasattr(self.spider, 'is_delay_queue') and self.spider.is_delay_queue is True: 120 | exchange = '{}-delay'.format(self.key) 121 | else: 122 | exchange = '' 123 | properties.headers = headers 124 | 125 | try_time = 1 126 | while try_time <= 10: 127 | try: 128 | """Ack a message""" 129 | self.channel.basic_publish(exchange=exchange, 130 | routing_key=self.key, 131 | body=self._encode_request(body), 132 | properties=properties) 133 | return 134 | except Exception as e: 135 | try_time += 1 136 | logger.exception(e) 137 | logger.error( 138 | 'push a message failed, trying: {}...'.format(try_time)) 139 | self.connect() 140 | 141 | def connect(self): 142 | """Make a connection""" 143 | if self.server: 144 | try: 145 | self.server.close() 146 | except: 147 | pass 148 | 149 | self.server = connection.connect(self.connection_url) 150 | 151 | is_delay = False 152 | if hasattr(self.spider, 'is_delay_queue'): 153 | is_delay = self.spider.is_delay_queue 154 | 155 | self.channel = connection.get_channel( 156 | self.server, 157 | self.key, 158 | durable=self.spider.settings.get('RABBITMQ_DURABLE', True), 159 | confirm_delivery=self.spider.settings.get( 160 | 'RABBITMQ_CONFIRM_DELIVERY', True), 161 | is_delay=is_delay) 162 | 163 | def close(self): 164 | """Close channel""" 165 | logger.error('channel is closed!!!!!!!!!!!') 166 | self.channel.close() 167 | 168 | def clear(self): 169 | """Clear queue/stack""" 170 | self.channel.queue_purge(self.key) 171 | 172 | 173 | __all__ = ['SpiderQueue'] 174 | -------------------------------------------------------------------------------- /src/scrapy_rabbitmq_scheduler/scheduler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import signal 4 | import logging 5 | import pickle 6 | 7 | from scrapy.http import Request 8 | from . import connection 9 | from .queue import RabbitMQQueue 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class IScheduler(object): 15 | """ Base Scrapy scheduler class. 
""" 16 | 17 | def __init__(self): 18 | raise NotImplementedError 19 | 20 | def open(self, spider): 21 | """Start scheduling""" 22 | raise NotImplementedError 23 | 24 | def close(self, reason): 25 | """Stop scheduling""" 26 | raise NotImplementedError 27 | 28 | def enqueue_request(self, request): 29 | """Add request to queue""" 30 | raise NotImplementedError 31 | 32 | def next_request(self): 33 | """Pop a request""" 34 | raise NotImplementedError 35 | 36 | def has_pending_requests(self): 37 | """Check if queue is not empty""" 38 | raise NotImplementedError 39 | 40 | 41 | class Scheduler(IScheduler): 42 | # TODO: to be extended in future 43 | @staticmethod 44 | def _ensure_settings(settings, key): 45 | if not settings.get(key): 46 | msg = 'Please set "{}" at settings.'.format(key) 47 | raise ValueError(msg) 48 | 49 | 50 | repo_url = 'https://github.com/aox-lei/scrapy_rabbitmq' 51 | 52 | 53 | class RabbitMQScheduler(Scheduler): 54 | """ A RabbitMQ Scheduler for Scrapy. """ 55 | queue = None 56 | stats = None 57 | 58 | def __init__(self, connection_url, *args, **kwargs): 59 | self.connection_url = connection_url 60 | self.waiting = False 61 | self.closing = False 62 | 63 | @classmethod 64 | def from_settings(cls, settings): 65 | cls._ensure_settings(settings, 'RABBITMQ_CONNECTION_PARAMETERS') 66 | connection_url = settings.get('RABBITMQ_CONNECTION_PARAMETERS') 67 | return cls(connection_url) 68 | 69 | @classmethod 70 | def from_crawler(cls, crawler): 71 | scheduler = cls.from_settings(crawler.settings) 72 | scheduler.stats = crawler.stats 73 | signal.signal(signal.SIGINT, scheduler.on_sigint) 74 | return scheduler 75 | 76 | def __len__(self): 77 | return len(self.queue) 78 | 79 | def open(self, spider): 80 | if not hasattr(spider, '_make_request'): 81 | msg = 'Method _make_request not found in spider. ' 82 | msg += 'Please add it to spider or see manual at ' 83 | msg += repo_url 84 | raise NotImplementedError(msg) 85 | 86 | if not hasattr(spider, 'queue_name'): 87 | msg = 'Please set queue_name parameter to spider. 
' 88 | msg += 'Consult manual at ' + repo_url 89 | raise ValueError(msg) 90 | 91 | self.spider = spider 92 | self.queue = self._make_queue(spider.queue_name) 93 | msg_count = len(self.queue) 94 | if msg_count: 95 | logger.info( 96 | 'Resuming crawling ({} urls scheduled)'.format(msg_count)) 97 | else: 98 | logger.info('No items to crawl in {}'.format(self.queue.key)) 99 | 100 | def _make_queue(self, key): 101 | return RabbitMQQueue(self.connection_url, key, spider=self.spider) 102 | 103 | def on_sigint(self, signal, frame): 104 | self.closing = True 105 | 106 | def close(self, reason): 107 | try: 108 | logger.error('scheduler schchannel is closed!!!!!!!!!!!') 109 | self.queue.close() 110 | self.queue = None 111 | except: 112 | pass 113 | 114 | def enqueue_request(self, request): 115 | """ Enqueues request to main queues back 116 | """ 117 | if self.queue is not None: 118 | if self.stats: 119 | self.stats.inc_value('scheduler/enqueued/rabbitmq', 120 | spider=self.spider) 121 | self.queue.push(request) 122 | return True 123 | 124 | def next_request(self): 125 | """ Creates and returns a request to fire 126 | """ 127 | if self.closing: 128 | self.close('user close') 129 | return 130 | 131 | no_ack = True if self.spider.settings.get( 132 | 'RABBITMQ_CONFIRM_DELIVERY', True) is False else False 133 | mframe, hframe, body = self.queue.pop(no_ack=no_ack) 134 | 135 | if any([mframe, hframe, body]): 136 | self.waiting = False 137 | if self.stats: 138 | self.stats.inc_value('scheduler/dequeued/rabbitmq', 139 | spider=self.spider) 140 | 141 | request = self.spider._make_request(mframe, hframe, body) 142 | if self.spider.settings.get('RABBITMQ_CONFIRM_DELIVERY', True): 143 | request.meta['delivery_tag'] = mframe.delivery_tag 144 | 145 | logger.info('Running request {}'.format(request.url)) 146 | return request 147 | else: 148 | if not self.waiting: 149 | msg = 'Queue {} is empty. Waiting for messages...' 150 | self.waiting = True 151 | logger.info(msg.format(self.queue.key)) 152 | return None 153 | 154 | def has_pending_requests(self): 155 | return not self.closing 156 | 157 | 158 | class SaaS(RabbitMQScheduler): 159 | """ Scheduler as a RabbitMQ service. 
160 | """ 161 | 162 | def __init__(self, connection_url, *args, **kwargs): 163 | super(SaaS, self).__init__(connection_url, *args, **kwargs) 164 | 165 | def ack_message(self, delivery_tag): 166 | if self.queue: 167 | self.queue.ack(delivery_tag) 168 | 169 | def requeue_message(self, body, headers=None): 170 | if self.queue: 171 | self.queue.push(body, headers) 172 | -------------------------------------------------------------------------------- /src/scrapy_rabbitmq_scheduler/spiders.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import pickle 4 | from scrapy.utils.reqser import request_from_dict 5 | 6 | 7 | class RabbitSpider(scrapy.Spider): 8 | def _make_request(self, mframe, hframe, body): 9 | try: 10 | request = request_from_dict(pickle.loads(body), self) 11 | except Exception as e: 12 | body = body.decode() 13 | request = scrapy.Request(body, callback=self.parse, dont_filter=True) 14 | return request 15 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import pika 2 | connection = pika.BlockingConnection(pika.ConnectionParameters('localhost')) 3 | channel = connection.channel() 4 | channel.exchange_declare("test-x", exchange_type="x-delayed-message", 5 | arguments={"x-delayed-type": "direct"}) 6 | channel.queue_declare(queue='task_queue', durable=True) 7 | channel.queue_bind(queue="task_queue", exchange="test-x", 8 | routing_key="task_queue") 9 | channel.basic_publish( 10 | exchange='test-x', 11 | routing_key='task_queue', 12 | body='Hello World! Delayed', 13 | properties=pika.BasicProperties(headers={"x-delay": 10000}) 14 | ) 15 | print(" [x] Sent 'Hello World! Delayed'") 16 | connection.close() 17 | --------------------------------------------------------------------------------
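test.py above publishes a message through an `x-delayed-message` exchange, so it is only delivered after the `x-delay` interval (10 seconds here) and only works when the `rabbitmq-delayed-message-exchange` plugin is enabled on the broker. A minimal companion consumer sketch to verify the delayed delivery, assuming the same local broker and the durable `task_queue` declared by test.py:

```python
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
# Same declaration as in test.py, so the broker accepts it without a parameter mismatch.
channel.queue_declare(queue='task_queue', durable=True)


def callback(ch, method, properties, body):
    # The body arrives roughly 10 seconds after test.py publishes it.
    print(" [x] Received %r" % body)
    ch.basic_ack(delivery_tag=method.delivery_tag)


channel.basic_consume(queue='task_queue', on_message_callback=callback)
print(' [*] Waiting for delayed messages. To exit press CTRL+C')
channel.start_consuming()
```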