├── .gitignore
├── MANIFEST.in
├── Pipfile
├── Pipfile.lock
├── README.md
├── docker
│   ├── Shanghai
│   ├── custom_proxies_site.py
│   ├── deploy.py
│   ├── https_check.py
│   ├── pf.dockerfile
│   └── timezone
├── proxy_factory
│   ├── __init__.py
│   ├── factory.py
│   ├── proxy_site_spider.py
│   ├── settings.py
│   └── utils.py
├── requirements.txt
├── setup.py
└── tests
    └── test_site.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | .idea/
6 | # C extensions
7 | *.so
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # Jupyter Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # SageMath parsed files
79 | *.sage.py
80 | 
81 | # dotenv
82 | .env
83 | 
84 | # virtualenv
85 | .venv
86 | venv/
87 | ENV/
88 | 
89 | # Spyder project settings
90 | .spyderproject
91 | .spyproject
92 | 
93 | # Rope project settings
94 | .ropeproject
95 | 
96 | # mkdocs documentation
97 | /site
98 | 
99 | # mypy
100 | .mypy_cache/
101 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include requirements.txt
3 | include Pipfile
--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
1 | [[source]]
2 | name = "pypi"
3 | url = "https://pypi.org/simple"
4 | verify_ssl = true
5 | 
6 | [dev-packages]
7 | pytest = "*"
8 | pytest-apistellar = "*"
9 | pytest-cov = "*"
10 | 
11 | [packages]
12 | requests = "*"
13 | pytesseract = "*"
14 | redis = "~=2.9.0"
15 | bs4 = "*"
16 | Pillow = "*"
17 | toolkity = "~=1.9.0"
18 | 
19 | [requires]
20 | python_version = "3.6"
--------------------------------------------------------------------------------
/Pipfile.lock:
--------------------------------------------------------------------------------
1 | {
2 |     "_meta": {
3 |         "hash": {
4 |             "sha256": "cfc66072d29adbe5986699d7f8085fbef82c1f65a4394ea45ce6eec21ec8bdd7"
5 |         },
6 |         "pipfile-spec": 6,
7 |         "requires": {
8 |             "python_version": "3.6"
9 |         },
10 |         "sources": [
11 |             {
12 |                 "name": "pypi",
13 |                 "url": "https://pypi.org/simple",
14 |                 "verify_ssl": true
15 |             }
16 |         ]
17 |     },
18 |     "default": {
19 |         "appnope": {
20 |             "hashes": [
21 |                 "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
22 | 
"sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" 23 | ], 24 | "markers": "sys_platform == 'darwin'", 25 | "version": "==0.1.0" 26 | }, 27 | "backcall": { 28 | "hashes": [ 29 | "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", 30 | "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" 31 | ], 32 | "version": "==0.1.0" 33 | }, 34 | "beautifulsoup4": { 35 | "hashes": [ 36 | "sha256:05668158c7b85b791c5abde53e50265e16f98ad601c402ba44d70f96c4159612", 37 | "sha256:25288c9e176f354bf277c0a10aa96c782a6a18a17122dba2e8cec4a97e03343b", 38 | "sha256:f040590be10520f2ea4c2ae8c3dae441c7cfff5308ec9d58a0ec0c1b8f81d469" 39 | ], 40 | "version": "==4.8.0" 41 | }, 42 | "bs4": { 43 | "hashes": [ 44 | "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" 45 | ], 46 | "index": "pypi", 47 | "version": "==0.0.1" 48 | }, 49 | "certifi": { 50 | "hashes": [ 51 | "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", 52 | "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" 53 | ], 54 | "version": "==2019.9.11" 55 | }, 56 | "chardet": { 57 | "hashes": [ 58 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 59 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 60 | ], 61 | "version": "==3.0.4" 62 | }, 63 | "decorator": { 64 | "hashes": [ 65 | "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", 66 | "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" 67 | ], 68 | "version": "==4.4.0" 69 | }, 70 | "future": { 71 | "hashes": [ 72 | "sha256:67045236dcfd6816dc439556d009594abf643e5eb48992e36beac09c2ca659b8" 73 | ], 74 | "version": "==0.17.1" 75 | }, 76 | "idna": { 77 | "hashes": [ 78 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 79 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 80 | ], 81 | "version": "==2.8" 82 | }, 83 | "ipdb": { 84 | "hashes": [ 85 | "sha256:473fdd798a099765f093231a8b1fabfa95b0b682fce12de0c74b61a4b4d8ee57" 86 | ], 87 | "version": "==0.12.2" 88 | }, 89 | "ipython": { 90 | "hashes": [ 91 | "sha256:c4ab005921641e40a68e405e286e7a1fcc464497e14d81b6914b4fd95e5dee9b", 92 | "sha256:dd76831f065f17bddd7eaa5c781f5ea32de5ef217592cf019e34043b56895aa1" 93 | ], 94 | "version": "==7.8.0" 95 | }, 96 | "ipython-genutils": { 97 | "hashes": [ 98 | "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", 99 | "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" 100 | ], 101 | "version": "==0.2.0" 102 | }, 103 | "jedi": { 104 | "hashes": [ 105 | "sha256:786b6c3d80e2f06fd77162a07fed81b8baa22dde5d62896a790a331d6ac21a27", 106 | "sha256:ba859c74fa3c966a22f2aeebe1b74ee27e2a462f56d3f5f7ca4a59af61bfe42e" 107 | ], 108 | "version": "==0.15.1" 109 | }, 110 | "kafka-python": { 111 | "hashes": [ 112 | "sha256:08f83d8e0af2e64d25f94314d4bef6785b34e3b0df0effe9eebf76b98de66eeb", 113 | "sha256:3f55bb3e125764a37da550e9fa3d10a85fa09f8af8f8a40f223d2ec8486c2a5b" 114 | ], 115 | "version": "==1.4.6" 116 | }, 117 | "markdown": { 118 | "hashes": [ 119 | "sha256:2e50876bcdd74517e7b71f3e7a76102050edec255b3983403f1a63e7c8a41e7a", 120 | "sha256:56a46ac655704b91e5b7e6326ce43d5ef72411376588afa1dd90e881b83c7e8c" 121 | ], 122 | "version": "==3.1.1" 123 | }, 124 | "parso": { 125 | "hashes": [ 126 | "sha256:63854233e1fadb5da97f2744b6b24346d2750b85965e7e399bec1620232797dc", 127 | 
"sha256:666b0ee4a7a1220f65d367617f2cd3ffddff3e205f3f16a0284df30e774c2a9c" 128 | ], 129 | "version": "==0.5.1" 130 | }, 131 | "pexpect": { 132 | "hashes": [ 133 | "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", 134 | "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" 135 | ], 136 | "markers": "sys_platform != 'win32'", 137 | "version": "==4.7.0" 138 | }, 139 | "pickleshare": { 140 | "hashes": [ 141 | "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", 142 | "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" 143 | ], 144 | "version": "==0.7.5" 145 | }, 146 | "pillow": { 147 | "hashes": [ 148 | "sha256:0804f77cb1e9b6dbd37601cee11283bba39a8d44b9ddb053400c58e0c0d7d9de", 149 | "sha256:0ab7c5b5d04691bcbd570658667dd1e21ca311c62dcfd315ad2255b1cd37f64f", 150 | "sha256:0b3e6cf3ea1f8cecd625f1420b931c83ce74f00c29a0ff1ce4385f99900ac7c4", 151 | "sha256:365c06a45712cd723ec16fa4ceb32ce46ad201eb7bbf6d3c16b063c72b61a3ed", 152 | "sha256:38301fbc0af865baa4752ddae1bb3cbb24b3d8f221bf2850aad96b243306fa03", 153 | "sha256:3aef1af1a91798536bbab35d70d35750bd2884f0832c88aeb2499aa2d1ed4992", 154 | "sha256:3fe0ab49537d9330c9bba7f16a5f8b02da615b5c809cdf7124f356a0f182eccd", 155 | "sha256:45a619d5c1915957449264c81c008934452e3fd3604e36809212300b2a4dab68", 156 | "sha256:49f90f147883a0c3778fd29d3eb169d56416f25758d0f66775db9184debc8010", 157 | "sha256:571b5a758baf1cb6a04233fb23d6cf1ca60b31f9f641b1700bfaab1194020555", 158 | "sha256:5ac381e8b1259925287ccc5a87d9cf6322a2dc88ae28a97fe3e196385288413f", 159 | "sha256:6153db744a743c0c8c91b8e3b9d40e0b13a5d31dbf8a12748c6d9bfd3ddc01ad", 160 | "sha256:6fd63afd14a16f5d6b408f623cc2142917a1f92855f0df997e09a49f0341be8a", 161 | "sha256:70acbcaba2a638923c2d337e0edea210505708d7859b87c2bd81e8f9902ae826", 162 | "sha256:70b1594d56ed32d56ed21a7fbb2a5c6fd7446cdb7b21e749c9791eac3a64d9e4", 163 | "sha256:76638865c83b1bb33bcac2a61ce4d13c17dba2204969dedb9ab60ef62bede686", 164 | "sha256:7b2ec162c87fc496aa568258ac88631a2ce0acfe681a9af40842fc55deaedc99", 165 | "sha256:7cee2cef07c8d76894ebefc54e4bb707dfc7f258ad155bd61d87f6cd487a70ff", 166 | "sha256:7d16d4498f8b374fc625c4037742fbdd7f9ac383fd50b06f4df00c81ef60e829", 167 | "sha256:b50bc1780681b127e28f0075dfb81d6135c3a293e0c1d0211133c75e2179b6c0", 168 | "sha256:bd0582f831ad5bcad6ca001deba4568573a4675437db17c4031939156ff339fa", 169 | "sha256:cfd40d8a4b59f7567620410f966bb1f32dc555b2b19f82a91b147fac296f645c", 170 | "sha256:e3ae410089de680e8f84c68b755b42bc42c0ceb8c03dbea88a5099747091d38e", 171 | "sha256:e9046e559c299b395b39ac7dbf16005308821c2f24a63cae2ab173bd6aa11616", 172 | "sha256:ef6be704ae2bc8ad0ebc5cb850ee9139493b0fc4e81abcc240fb392a63ebc808", 173 | "sha256:f8dc19d92896558f9c4317ee365729ead9d7bbcf2052a9a19a3ef17abbb8ac5b" 174 | ], 175 | "index": "pypi", 176 | "version": "==6.1.0" 177 | }, 178 | "prompt-toolkit": { 179 | "hashes": [ 180 | "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780", 181 | "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1", 182 | "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55" 183 | ], 184 | "version": "==2.0.9" 185 | }, 186 | "ptyprocess": { 187 | "hashes": [ 188 | "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", 189 | "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" 190 | ], 191 | "version": "==0.6.0" 192 | }, 193 | "pygments": { 194 | "hashes": [ 195 | 
"sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127", 196 | "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297" 197 | ], 198 | "version": "==2.4.2" 199 | }, 200 | "pytesseract": { 201 | "hashes": [ 202 | "sha256:ae1dce01413d1f8eb0614fd65d831e26e649dc1a31699b7275455c57aa563b59" 203 | ], 204 | "index": "pypi", 205 | "version": "==0.3.0" 206 | }, 207 | "python-json-logger": { 208 | "hashes": [ 209 | "sha256:b7a31162f2a01965a5efb94453ce69230ed208468b0bbc7fdfc56e6d8df2e281" 210 | ], 211 | "version": "==0.1.11" 212 | }, 213 | "redis": { 214 | "hashes": [ 215 | "sha256:af9747ec2727425b1b09252975e21502ee5a3d8d235c7f49869eb13e09ccf4e4" 216 | ], 217 | "index": "pypi", 218 | "version": "==2.9.1" 219 | }, 220 | "requests": { 221 | "hashes": [ 222 | "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", 223 | "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" 224 | ], 225 | "index": "pypi", 226 | "version": "==2.22.0" 227 | }, 228 | "six": { 229 | "hashes": [ 230 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 231 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 232 | ], 233 | "version": "==1.12.0" 234 | }, 235 | "soupsieve": { 236 | "hashes": [ 237 | "sha256:605f89ad5fdbfefe30cdc293303665eff2d188865d4dbe4eb510bba1edfbfce3", 238 | "sha256:b91d676b330a0ebd5b21719cb6e9b57c57d433671f65b9c28dd3461d9a1ed0b6" 239 | ], 240 | "version": "==1.9.4" 241 | }, 242 | "toolkity": { 243 | "hashes": [ 244 | "sha256:9752aaec7610aaef7d6e9b5ab779de9d6dba9603b8a71a9fa6a71adff92aa4bf" 245 | ], 246 | "index": "pypi", 247 | "version": "==1.9.0" 248 | }, 249 | "traitlets": { 250 | "hashes": [ 251 | "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", 252 | "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" 253 | ], 254 | "version": "==4.3.2" 255 | }, 256 | "urllib3": { 257 | "hashes": [ 258 | "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398", 259 | "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86" 260 | ], 261 | "version": "==1.25.6" 262 | }, 263 | "wcwidth": { 264 | "hashes": [ 265 | "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", 266 | "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" 267 | ], 268 | "version": "==0.1.7" 269 | } 270 | }, 271 | "develop": { 272 | "atomicwrites": { 273 | "hashes": [ 274 | "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", 275 | "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" 276 | ], 277 | "version": "==1.3.0" 278 | }, 279 | "attrs": { 280 | "hashes": [ 281 | "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", 282 | "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" 283 | ], 284 | "version": "==19.1.0" 285 | }, 286 | "coverage": { 287 | "hashes": [ 288 | "sha256:08907593569fe59baca0bf152c43f3863201efb6113ecb38ce7e97ce339805a6", 289 | "sha256:0be0f1ed45fc0c185cfd4ecc19a1d6532d72f86a2bac9de7e24541febad72650", 290 | "sha256:141f08ed3c4b1847015e2cd62ec06d35e67a3ac185c26f7635f4406b90afa9c5", 291 | "sha256:19e4df788a0581238e9390c85a7a09af39c7b539b29f25c89209e6c3e371270d", 292 | "sha256:23cc09ed395b03424d1ae30dcc292615c1372bfba7141eb85e11e50efaa6b351", 293 | "sha256:245388cda02af78276b479f299bbf3783ef0a6a6273037d7c60dc73b8d8d7755", 294 | "sha256:331cb5115673a20fb131dadd22f5bcaf7677ef758741312bee4937d71a14b2ef", 295 | 
"sha256:386e2e4090f0bc5df274e720105c342263423e77ee8826002dcffe0c9533dbca", 296 | "sha256:3a794ce50daee01c74a494919d5ebdc23d58873747fa0e288318728533a3e1ca", 297 | "sha256:60851187677b24c6085248f0a0b9b98d49cba7ecc7ec60ba6b9d2e5574ac1ee9", 298 | "sha256:63a9a5fc43b58735f65ed63d2cf43508f462dc49857da70b8980ad78d41d52fc", 299 | "sha256:6b62544bb68106e3f00b21c8930e83e584fdca005d4fffd29bb39fb3ffa03cb5", 300 | "sha256:6ba744056423ef8d450cf627289166da65903885272055fb4b5e113137cfa14f", 301 | "sha256:7494b0b0274c5072bddbfd5b4a6c6f18fbbe1ab1d22a41e99cd2d00c8f96ecfe", 302 | "sha256:826f32b9547c8091679ff292a82aca9c7b9650f9fda3e2ca6bf2ac905b7ce888", 303 | "sha256:93715dffbcd0678057f947f496484e906bf9509f5c1c38fc9ba3922893cda5f5", 304 | "sha256:9a334d6c83dfeadae576b4d633a71620d40d1c379129d587faa42ee3e2a85cce", 305 | "sha256:af7ed8a8aa6957aac47b4268631fa1df984643f07ef00acd374e456364b373f5", 306 | "sha256:bf0a7aed7f5521c7ca67febd57db473af4762b9622254291fbcbb8cd0ba5e33e", 307 | "sha256:bf1ef9eb901113a9805287e090452c05547578eaab1b62e4ad456fcc049a9b7e", 308 | "sha256:c0afd27bc0e307a1ffc04ca5ec010a290e49e3afbe841c5cafc5c5a80ecd81c9", 309 | "sha256:dd579709a87092c6dbee09d1b7cfa81831040705ffa12a1b248935274aee0437", 310 | "sha256:df6712284b2e44a065097846488f66840445eb987eb81b3cc6e4149e7b6982e1", 311 | "sha256:e07d9f1a23e9e93ab5c62902833bf3e4b1f65502927379148b6622686223125c", 312 | "sha256:e2ede7c1d45e65e209d6093b762e98e8318ddeff95317d07a27a2140b80cfd24", 313 | "sha256:e4ef9c164eb55123c62411f5936b5c2e521b12356037b6e1c2617cef45523d47", 314 | "sha256:eca2b7343524e7ba246cab8ff00cab47a2d6d54ada3b02772e908a45675722e2", 315 | "sha256:eee64c616adeff7db37cc37da4180a3a5b6177f5c46b187894e633f088fb5b28", 316 | "sha256:ef824cad1f980d27f26166f86856efe11eff9912c4fed97d3804820d43fa550c", 317 | "sha256:efc89291bd5a08855829a3c522df16d856455297cf35ae827a37edac45f466a7", 318 | "sha256:fa964bae817babece5aa2e8c1af841bebb6d0b9add8e637548809d040443fee0", 319 | "sha256:ff37757e068ae606659c28c3bd0d923f9d29a85de79bf25b2b34b148473b5025" 320 | ], 321 | "version": "==4.5.4" 322 | }, 323 | "future": { 324 | "hashes": [ 325 | "sha256:67045236dcfd6816dc439556d009594abf643e5eb48992e36beac09c2ca659b8" 326 | ], 327 | "version": "==0.17.1" 328 | }, 329 | "importlib-metadata": { 330 | "hashes": [ 331 | "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26", 332 | "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af" 333 | ], 334 | "markers": "python_version < '3.8'", 335 | "version": "==0.23" 336 | }, 337 | "more-itertools": { 338 | "hashes": [ 339 | "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", 340 | "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4" 341 | ], 342 | "version": "==7.2.0" 343 | }, 344 | "packaging": { 345 | "hashes": [ 346 | "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47", 347 | "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108" 348 | ], 349 | "version": "==19.2" 350 | }, 351 | "pluggy": { 352 | "hashes": [ 353 | "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6", 354 | "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34" 355 | ], 356 | "version": "==0.13.0" 357 | }, 358 | "py": { 359 | "hashes": [ 360 | "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", 361 | "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" 362 | ], 363 | "version": "==1.8.0" 364 | }, 365 | "pyaop": { 366 | "hashes": [ 367 | 
"sha256:cc57f5d292ab186e61016bbdb9636edb8bf05306fb871f0a0e9e41a9c28f9f89" 368 | ], 369 | "version": "==0.0.7" 370 | }, 371 | "pyparsing": { 372 | "hashes": [ 373 | "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80", 374 | "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4" 375 | ], 376 | "version": "==2.4.2" 377 | }, 378 | "pytest": { 379 | "hashes": [ 380 | "sha256:813b99704b22c7d377bbd756ebe56c35252bb710937b46f207100e843440b3c2", 381 | "sha256:cc6620b96bc667a0c8d4fa592a8c9c94178a1bd6cc799dbb057dfd9286d31a31" 382 | ], 383 | "index": "pypi", 384 | "version": "==5.1.3" 385 | }, 386 | "pytest-apistellar": { 387 | "hashes": [ 388 | "sha256:469318f23f4a9faf37f0c3a4477c832275f389e471d24ed662f0bae52d4eb6e9" 389 | ], 390 | "index": "pypi", 391 | "version": "==0.2.0" 392 | }, 393 | "pytest-cov": { 394 | "hashes": [ 395 | "sha256:2b097cde81a302e1047331b48cadacf23577e431b61e9c6f49a1170bbe3d3da6", 396 | "sha256:e00ea4fdde970725482f1f35630d12f074e121a23801aabf2ae154ec6bdd343a" 397 | ], 398 | "index": "pypi", 399 | "version": "==2.7.1" 400 | }, 401 | "six": { 402 | "hashes": [ 403 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 404 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 405 | ], 406 | "version": "==1.12.0" 407 | }, 408 | "wcwidth": { 409 | "hashes": [ 410 | "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", 411 | "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" 412 | ], 413 | "version": "==0.1.7" 414 | }, 415 | "zipp": { 416 | "hashes": [ 417 | "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", 418 | "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" 419 | ], 420 | "version": "==0.6.0" 421 | } 422 | } 423 | } 424 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 自动从网上抓取免费代理,并对代理的可用性和匿名性进行检查,同时定时检查有效代理和无效代理,对于多次检查始终无效的代理,做放弃处理。同时检查函数可以自定义指定,用来针对不同的检查结果做出不同的反应。当然代理网站也可以自定义,简单的几行代码几条配置信息,实现最大限度的free-style。 2 | 3 | # INSTALL 4 | 5 | ```angular2html 6 | # python3 以上版本 7 | pip install proxy-factory 8 | # 依赖 redis(必须), tesseract-ocr(可选) 9 | ``` 10 | 11 | # USAGE 12 | ```angular2html 13 | mashichaodeMac-mini:toolkit mashichao$ product -h 14 | usage: product [-h] [-cm CHECK_METHOD] [-sm SPIDER_MODULE] [--console] 15 | [--console-host CONSOLE_HOST] [--console-port CONSOLE_PORT] 16 | [-s SETTINGS] [-ls LOCALSETTINGS] [-d] 17 | [{stop,start,restart,status}] 18 | 19 | positional arguments: 20 | {stop,start,restart,status} 21 | 22 | optional arguments: 23 | -h, --help show this help message and exit 24 | -cm CHECK_METHOD, --check-method CHECK_METHOD 25 | proivde a check method to check proxies. 26 | eg:module.func 27 | -sm SPIDER_MODULE, --spider-module SPIDER_MODULE 28 | proivde a module contains proxy site spider methods. 29 | eg:module1.module2 30 | --console start a console. 31 | --console-host CONSOLE_HOST 32 | console host. 33 | --console-port CONSOLE_PORT 34 | console port. 35 | -s SETTINGS, --settings SETTINGS 36 | Setting module. 37 | -ls LOCALSETTINGS, --localsettings LOCALSETTINGS 38 | Local setting module. 
53 | # CONFIG
54 | 
55 | ### CUSTOM CHECK
56 | ```python
57 | def check(self, proxy):
58 |     """
59 |     Custom check method.
60 |     :param self: the ProxyFactory instance
61 |     :param proxy: the proxy to test, as "ip:port"
62 |     :return: True if the proxy is usable, otherwise False
63 |     """
64 |     import requests
65 |     resp = requests.get("http://2017.ip138.com/ic.asp", proxies={"http": "http://%s" % proxy})
66 |     self.logger.info(resp.text)
67 |     # ... further custom validation goes here ...
68 |     return resp.status_code < 300
69 | ```
70 | ### CUSTOM PROXY SITE METHOD
71 | 
72 | ```python
73 | from bs4 import BeautifulSoup
74 | 
75 | from proxy_factory.utils import get_html
76 | 
77 | 
78 | def fetch_custom(self, page=5):
79 |     """
80 |     Custom proxy-site scraper.
81 |     :param self: the ProxyFactory instance
82 |     :param page: optional arguments can carry extra knobs like this, but the method itself is called with self only
83 |     :return: a set of proxies, each as "ip:port"
84 |     """
85 |     proxies = set()
86 |     url_tmpl = "http://www.kxdaili.com/dailiip/1/%d.html"
87 |     for page_num in range(page):
88 |         url = url_tmpl % (page_num + 1)
89 |         soup = BeautifulSoup(get_html(url, self.headers), "html")
90 |         table_tag = soup.find("table", attrs={"class": "segment"})
91 |         trs = table_tag.tbody.find_all("tr")
92 |         for tr in trs:
93 |             tds = tr.find_all("td")
94 |             ip = tds[0].text
95 |             port = tds[1].text
96 |             latency = tds[4].text.split(" ")[0]
97 |             if float(latency) < 0.5:  # keep proxies with latency under 0.5 seconds
98 |                 proxy = "%s:%s" % (ip, port)
99 |                 proxies.add(proxy)
100 |     return proxies
101 | ```
102 | 
103 | ### SETTINGS
104 | 
105 | ```python
106 | REDIS_HOST = "0.0.0.0"
107 | 
108 | REDIS_PORT = 6379
109 | 
110 | # how often to re-check bad proxies (seconds)
111 | BAD_CHECK_INTERVAL = 60
112 | 
113 | # max consecutive failed checks before a bad proxy is abandoned
114 | FAILED_TIMES = 5
115 | 
116 | # how often to re-check good proxies (seconds)
117 | GOOD_CHECK_INTERVAL = 60
118 | 
119 | # how often to fetch new proxies (seconds)
120 | FETCH_INTERVAL = 60
121 | 
122 | # redis set holding valid proxies
123 | GOOD_PROXY_SET = "good_proxies"
124 | 
125 | # redis hash holding invalid proxies and their failure counts
126 | BAD_PROXY_HASH = "bad_proxies"
127 | 
128 | 
129 | ```
130 | 
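Consumers of the factory do not need this package at all; they only read Redis. The sketch below is illustrative and not part of the package. The key name and port are the defaults shown above, and the host is assumed to be local:

```python
import random

from redis import Redis

redis_conn = Redis("127.0.0.1", 6379)  # point this at the proxy-factory redis


def get_proxy():
    """Return a random usable proxy as "ip:port", or None if none are known yet."""
    proxies = redis_conn.smembers("good_proxies")  # GOOD_PROXY_SET
    return random.choice(list(proxies)).decode() if proxies else None


proxy = get_proxy()
if proxy:
    # ready to plug into requests.get(..., proxies=request_proxies)
    request_proxies = {"http": "http://%s" % proxy}
```
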
131 | References:
132 | [一键获取免费真实的匿名代理 (get free, genuinely anonymous proxies in one click)](https://zhuanlan.zhihu.com/p/31421147?group_id=918195817936896000)
--------------------------------------------------------------------------------
/docker/Shanghai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShichaoMa/proxy_factory/44ba2ca108f41f945e730ee17038efb90c646f17/docker/Shanghai
--------------------------------------------------------------------------------
/docker/custom_proxies_site.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | 
3 | from proxy_factory.utils import get_html
4 | 
5 | 
6 | def fetch_custom(self, page=5):
7 |     """
8 |     Custom proxy-site scraper.
9 |     :param self: the ProxyFactory instance
10 |     :param page: optional arguments can carry extra knobs like this, but the method itself is called with self only
11 |     :return: a set of proxies, each as "ip:port"
12 |     """
13 |     proxies = set()
14 |     url_tmpl = "http://www.kxdaili.com/dailiip/1/%d.html"
15 |     for page_num in range(page):
16 |         url = url_tmpl % (page_num + 1)
17 |         soup = BeautifulSoup(get_html(url, self.headers), "html")
18 |         table_tag = soup.find("table", attrs={"class": "segment"})
19 |         trs = table_tag.tbody.find_all("tr")
20 |         for tr in trs:
21 |             tds = tr.find_all("td")
22 |             ip = tds[0].text
23 |             port = tds[1].text
24 |             latency = tds[4].text.split(" ")[0]
25 |             if float(latency) < 0.5:  # keep proxies with latency under 0.5 seconds
26 |                 proxy = "%s:%s" % (ip, port)
27 |                 proxies.add(proxy)
28 |     return proxies
--------------------------------------------------------------------------------
/docker/deploy.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import os
3 | 
4 | os.system("docker build -f pf.dockerfile -t cnaafhvk/proxy-factory:latest .")
5 | os.system("docker push cnaafhvk/proxy-factory")
6 | 
--------------------------------------------------------------------------------
/docker/https_check.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 | import json
4 | 
5 | my_ip = json.loads(requests.get("https://httpbin.org/ip").text)["origin"]
6 | 
7 | 
8 | def https_check(self, proxy):
9 |     """
10 |     Custom check method.
11 |     :param self: the ProxyFactory instance
12 |     :param proxy: the proxy to test, as "ip:port"
13 |     :return: True if the proxy is usable, otherwise False
14 |     """
15 |     resp = requests.get("http://www.whatismyip.com.tw/",
16 |                         headers=self.headers, timeout=10, proxies={"http": "http://%s" % proxy})
17 |     ip, real_ip = re.search(r'"ip": "(.*?)"[\s\S]+"ip-real": "(.*?)",', resp.text).groups()
18 |     if resp.status_code < 300 and not real_ip:
19 |         requests.head("https://httpbin.org/ip", timeout=10, proxies={"https": "http://%s" % proxy})
20 |         self.logger.debug("IP: %s. Real IP: %s. Proxy: %s" % (ip, real_ip, proxy))
21 |         return True
22 |     return False
23 | 
--------------------------------------------------------------------------------
/docker/pf.dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu
2 | 
3 | MAINTAINER Shichao Ma
4 | 
5 | COPY Shanghai /etc/localtime
6 | 
7 | COPY timezone /etc
8 | 
9 | RUN apt-get clean && apt-get update
10 | 
11 | RUN apt-get install -y locales
12 | 
13 | RUN locale-gen en_US.UTF-8
14 | 
15 | RUN update-locale LANG=en_US.UTF-8
16 | 
17 | ENV LANG en_US.UTF-8
18 | 
19 | RUN apt-get install -y --no-install-recommends gcc make wget
20 | 
21 | RUN apt-get install -y --no-install-recommends tesseract-ocr
22 | 
23 | RUN apt-get install -y --no-install-recommends python3-dev
24 | 
25 | RUN wget https://bootstrap.pypa.io/get-pip.py --no-check-certificate
26 | 
27 | RUN python3 get-pip.py
28 | 
29 | RUN pip install proxy-factory==0.2.7
30 | 
31 | RUN mkdir /app
32 | 
33 | COPY custom_proxies_site.py /app/custom_proxies_site.py
34 | 
35 | COPY https_check.py /app/https_check.py
36 | 
37 | WORKDIR /app
38 | 
--------------------------------------------------------------------------------
/docker/timezone:
--------------------------------------------------------------------------------
1 | Asia/Shanghai
--------------------------------------------------------------------------------
/proxy_factory/__init__.py:
--------------------------------------------------------------------------------
1 | from .factory import main
2 | 
3 | 
4 | __version__ = "0.3.3"
--------------------------------------------------------------------------------
/proxy_factory/factory.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import sys
4 | import time
5 | import requests
6 | import traceback
7 | 
8 | from os import getcwd
9 | from redis import Redis
10 | from threading import Thread
11 | from functools import partial
12 | from argparse import ArgumentParser
13 | 
14 | from toolkit import load, re_search
15 | from toolkit.settings import SettingsLoader
16 | from toolkit.service.monitors import ParallelMonitor
17 | from toolkit.tools.managers import Blocker, ExceptContext
18 | from toolkit.service.plugins import CommandlinePluginProxy, Supervisor, Console
19 | from toolkit.structures.thread_safe_collections import ThreadSafeSet, TreadSafeDict
20 | 
21 | from . import proxy_site_spider
22 | from .utils import exception_wrapper
23 | from . import settings
24 | 
25 | 
26 | class ProxyFactory(ParallelMonitor):
27 |     name = "proxy_factory"
28 |     proxy_methods = dict()
29 |     parser = ArgumentParser(conflict_handler="resolve")
30 |     supervisor = CommandlinePluginProxy(Supervisor, parser)
31 |     console = CommandlinePluginProxy(Console, parser)
32 | 
33 |     def __init__(self):
34 |         """
35 |         Initialize the logger and the redis connection.
36 |         """
37 |         self.enrich_parser_arguments()
38 |         args = self.parser.parse_args()
39 |         cwd = getcwd()
40 |         sys.path.insert(0, cwd)
41 |         self.settings = SettingsLoader().load(args.localsettings, args.settings)
42 |         self.headers = self.settings.HEADERS
43 |         self.proxies_check_in_channel = ThreadSafeSet()
44 |         self.proxies_check_out_channel = TreadSafeDict()
45 |         self.load_site(proxy_site_spider)
46 |         self.load_site(args.spider_module)
47 |         self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
48 |                                 self.settings.get_int("REDIS_PORT"))
49 |         if args.check_method:
50 |             self.is_anonymous = partial(load(args.check_method), self)  # swap in the user-supplied check
51 |         super().__init__()
52 |         self.supervisor.control(log_path=os.path.join(cwd, self.name) + ".log")
53 |         self.console.init_console()
54 |         self.my_ip = requests.get("https://httpbin.org/ip").json()["origin"]
55 | 
56 |     def log_err(self, func_name, *args):
57 |         self.logger.error("Error in %s: %s. " % (
58 |             func_name, "".join(traceback.format_exception(*args))))
59 |         return True
60 | 
61 |     def load_site(self, module_str):
62 |         if module_str:
63 |             if isinstance(module_str, str):
64 |                 mod = load(module_str)
65 |             else:
66 |                 mod = module_str
67 |             for key, func in vars(mod).items():
68 |                 if not key.startswith("fetch"):
69 |                     continue
70 |                 self.proxy_methods[key] = partial(exception_wrapper(func), self)
71 | 
72 |     def is_anonymous(self, proxy):
73 |         url = "http://www.98bk.com/cycx/ip1/"
74 |         resp = requests.get(url, timeout=10, headers=self.headers,
75 |                             proxies={"http": "http://%s" % proxy})
76 |         buf = resp.text.encode("iso-8859-1").decode("gbk")
77 |         real_ip = re_search(r"您的真实IP是([\d\.]+)", buf)
78 |         self.logger.info(f"My ip: {self.my_ip}, Real ip: {real_ip}")
79 |         return real_ip == "" or not self.my_ip.count(real_ip)
80 | 
81 |     def check(self, proxy, good):
82 |         """
83 |         Check whether a proxy is usable; anonymous proxies are added to `good`.
84 |         """
85 |         with ExceptContext(errback=lambda *args: True):
86 |             if self.is_anonymous(proxy):
87 |                 good.add(proxy)
88 | 
89 |     def check_proxies(self):
90 |         """
91 |         Check the proxies waiting in the check-in channel.
92 |         :return:
93 |         """
94 |         self.logger.debug("Start check thread. ")
") 95 | 96 | threads = dict() 97 | good = set() 98 | while self.alive: 99 | if len(self.proxies_check_in_channel): 100 | proxy = self.proxies_check_in_channel.pop() 101 | else: 102 | proxy = None 103 | if isinstance(proxy, bytes): 104 | proxy = proxy.decode() 105 | if len(threads) < 150 and proxy: 106 | th = Thread(target=self.check, args=(proxy, good)) 107 | th.setDaemon(True) 108 | th.start() 109 | threads[time.time()] = (th, proxy) 110 | time.sleep(.001) 111 | else: 112 | time.sleep(1) 113 | for start_time, (th, proxy) in threads.copy().items(): 114 | if start_time + 60 < time.time() or not th.is_alive(): 115 | del threads[start_time] 116 | self.proxies_check_out_channel[proxy] = proxy in good 117 | good.discard(proxy) 118 | 119 | self.logger.debug("Stop check thread. ") 120 | 121 | def bad_source(self): 122 | """ 123 | 每隔指定时间间隔将无效代理放到待检查队列进行检查 124 | :return: 125 | """ 126 | self.logger.debug("Start bad source thread. ") 127 | while self.alive: 128 | if len(self.proxies_check_in_channel): 129 | continue 130 | 131 | with ExceptContext(errback=self.log_err): 132 | proxies = self.redis_conn.hgetall( 133 | self.settings.get("BAD_PROXY_HASH", "bad_proxies")) 134 | if proxies: 135 | self.logger.debug( 136 | f"Bad proxy count is: {len(proxies)}, ready to check.") 137 | while proxies: 138 | proxy, times = proxies.popitem() 139 | self.proxies_check_in_channel.add(proxy) 140 | 141 | Blocker(self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5)).\ 142 | wait_timeout_or_notify(notify=lambda: not self.alive) 143 | 144 | self.logger.debug("Stop bad source thread. ") 145 | 146 | def good_source(self): 147 | """ 148 | 每隔指定时间间隔将有效代理放到待检查队列进行检查 149 | :return: 150 | """ 151 | self.logger.debug("Start good source thread. ") 152 | while self.alive: 153 | with ExceptContext(errback=self.log_err): 154 | proxies = self.redis_conn.smembers( 155 | self.settings.get("GOOD_PROXY_SET", "good_proxies")) 156 | if proxies: 157 | self.logger.debug( 158 | f"Good proxy count is: {len(proxies)}, ready to check.") 159 | self.proxies_check_in_channel.update(proxies) 160 | Blocker(self.settings.get_int("GOOD_CHECK_INTERVAL", 60 * 5)).\ 161 | wait_timeout_or_notify(notify=lambda: not self.alive) 162 | 163 | self.logger.debug("Stop good source thread. ") 164 | 165 | def reset_proxies(self): 166 | """ 167 | 分发有效代理和无效代理 168 | :return: 169 | """ 170 | self.logger.debug("Start resets thread. ") 171 | while self.alive: 172 | with ExceptContext(errback=self.log_err): 173 | proxies = list(self.proxies_check_out_channel.pop_all()) 174 | if proxies: 175 | self.logger.debug(f"Got {len(proxies)} proxies to reset.") 176 | bp = self.settings.get("BAD_PROXY_HASH", "bad_proxies") 177 | gp = self.settings.get("GOOD_PROXY_SET", "good_proxies") 178 | for proxy, good in proxies: 179 | if good: 180 | self.redis_conn.sadd(gp, proxy) 181 | self.redis_conn.hdel(bp, proxy) 182 | else: 183 | count = self.redis_conn.hincrby(bp, proxy) 184 | if count > self.settings.get_int("FAILED_TIMES", 5): 185 | self.redis_conn.hdel(bp, proxy) 186 | self.logger.debug( 187 | f"Abandon {proxy} of failed {count} times.") 188 | self.redis_conn.srem(gp, proxy) 189 | else: 190 | time.sleep(1) 191 | time.sleep(1) 192 | self.logger.debug("Stop resets thread. ") 193 | 194 | def gen_thread(self, target, name=None, args=(), kwargs=None): 195 | thread = Thread(target=target, name=name, args=args, kwargs=kwargs) 196 | thread.setDaemon(True) 197 | thread.start() 198 | self.children.append(thread) 199 | 200 | def start(self): 201 | self.logger.debug("Start proxy factory. 
") 202 | self.gen_thread(self.check_proxies) 203 | self.gen_thread(self.bad_source) 204 | self.gen_thread(self.good_source) 205 | self.gen_thread(self.reset_proxies) 206 | 207 | while self.alive or any(th for th in self.children if th.is_alive()): 208 | with ExceptContext(errback=self.log_err): 209 | if self.alive: 210 | self.logger.debug("Start to fetch proxies. ") 211 | proxies = self.fetch_all() 212 | self.logger.debug("%s proxies found. " % len(proxies)) 213 | self.proxies_check_in_channel.update(proxies) 214 | 215 | Blocker(self.settings.get_int("FETCH_INTERVAL", 10 * 60)).\ 216 | wait_timeout_or_notify(notify=lambda: not self.alive) 217 | self.logger.debug("Stop proxy factory. ") 218 | 219 | def fetch_all(self): 220 | """ 221 | 获取全部网站代理,内部调用各网站代理获取函数 222 | """ 223 | proxies = set() 224 | for key, value in self.proxy_methods.items(): 225 | proxies.update(value()) 226 | return proxies 227 | 228 | def enrich_parser_arguments(self): 229 | self.parser.add_argument( 230 | "-s", "--settings", help="Setting module. ", default=settings) 231 | self.parser.add_argument( 232 | "-ls", "--localsettings", help="Local setting module.", 233 | default="localsettings") 234 | self.parser.add_argument( 235 | "-cm", "--check-method", 236 | help="provide a check method to check proxies. eg:module.func") 237 | self.parser.add_argument( 238 | "-sm", "--spider-module", 239 | help="provide a module contains proxy site spider methods. " 240 | "eg:module1.module2") 241 | 242 | 243 | def main(): 244 | ProxyFactory().start() 245 | 246 | 247 | if __name__ == '__main__': 248 | main() 249 | -------------------------------------------------------------------------------- /proxy_factory/proxy_site_spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from urllib.parse import urljoin 4 | 5 | from .utils import parse_class, parse_port, get_html, download 6 | 7 | 8 | def fetch_kxdaili(self, page=5): 9 | """ 10 | www.kxdaili.com 11 | """ 12 | proxies = set() 13 | url_tmpl = "http://www.kxdaili.com/dailiip/1/%d.html" 14 | for page_num in range(page): 15 | url = url_tmpl % (page_num + 1) 16 | soup = BeautifulSoup(get_html(url, self.headers), "html") 17 | table_tag = soup.find("table", attrs={"class": "active"}) 18 | trs = table_tag.tbody.find_all("tr") 19 | for tr in trs: 20 | tds = tr.find_all("td") 21 | ip = tds[0].text 22 | port = tds[1].text 23 | latency = tds[4].text.split(" ")[0] 24 | if float(latency) < 0.5: # 输出延迟小于0.5秒的代理 25 | proxy = "%s:%s" % (ip, port) 26 | proxies.add(proxy) 27 | return proxies 28 | 29 | 30 | def fetch_mimvp(self): 31 | """ 32 | http://proxy.mimvp.com/free.php 33 | """ 34 | proxies = set() 35 | url = "http://proxy.mimvp.com/free.php?proxy=in_hp" 36 | soup = BeautifulSoup(get_html(url, self.headers), "html") 37 | tds = soup.select("tbody > td") 38 | for i in range(0, len(tds), 10): 39 | ip = tds[i + 1].text 40 | port = parse_port(download(urljoin(url, tds[i + 2].img["src"]), self.headers)) 41 | proxies.add("%s:%s" % (ip, port)) 42 | return proxies 43 | 44 | 45 | def fetch_xici(self): 46 | """ 47 | http://www.xicidaili.com/nn/ 48 | """ 49 | proxies = set() 50 | url = "http://www.xicidaili.com/nn/" 51 | soup = BeautifulSoup(get_html(url, self.headers), "html") 52 | table = soup.find("table", attrs={"id": "ip_list"}) 53 | trs = table.find_all("tr") 54 | for i in range(1, len(trs)): 55 | tr = trs[i] 56 | tds = tr.find_all("td") 57 | ip = tds[1].text 58 | port = tds[2].text 59 | proxies.add("%s:%s" % (ip, port)) 
60 |     return proxies
61 | 
62 | 
63 | def fetch_66ip(self, page=5):
64 |     """
65 |     http://www.66ip.cn/areaindex_2/1.html
66 |     """
67 |     proxies = set()
68 |     url_tmpl = "http://www.66ip.cn/areaindex_%s/1.html"
69 |     for page_num in range(page):
70 |         soup = BeautifulSoup(get_html(url_tmpl % (page_num + 1), self.headers), "html")
71 |         trs = soup.select("tr")
72 |         for i in range(4, len(trs)):
73 |             tds = trs[i].find_all("td")
74 |             ip = tds[0].text
75 |             port = tds[1].text
76 |             proxies.add("%s:%s" % (ip, port))
77 |     return proxies
78 | 
79 | 
80 | def fetch_goubanjia(self):
81 |     """
82 |     http://www.goubanjia.com/
83 |     :return:
84 |     """
85 |     proxies = set()
86 |     url = "http://www.goubanjia.com"
87 |     soup = BeautifulSoup(get_html(url, self.headers), "html")
88 |     trs = soup.select("tbody > tr")
89 |     for tr in trs:
90 |         tds = tr.find_all("td")
91 |         ip = "".join(re.findall(r'(?<=>)(.*?)<', str(tds[0]))).split(":")[0]
92 |         port = parse_class(tds[0].select(".port")[0]["class"][-1])  # port is obfuscated in the class attr, see utils.parse_class
93 |         type = tds[1].a.text
94 |         if type.count("匿"):  # "匿" marks anonymous proxies
95 |             proxies.add("%s:%s" % (ip, port))
96 |     return proxies
97 | 
98 | # def fetch_nianshao(self):
99 | #     """
100 | #     no longer works
101 | #     http://www.nianshao.me/
102 | #     """
103 | #     proxies = set()
104 | #     url = "http://www.nianshao.me/"
105 | #     soup = BeautifulSoup(get_html(url, self.headers), "html")
106 | #     table = soup.find("table", attrs={"class": "table"})
107 | #     trs = table.find_all("tr")
108 | #     for i in range(1, len(trs)):
109 | #         tr = trs[i]
110 | #         tds = tr.find_all("td")
111 | #         ip = tds[0].text
112 | #         port = tds[1].text
113 | #         proxies.add("%s:%s" % (ip, port))
114 | #     return proxies
115 | # 
116 | # 
117 | # def fetch_ip181(self):
118 | #     """
119 | #     no longer works
120 | #     http://www.ip181.com/
121 | #     """
122 | #     proxies = set()
123 | #     url = "http://www.ip181.com/"
124 | #     soup = BeautifulSoup(get_html(url, self.headers), "html")
125 | #     table = soup.find("table")
126 | #     trs = table.find_all("tr")
127 | #     for i in range(1, len(trs)):
128 | #         tds = trs[i].find_all("td")
129 | #         ip = tds[0].text
130 | #         port = tds[1].text
131 | #         proxies.add("%s:%s" % (ip, port))
132 | #     return proxies
133 | # 
134 | # 
135 | # def fetch_httpdaili(self):
136 | #     """
137 | #     no longer works
138 | #     http://www.httpdaili.com/mfdl/
139 | #     """
140 | #     proxies = set()
141 | #     url = "http://www.httpdaili.com/mfdl/"
142 | #     soup = BeautifulSoup(get_html(url, self.headers), "html")
143 | #     trs = soup.select(".kb-item-wrap11 tr")
144 | # 
145 | #     for i in range(len(trs)):
146 | #         tds = trs[i].find_all("td")
147 | #         if len(tds) > 2 and tds[1].text.isdigit():
148 | #             ip = tds[0].text
149 | #             port = tds[1].text
150 | #             type = tds[2].text
151 | #             if type.encode("iso-8859-1").decode("utf-8") == "匿名":
152 | #                 proxies.add("%s:%s" % (ip, port))
153 | #     return proxies
154 | # 
155 | # 
156 | # def fetch_cn_proxy(self):
157 | #     """
158 | #     no longer works
159 | #     http://cn-proxy.com/
160 | #     """
161 | #     proxies = set()
162 | #     url = "http://cn-proxy.com/"
163 | #     soup = BeautifulSoup(get_html(url, self.headers), "html")
164 | #     trs = soup.select("tr")
165 | #     for i in range(2, len(trs)):
166 | #         tds = trs[i].find_all("td")
167 | #         try:
168 | #             ip = tds[0].text
169 | #             port = tds[1].text
170 | #             if port.isdigit():
171 | #                 proxies.add("%s:%s" % (ip, port))
172 | #         except IndexError:
173 | #             pass
174 | #     return proxies
175 | # 
--------------------------------------------------------------------------------
/proxy_factory/settings.py:
--------------------------------------------------------------------------------
1 | REDIS_HOST = "0.0.0.0"
2 | REDIS_PORT = 6379
3 | 
4 | # how often to re-check bad proxies (seconds)
5 | BAD_CHECK_INTERVAL = 5*60
6 | # max consecutive failed checks before a bad proxy is abandoned
7 | FAILED_TIMES = 5
8 | # how often to re-check good proxies (seconds)
9 | GOOD_CHECK_INTERVAL = 5*60
10 | # how often to fetch new proxies (seconds)
11 | FETCH_INTERVAL = 10*60
12 | # redis set holding valid proxies
13 | GOOD_PROXY_SET = "good_proxies"
14 | # redis hash holding invalid proxies and their consecutive failure counts
15 | BAD_PROXY_HASH = "bad_proxies"
16 | 
17 | HEADERS = {
18 |     'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0) '
19 |                   'Gecko/20100101 Firefox/41.0',
20 |     'Accept': 'text/html,application/xhtml+xml,'
21 |               'application/xml;q=0.9,*/*;q=0.8',
22 |     "Accept-Language": "en-US,en;q=0.5",
23 |     "Accept-Encoding": "gzip, deflate",
24 | }
--------------------------------------------------------------------------------
/proxy_factory/utils.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import pytesseract
3 | 
4 | from PIL import Image
5 | from io import BytesIO
6 | 
7 | from functools import wraps, reduce
8 | 
9 | 
10 | def parse_class(cls):
11 |     """
12 |     Decode the port number hidden in an obfuscated CSS class name.
13 |     :param cls: the class attribute value, e.g. "DBA"
14 |     :return: the real port as an int
15 |     """
16 |     # each letter stands for a digit: A=0, B=1, ... H=7, I=8, Z=9
17 |     meta = dict(zip("ABCDEFGHIZ", range(10)))
18 |     num = reduce(lambda x, y: x + str(meta.get(y)), cls, "")
19 |     # the class encodes port * 8, so shift right 3 bits to divide by 8
20 |     return int(num) >> 3
21 | 
22 | 
23 | def parse_port(buffer):
24 |     with Image.open(BytesIO(buffer)) as image:
25 |         image = image.convert("RGB")
26 |         gray_image = Image.new('1', image.size)
27 |         width, height = image.size
28 |         raw_data = image.load()
29 |         image.close()
30 |         # binarize the port image so tesseract can read it
31 |         for x in range(width):
32 |             for y in range(height):
33 |                 value = raw_data[x, y]
34 |                 value = value[0] if isinstance(value, tuple) else value
35 |                 if value < 1:
36 |                     gray_image.putpixel((x, y), 0)
37 |                 else:
38 |                     gray_image.putpixel((x, y), 255)
39 |         num = pytesseract.image_to_string(gray_image)
40 |         result = guess(num)
41 |         if result:
42 |             return result
43 |         else:
44 |             # correct the OCR result character by character
45 |             new_char = list()
46 |             for i in num:
47 |                 if i.isdigit():
48 |                     new_char.append(i)
49 |                 else:
50 |                     new_char.append(guess(i))
51 |             return "".join(new_char)
52 | 
53 | 
54 | def guess(word):
55 |     """Map characters and strings that tesseract commonly misreads back to digits."""
56 |     try:
57 |         mapping = {
58 |             "b": "8",
59 |             "o": "0",
60 |             "e": "8",
61 |             "s": "9",
62 |             "a": "9",
63 |             "51234": "61234",
64 |             "3737": "9797",
65 |             "3000": "9000",
66 |             "52385": "62386",
67 |         }
68 |         return mapping[word.lower()]
69 |     except KeyError:
70 |         # unmapped single characters pass through unchanged; for longer strings
71 |         # return None so that parse_port falls back to per-character correction
72 |         if len(word) == 1:
73 |             print(word)
74 |             return word
75 | 
76 | 
77 | def exception_wrapper(func):
78 |     @wraps(func)
79 |     def wrapper(*args, **kwargs):
80 |         self = args[0]
81 |         try:
82 |             return func(*args, **kwargs)
83 |         except Exception as e:
84 |             self.logger.warn("failed in %s: %s" % (func.__name__, e))
85 |             return set()
86 | 
87 |     return wrapper
88 | 
89 | 
90 | def get_html(url, headers=None, proxies=None):
91 |     return requests.get(url, headers=headers, proxies=proxies).text
92 | 
93 | 
94 | def download(url, headers=None):
95 |     buffer = b""
96 |     for chunk in requests.get(url, headers=headers, stream=True).iter_content(1024):
97 |         buffer += chunk
98 |     return buffer
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | pytesseract
3 | redis~=2.9.0
4 | bs4
5 | pillow
6 | toolkity~=1.9.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import os
3 | import re
4 | import string
5 | 
6 | from configparser import ConfigParser
7 | from contextlib import contextmanager
8 | from setuptools import setup, find_packages
9 | 
10 | 
11 | def get_version(package):
12 |     """
13 |     Return package version as listed in `__version__` in `__init__.py`.
14 |     """
15 |     init_py = open(os.path.join(package, '__init__.py')).read()
16 |     mth = re.search(r"__version__\s?=\s?['\"]([^'\"]+)['\"]", init_py)
17 |     if mth:
18 |         return mth.group(1)
19 |     else:
20 |         raise RuntimeError("Cannot find version!")
21 | 
22 | 
23 | def _compact_ver(name, ver):
24 |     if ver == '"*"' or ver.startswith("{"):
25 |         ver = ""
26 |     return '%s%s' % (name, ver.strip('"'))
27 | 
28 | 
29 | def install_requires(dev=False):
30 |     """
31 |     Return the requirements parsed from the Pipfile (mirrored into requirements.txt).
32 |     :return:
33 |     """
34 |     try:
35 |         cfg = ConfigParser()
36 |         cfg.read('Pipfile')
37 |         section_name = "%spackages" % ("dev-" if dev else "")
38 |         requires = [_compact_ver(name, cfg.get(section_name, name)) for name in cfg.options(section_name)]
39 |         if not dev:
40 |             with open("requirements.txt", "w") as f:
41 |                 f.write("\n".join(requires))
42 |         return requires
43 |     except OSError:
44 |         return []
45 | 
46 | try:
47 |     LONG_DESCRIPTION = open("README.md").read()
48 | except UnicodeDecodeError:
49 |     LONG_DESCRIPTION = open("README.md", encoding="utf-8").read()
50 | 
51 | 
52 | @contextmanager
53 | def cfg_manage(cfg_tpl_filename):
54 |     if os.path.exists(cfg_tpl_filename):
55 |         cfg_file_tpl = open(cfg_tpl_filename)
56 |         buffer = cfg_file_tpl.read()
57 |         try:
58 |             with open(cfg_tpl_filename[:-len(".tpl")], "w") as cfg_file:  # strip the ".tpl" suffix (rstrip strips characters, not suffixes)
59 |                 cfg_file.write(string.Template(buffer).substitute(
60 |                     pwd=os.path.abspath(os.path.dirname(__file__))))
61 |             yield
62 |         finally:
63 |             cfg_file_tpl.close()
64 |     else:
65 |         yield
66 | 
67 | 
68 | with cfg_manage(__file__.replace(".py", ".cfg.tpl")):
69 |     setup(
70 |         name="proxy-factory",
71 |         version=get_version("proxy_factory"),
72 |         description="provide anonymous proxies. 
", 73 | long_description=LONG_DESCRIPTION, 74 | classifiers=[ 75 | "License :: OSI Approved :: MIT License", 76 | "Programming Language :: Python :: 3", 77 | "Intended Audience :: Developers", 78 | "Operating System :: Unix", 79 | ], 80 | long_description_content_type="text/markdown", 81 | keywords="anonymous proxies proxy", 82 | author="cn", 83 | author_email="cnaafhvk@foxmail.com", 84 | url="https://www.github.com/ShichaoMa/proxy_factory", 85 | entry_points={ 86 | 'console_scripts': [ 87 | 'product = proxy_factory:main', 88 | ], 89 | }, 90 | license="MIT", 91 | packages=find_packages(exclude=("tests*",)), 92 | install_requires=install_requires(), 93 | include_package_data=True, 94 | zip_safe=True, 95 | setup_requires=["pytest-runner"], 96 | tests_require=install_requires(dev=True) 97 | ) 98 | -------------------------------------------------------------------------------- /tests/test_site.py: -------------------------------------------------------------------------------- 1 | from proxy_factory.proxy_site_spider import * 2 | 3 | pf = type("AA", (object,), {})() 4 | pf.headers = { 5 | 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0', 6 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 7 | "Accept-Language": "en-US,en;q=0.5", 8 | "Accept-Encoding": "gzip, deflate", 9 | } 10 | 11 | 12 | class TestSite(object): 13 | 14 | def test_kxdaili(self): 15 | assert len(fetch_kxdaili(pf)) > 0 16 | 17 | def test_mimvp(self): 18 | assert fetch_mimvp(pf) 19 | 20 | def test_66ip(self): 21 | assert len(fetch_66ip(pf, 15)) > 0 22 | 23 | def test_goubanjia(self): 24 | assert len(fetch_goubanjia(pf)) > 0 25 | 26 | def test_xici(self): 27 | assert len(fetch_xici(pf)) > 0 28 | # 29 | # def test_cn_proxy(self): 30 | # assert len(fetch_cn_proxy(pf)) > 0 31 | # 32 | # def test_httpdaili(self): 33 | # assert len(fetch_httpdaili(pf)) > 0 34 | # 35 | # def test_nianshao(self): 36 | # assert len(fetch_nianshao(pf)) > 0 37 | --------------------------------------------------------------------------------