├── HTTP_Proxy_pool.txt ├── README.md ├── example.py ├── invalid.md ├── proxy_pool.py └── spiders.py /HTTP_Proxy_pool.txt: -------------------------------------------------------------------------------- 1 | [ 2 | "39.134.10.28:8080", 3 | "89.236.17.106:3128", 4 | "39.137.46.76:8080", 5 | "39.137.46.73:", 6 | "39.137.46.73:80", 7 | "47.89.10.103:80", 8 | "221.194.108.8:8060", 9 | "47.91.139.78:", 10 | "222.168.110.202:8060", 11 | "58.247.179.94:8060", 12 | "120.79.133.212:8088", 13 | "144.217.204.254:3128", 14 | "39.134.10.20:8088", 15 | "202.100.83.139:", 16 | "202.100.83.139:80", 17 | "103.232.147.16:1080", 18 | "117.44.247.38:8908", 19 | "112.21.164.58:1080", 20 | "39.137.46.74:8080", 21 | "148.72.168.175:1080", 22 | "123.57.76.102:80", 23 | "222.168.111.178:8060", 24 | "67.63.33.7:80", 25 | "39.137.46.69:", 26 | "61.97.130.196:31588", 27 | "207.38.90.134:1080", 28 | "39.134.10.7:8080", 29 | "39.137.46.72:", 30 | "117.127.0.201:8080", 31 | "222.135.121.30:8908", 32 | "120.92.119.229:10010", 33 | "39.137.46.75:8080", 34 | "39.137.46.77:8080", 35 | "45.115.39.139:7777", 36 | "39.134.10.100:8080", 37 | "119.41.193.19:8060", 38 | "120.198.230.18:8080", 39 | "117.127.0.198:80", 40 | "223.241.79.86:18118", 41 | "117.141.215.3:1080", 42 | "39.137.46.72:8080", 43 | "62.181.34.5:3129", 44 | "178.130.36.121:80", 45 | "211.147.67.150:80", 46 | "120.92.88.202:10000", 47 | "47.91.139.78:80", 48 | "180.121.135.92:808", 49 | "128.199.199.41:3128", 50 | "39.137.46.70:", 51 | "39.137.46.70:8080", 52 | "39.134.10.22:8080", 53 | "42.114.93.255:3128", 54 | "198.50.143.31:80", 55 | "222.132.65.91:8908", 56 | "114.215.174.227:8080", 57 | "39.134.10.4:8080", 58 | "45.55.67.118:8118", 59 | "125.75.0.179:8060", 60 | "148.72.168.177:1080", 61 | "39.137.46.76:", 62 | "49.79.196.169:61234", 63 | "39.137.46.77:", 64 | "212.8.240.123:1080", 65 | "39.137.46.78:8080", 66 | "217.23.6.217:1080", 67 | "39.137.46.71:8080", 68 | "118.114.77.47:8080", 69 | "54.38.100.100:1080", 70 | "39.137.46.74:", 71 | "191.252.184.91:80", 72 | "117.158.220.82:8908", 73 | "39.137.46.78:", 74 | "66.70.166.200:80", 75 | "117.127.0.203:8080", 76 | "151.106.25.189:1080", 77 | "58.240.53.196:80", 78 | "39.137.46.75:", 79 | "159.65.110.167:3128", 80 | "39.137.46.69:8080", 81 | "222.222.243.124:8060", 82 | "54.38.100.100:1080", 83 | "39.137.46.70:8080", 84 | "222.76.187.168:8118", 85 | "39.134.10.7:8080", 86 | "39.134.10.20:8088", 87 | "117.57.95.174:61234", 88 | "67.63.33.7:80", 89 | "148.72.168.177:1080", 90 | "39.137.46.74:8080", 91 | "221.194.108.16:8060", 92 | "222.132.65.91:8908", 93 | "222.179.230.146:8060", 94 | "39.137.46.76:8080", 95 | "39.134.10.5:90", 96 | "128.199.199.41:3128", 97 | "117.146.19.161:3128", 98 | "39.134.10.8:8000", 99 | "117.57.99.225:61234", 100 | "61.178.117.38:8908", 101 | "183.159.83.241:18118", 102 | "39.134.10.28:8088", 103 | "115.59.70.255:8118", 104 | "23.235.133.57:80", 105 | "119.49.22.198:8060", 106 | "120.198.230.17:8080", 107 | "123.160.35.118:61234", 108 | "212.8.240.123:1080", 109 | "39.134.10.22:8080", 110 | "114.55.142.137:9999", 111 | "39.134.10.21:8000", 112 | "39.137.46.73:", 113 | "118.114.77.47:8080", 114 | "60.182.237.242:33633", 115 | "112.21.164.58:1080", 116 | "39.137.46.72:", 117 | "103.232.147.16:1080", 118 | "182.106.201.140:8060", 119 | "59.48.237.6:8060", 120 | "39.137.46.71:8080", 121 | "113.254.44.242:8383", 122 | "148.72.168.175:1080", 123 | "117.127.0.204:8080", 124 | "39.134.10.19:8080", 125 | "115.151.205.3:61234", 126 | "119.177.16.34:53281", 127 | "117.127.0.203:8080", 128 | 
"47.91.139.78:80", 129 | "120.198.230.15:8080", 130 | "198.50.143.31:80", 131 | "117.127.0.195:80", 132 | "151.106.25.189:1080", 133 | "60.177.228.14:18118", 134 | "39.137.46.75:", 135 | "62.181.34.5:3129", 136 | "220.249.149.252:61234", 137 | "123.57.76.102:80", 138 | "58.22.248.11:8908", 139 | "39.134.10.27:8088", 140 | "39.134.10.4:8080", 141 | "119.49.124.202:8060", 142 | "207.38.90.134:1080", 143 | "101.81.76.141:8060", 144 | "39.137.46.78:", 145 | "111.180.140.139:61234", 146 | "39.137.46.69:8080", 147 | "117.127.0.197:8080", 148 | "61.97.130.196:31588", 149 | "222.76.187.142:8118", 150 | "39.137.46.74:", 151 | "120.198.230.18:8080", 152 | "117.127.0.201:8080", 153 | "125.32.189.187:8060", 154 | "114.215.174.227:8080", 155 | "39.137.46.73:80", 156 | "39.137.46.76:", 157 | "39.134.10.6:8088", 158 | "39.134.10.7:8000", 159 | "39.137.46.77:8080", 160 | "39.134.10.28:8080", 161 | "120.92.88.202:10000", 162 | "222.135.121.26:8908", 163 | "39.134.10.26:8088", 164 | "120.198.230.16:8080", 165 | "49.79.192.65:61234", 166 | "45.115.39.139:7777", 167 | "58.240.53.196:80", 168 | "222.33.192.238:8118", 169 | "117.127.0.206:80", 170 | "117.44.247.38:8908", 171 | "39.137.46.75:8080", 172 | "60.211.239.206:8908", 173 | "221.8.169.170:8060", 174 | "144.217.204.254:3128", 175 | "112.66.76.2:8060", 176 | "89.236.17.106:3128", 177 | "222.132.102.218:8908", 178 | "39.137.46.69:", 179 | "39.137.46.78:8080", 180 | "47.91.139.78:", 181 | "39.137.46.77:", 182 | "121.234.38.132:8888", 183 | "182.39.34.42:61234", 184 | "159.65.110.167:3128", 185 | "117.127.0.198:80", 186 | "39.137.46.70:", 187 | "47.89.10.103:80", 188 | "39.137.46.72:8080", 189 | "114.104.176.151:1080", 190 | "202.100.83.139:", 191 | "119.49.17.135:8060", 192 | "61.178.49.52:8908", 193 | "117.141.215.3:1080", 194 | "39.134.10.10:8080", 195 | "125.32.246.172:80", 196 | "165.227.192.42:80", 197 | "222.135.121.26:8908", 198 | "222.132.65.89:8908", 199 | "101.53.101.172:9999", 200 | "119.49.113.120:8060", 201 | "39.137.46.78:8080", 202 | "117.127.0.205:8080", 203 | "39.134.10.12:8080", 204 | "117.158.220.82:8908", 205 | "222.222.243.124:8060", 206 | "125.32.189.187:8060", 207 | "123.117.255.218:8060", 208 | "117.127.0.197:80", 209 | "117.127.0.200:80", 210 | "39.134.10.26:8080", 211 | "159.65.110.167:3128", 212 | "113.214.13.1:8000", 213 | "128.199.199.41:3128", 214 | "39.134.10.8:8080", 215 | "39.137.46.76:80", 216 | "39.134.10.16:90", 217 | "183.245.242.195:8060", 218 | "120.79.133.212:8088", 219 | "39.134.10.3:8088", 220 | "39.134.10.21:8088", 221 | "91.206.19.193:8081", 222 | "39.137.46.73:8080", 223 | "39.137.46.74:8080", 224 | "123.53.119.153:61234", 225 | "119.49.17.135:8060", 226 | "39.134.10.24:8088", 227 | "121.17.18.178:8060", 228 | "39.134.10.25:8080", 229 | "39.134.10.28:8080", 230 | "47.91.139.78:80", 231 | "198.50.143.31:80", 232 | "120.198.230.17:8080", 233 | "39.134.10.28:8000", 234 | "14.120.183.48:61234", 235 | "39.134.10.19:8088", 236 | "175.11.0.84:8060", 237 | "47.95.36.86:8081", 238 | "117.127.0.201:80", 239 | "39.134.10.6:8088", 240 | "39.134.10.11:8088", 241 | "222.168.126.50:8060", 242 | "60.165.134.237:8908", 243 | "49.79.193.161:61234", 244 | "39.137.46.77:8080", 245 | "49.79.195.171:61234", 246 | "218.62.82.222:8060", 247 | "183.91.33.75:80", 248 | "120.198.230.19:8080", 249 | "39.107.204.193:8088", 250 | "119.28.50.37:82", 251 | "117.127.0.204:80", 252 | "39.134.10.27:90", 253 | "117.127.0.206:8080", 254 | "222.132.65.91:8908", 255 | "39.134.10.24:8080", 256 | "47.95.33.79:8081", 257 | "114.55.142.137:9999", 258 | 
"120.78.140.82:8118", 259 | "118.114.77.47:8080", 260 | "39.134.10.13:8080", 261 | "120.92.118.64:10000", 262 | "39.134.10.5:8080", 263 | "112.86.140.212:8888", 264 | "39.134.10.22:8080", 265 | "61.178.59.107:8908", 266 | "39.134.10.20:8080", 267 | "39.134.10.14:8080", 268 | "217.194.255.217:3128", 269 | "39.134.10.18:8088", 270 | "89.236.17.106:3128", 271 | "221.124.18.73:8080", 272 | "124.235.149.86:8888", 273 | "39.134.10.12:8088", 274 | "120.194.189.122:8908", 275 | "117.36.103.170:8118", 276 | "112.27.177.86:8908", 277 | "112.21.164.58:1080", 278 | "39.137.46.76:8080", 279 | "202.100.83.139:80", 280 | "123.57.76.102:80", 281 | "117.127.0.199:80", 282 | "175.42.158.124:61234", 283 | "117.69.230.141:61234", 284 | "222.222.236.207:8060", 285 | "39.134.10.7:8080", 286 | "39.134.10.27:8080", 287 | "60.211.239.206:8908", 288 | "39.137.46.75:8080", 289 | "175.16.213.25:8060", 290 | "39.134.10.18:8080", 291 | "123.161.119.37:61234", 292 | "39.137.46.71:8080", 293 | "117.127.0.196:8080", 294 | "117.158.220.83:8908", 295 | "39.137.46.70:8080", 296 | "117.127.0.208:80", 297 | "183.91.33.50:80", 298 | "183.194.76.246:8060", 299 | "117.127.0.196:8080", 300 | "92.126.153.83:8080", 301 | "58.38.143.138:8088", 302 | "117.127.0.198:80", 303 | "117.127.0.208:80", 304 | "112.24.107.105:8908", 305 | "39.134.10.7:8088", 306 | "113.67.73.245:8118", 307 | "117.127.0.201:80", 308 | "117.127.0.200:80", 309 | "39.134.10.22:8000", 310 | "89.175.58.20:3128", 311 | "117.127.0.205:8080", 312 | "39.134.10.10:8088", 313 | "114.215.174.227:8080", 314 | "171.217.56.87:9000", 315 | "117.127.0.199:80", 316 | "217.194.255.217:3128", 317 | "117.127.0.197:80", 318 | "117.127.0.207:8080", 319 | "117.185.105.122:8060", 320 | "117.127.0.203:8080", 321 | "1.195.250.224:61234", 322 | "112.24.105.134:8908", 323 | "113.214.13.1:8000", 324 | "39.134.10.6:8000", 325 | "117.127.0.195:8080", 326 | "175.16.141.119:8060", 327 | "117.127.0.204:80", 328 | "117.127.0.202:80", 329 | "47.95.33.79:8081", 330 | "39.134.10.27:8000", 331 | "114.55.142.137:9999", 332 | "117.127.0.206:8080", 333 | "120.9.97.119:8060", 334 | "92.53.73.138:8118", 335 | "139.196.144.222:9999", 336 | "60.173.242.141:808", 337 | "119.28.50.37:82", 338 | "183.159.82.100:18118", 339 | "118.114.77.47:8080", 340 | "60.189.127.90:61234", 341 | "218.50.2.102:8080", 342 | "60.177.228.44:18118", 343 | "116.24.155.240:61234", 344 | "220.177.83.242:8908", 345 | "183.159.87.252:18118", 346 | "60.177.231.49:18118", 347 | "118.193.26.18:8080", 348 | "190.2.133.74:1080", 349 | "89.236.17.106:3128", 350 | "47.95.33.79:8081", 351 | "54.37.160.92:1080", 352 | "183.159.82.102:18118", 353 | "183.159.89.212:18118", 354 | "222.88.144.173:8060", 355 | "1.196.63.84:61234", 356 | "212.8.240.141:1080", 357 | "183.159.84.176:18118", 358 | "54.37.160.93:1080", 359 | "183.159.94.61:18118", 360 | "183.159.86.117:18118", 361 | "218.88.177.152:8908", 362 | "14.120.181.178:61234", 363 | "49.79.195.182:61234", 364 | "113.214.13.1:8000", 365 | "67.63.33.7:80", 366 | "101.53.101.172:9999", 367 | "123.57.76.102:80", 368 | "183.159.86.147:18118", 369 | "59.44.27.89:80", 370 | "42.2.67.51:8080", 371 | "183.91.33.50:80", 372 | "202.100.83.139:80", 373 | "223.241.78.217:18118", 374 | "58.51.126.166:8908", 375 | "220.191.102.40:6666", 376 | "39.134.231.133:8088", 377 | "182.86.208.208:8908", 378 | "61.183.55.28:8908", 379 | "39.134.108.89:80", 380 | "112.24.107.111:8908", 381 | "173.212.203.209:8080", 382 | "186.46.156.202:65309", 383 | "111.78.3.225:8908", 384 | "219.139.55.251:8908", 385 | 
"221.182.107.70:8908", 386 | "39.134.231.133:8000", 387 | "39.134.231.133:8080", 388 | "39.134.231.133:80", 389 | "39.134.108.91:80", 390 | "175.1.25.169:8060" 391 | ] 392 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Free_proxy_pool 2 | 对免费代理IP网站进行爬取,收集汇总为自己的代理池。其中关键是验证代理的有效性、匿名性、去重复。

+ This pool is meant to be a single-machine library that beginners can read and use, so it deliberately avoids heavyweight databases such as Redis or MongoDB; collected proxies are persisted only to a disk file.
+ **No installation is needed: download it and read example.py** to learn the usage. The concise get_a_proxy() returns a proxy that plugs straight into the request parameters; see the quickstart below the table.
+ Third-party dependencies: requests, bs4, lxml, chardet.
+ The spider component is under 200 lines of code and covers the high-availability proxy sites. With all spiders enabled, a single run can collect 300-700 high-anonymity proxies.
+ The table below reflects each site's proxy quality in one measurement window; quality fluctuates through the day but is fairly high overall.

Proxy site|Evaluated|Usable|Success rate|Update cycle
:-:|-:|-:|-:|:-:
66ip|536|113|21.1%|every 10 min
89ip|600|79|11.7%|every 10 min
爱家网|1274|105|8.2%|2 posts/day
codebusy|278|84|30.2%|every 1 min
小舒代理|534|78|14.6%|2 posts/day
小河虾|600|70|11.7%|every 10 min
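
A minimal quickstart, condensed from example.py (install the dependencies above first, e.g. via pip):

```python
import proxy_pool

pool = proxy_pool.Free_proxy_pool()
a_pro = pool.get_a_proxy()   # {'http': 'http://IP:port'}, directly usable with requests
print('Current proxy:', a_pro)
```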

If you find this project helpful, a star or a fork is welcome.

To save everyone wasted effort, the free proxy sites that proved to have no practical value are listed below as a "blacklist":
- http://www.ip181.com/
- https://list.proxylistplus.com/
- http://www.xicidaili.com/nn (ranks high on Baidu, but only about 1% of its proxies work)
- http://www.kuaidaili.com/free/inha (only the first 5 pages are recent, and merely 1-2 proxies are usable)
- Last updated: 2019-11-5
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------

# -*- coding: utf-8 -*-
'''
@author: yaleimeng@sina.com
@license: (C) Copyright 2017
@desc: Usage example for this IP pool.
@DateTime: Created on 2017/10/16, at 11:38 '''

import proxy_pool, requests
from bs4 import BeautifulSoup as bs

my_pro = proxy_pool.Free_proxy_pool()
a_pro = my_pro.get_a_proxy()
print('Current proxy:', a_pro)

r = requests.get('http://www.ipip.net/', proxies=a_pro)
info = bs(r.text, 'lxml').select('div.location')[0].text.rstrip()
print(info)

# In real use, run proxy_pool.py first so the pool keeps refreshing itself.
# Other programs only need to import the module and call get_a_proxy().
# When a request through the proxy fails, fetch a new proxy; wrap the call in
# try/except yourself, as sketched below.
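
# A minimal retry wrapper for the advice above. This is an illustrative addition,
# not part of the original example; the name fetch_via_proxy is hypothetical.
def fetch_via_proxy(url, retries=3):
    for _ in range(retries):
        proxy = my_pro.get_a_proxy()
        try:
            return requests.get(url, proxies=proxy, timeout=5)
        except requests.RequestException:
            continue  # This proxy is dead; try the next one.
    return None  # All attempts failed.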
--------------------------------------------------------------------------------
/invalid.md:
--------------------------------------------------------------------------------

In memory of proxy sites that were once useful but can no longer be reached:

+ http://www.3366ip.net/
+ http://www.proxy360.cn/Proxy
+ http://bugng.com/gngn
+ http://ip.seofangfa.com/
+ http://www.yun-daili.com/
+ http://www.xiaohexia.cn/
+ https://proxy.coderbusy.com/classical/anonymous-type/highanonymous.aspx?
+ http://www.ajshw.net/news/?list_9.html
--------------------------------------------------------------------------------
/proxy_pool.py:
--------------------------------------------------------------------------------

# -*- coding: utf-8 -*-
'''
@author: yaleimeng@sina.com (C) Copyright 2017.
@desc: Main scheduler class of the proxy pool; provides the public interface.
@DateTime: Created on 2017/10/16, at 11:33
'''
import threading, multiprocessing
import json, spiders


class Free_proxy_pool(object):
    '''
    Main class of the proxy IP pool; it hands out proxy IPs to callers. The pool
    currently holds HTTP proxies only; HTTPS may be added later.
    Attribute: min_limit, the lower bound of the pool size (default 24). When the
        count drops below it, the pool refreshes itself to stay above the limit.
    Public method: get_a_proxy()
        Returns one high-anonymity HTTP proxy as the dict {'http': 'http://' + pro, },
        directly usable with requests. Inside the class, proxies are kept as plain
        "IP:port" strings.
    '''

    def __init__(self):
        self.__datafile = 'HTTP_Proxy_pool.txt'
        self.__proxies_ok = []  # Mainly touched by __write_file, which dumps and clears it.
        self.min_limit = 24
        self.__spider = spiders.Proxy_Spider()

    def pro_count(self):  # Pool size, as counted from the data file.
        return len(self.__read_file())

    def light_update(self):  # Light refresh: re-verify the saved proxies, write back the live ones.
        print('Pool size before update:', self.pro_count())
        self.__write_file(self.verify_Proxies(self.__read_file()))
        print('Pool size after update: %d\n' % self.pro_count())
        if self.pro_count() < self.min_limit:  # Below the limit: run a deep update in a new process.
            pp = multiprocessing.Process(target=self.update_all)  # Pass the callable itself, not its result.
            pp.start()  # The crawled batch must be verified and saved before the safety stock dies out.
        global A_timer  # Re-arm the timer: re-check the saved proxies every 10 minutes.
        A_timer = threading.Timer(600, self.light_update)
        A_timer.start()

    def get_a_proxy(self):  # Hands out a proxy in the directly usable dict form.
        tmp, good = self.__read_file(), []
        if tmp:
            while len(good) < 1:
                good = self.verify_Proxies([tmp.pop()])
                if not tmp:
                    break
        if len(tmp) < self.min_limit:  # No valid proxy found: a larger refresh is needed.
            th = threading.Thread(target=self.light_update)
            th.start()
        self.__write_file(tmp)
        return {'http': 'http://' + good.pop(), }

    def __write_file(self, gList=None):
        with open(self.__datafile, 'w', encoding='utf-8') as f:
            if gList:
                json.dump(gList, fp=f, indent=4)
            else:
                json.dump(self.__proxies_ok, fp=f, indent=4)
                self.__proxies_ok.clear()

    def __read_file(self):
        with open(self.__datafile, 'r', encoding='utf-8') as fp:
            return json.load(fp)

    def verify_Proxies(self, pro_set):
        output = []
        # print('Proxies to verify: %d\n' % len(pro_set))
        for one_p in pro_set:
            pro = {'http': 'http://' + one_p, }
            # Anonymity checking is fiddly, so proxies are collected on reachability alone.
            try:
                soup = self.__spider.request_page('http://ip111.cn/', pro)
                if soup is None: continue
                output.append(one_p)
                # The disabled check below compared the echoed IP with the proxy's own:
                # ip = soup.select('p.getlist')[0].text.split(':')[1][:-6]
                # if ip == one_p.split(':')[0] and one_p not in output:
                #     output.append(one_p)
                #     print('Proxy %s is high-anonymity! Collected: %d' % (one_p, len(output)))
            except Exception:
                pass  # print('...parsing failed...', end='\t')
        print('Verification done! HTTP proxies in total: %d\n' % len(output))
        return output

    def update_all(self):  # Deep update: crawl fresh proxies, merge with the saved ones, verify all.
        tmp = self.__spider.crawl()
        tmp.update(self.__read_file())
        self.__write_file(self.verify_Proxies(tmp))

    def first_crawl(self):
        tmp = self.__spider.crawl_for_init()
        tmp.update(self.__read_file())
        self.__write_file(self.verify_Proxies(tmp))
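
# Hedged sketch of the anonymity check that verify_Proxies leaves commented out:
# fetch an IP-echo service through the proxy and compare the echoed address with
# the proxy's own IP. The endpoint httpbin.org/ip is an assumption chosen for
# illustration, not the service used above; any echo service would do.
def looks_anonymous(ip_port):
    import requests
    pro = {'http': 'http://' + ip_port, }
    try:
        origin = requests.get('http://httpbin.org/ip', proxies=pro, timeout=5).json()['origin']
        # A transparent proxy leaks the caller's IP, so origin would differ.
        return origin.split(',')[0].strip() == ip_port.split(':')[0]
    except Exception:
        return False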

if __name__ == '__main__':
    free_p = Free_proxy_pool()  # One light update at startup to drop dead proxies, then one every 10 minutes.
    A_timer = threading.Timer(0.5, free_p.light_update)
    A_timer.start()
    pp = multiprocessing.Process(target=free_p.first_crawl)
    pp.start()  # Run the deep update in a separate process.
--------------------------------------------------------------------------------
/spiders.py:
--------------------------------------------------------------------------------

# -*- coding: utf-8 -*-
'''
@author: yaleimeng@sina.com (C) Copyright 2017.
@desc: Spider class; crawls proxy IPs for the proxy pool.
@DateTime: Created on 2017/10/16, at 8:47 '''
import chardet, re, time
import requests as rq
from bs4 import BeautifulSoup as bs


class Proxy_Spider(object):
    proxies_got = set()

    def request_page(self, page, proxy=None, wait=2):
        head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML,'
                              ' like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
        try:
            r = rq.get(page, headers=head, proxies=proxy, timeout=wait)
            r.encoding = chardet.detect(r.content)['encoding']
            return bs(r.text, 'lxml')
        except Exception:
            print('...failed to fetch the page.', end='\t')
            return None

    def crawl(self):
        self.__get_All66()  # Crawl volume is configurable; the more proxies fetched, the longer verification takes.
        self.__get_All89()  # Given the verification cost, the spiders below can be enabled selectively.
        return self.proxies_got

    def crawl_for_init(self):
        self.__xiao_shu()  # Publishes 2 posts a day; only suitable for one-off collection.
        # self.__ihuan()  # The site is under heavy load; do not crawl it frequently.
        self.__zhima()
        self.__kaixin()
        return self.proxies_got

    def __rows_from(self, url, exp=None):  # Extract rows from an HTML table; the seo and codeBusy spiders used this approach.
        express = 'table tbody tr' if exp is None else exp
        soup = self.request_page(url, wait=3)
        return [] if soup is None else soup.select(express)  # An empty list keeps the callers' loops safe.

    def __ihuan(self):
        urls = ['https://ip.ihuan.me/?page={}&anonymity=2'.format(str(i)) for i in range(1, 31)]
        for url in urls:
            for info in self.__rows_from(url):
                item = info.select('td')
                address = item[0].text + ':' + item[1].text
                if address not in self.proxies_got:
                    self.proxies_got.add(address)
            print('ihuan collected, total IPs in pool:', len(self.proxies_got))

    def __parse_by_re(self, url, reg_exp=re.compile(r'\w+\.\w+\.\w+\.\w+:\w+')):
        # Regex extraction; the 66ip, 89ip, QQ_room and kaixin spiders use this parsing style.
        soup = self.request_page(url)  # Fetch once and reuse the result.
        return [] if soup is None else reg_exp.findall(str(soup))

    def __get_All66(self):
        urls = ['http://www.66ip.cn/nmtq.php?getnum=300&isp=0&anonymoustype={}&start=&ports=&export=&ipaddress='
                '&area=1&proxytype=0&api=66ip '.format(str(i)) for i in range(3, 5)]
        # Collects the two domestic HTTP proxy types: high-anonymity and elite.
        # For foreign proxies set area=2; for HTTPS set proxytype=1.
        for url in urls:
            self.proxies_got.update(self.__parse_by_re(url))  # Add the proxies found to the set.
            print('66ip.cn collected, total IPs in pool:', len(self.proxies_got))
            time.sleep(1.1)

    def __get_All89(self):
        url = 'http://www.89ip.cn/tiqv.php?sxb=&tqsl=400&ports=&ktip=&xl=on&submit=%CC%E1++%C8%A1'
        find_out = self.__parse_by_re(url)
        self.proxies_got.update(find_out)
        print('89ip.cn collected, total IPs in pool:', len(self.proxies_got))

    def __xiao_shu(self):
        page_list = []
        soup = self.request_page('http://www.xsdaili.com/')
        if soup is None:
            return
        news = soup.select('div.title a')[:6]  # Take the 6 most recent posts.
        for info in news:
            link = 'http://www.xsdaili.com' + info.get('href')
            page_list.append(link)
        for page in page_list:
            self.proxies_got.update(self.__parse_by_re(page))
            print('xsdaili collected, total IPs in pool:', len(self.proxies_got))
            time.sleep(0.5)

    def __zhima(self):
        # Zhima proxy publishes domestic proxy IPs every hour.
        page_list = []
        soup = self.request_page('https://h.zhimaruanjian.com/free/')
        if soup is None:
            return
        for info in soup.select('div.titles a')[:4]:  # Only the first 4 posts on the front page.
            link = 'https://h.zhimaruanjian.com{}'.format(info.get('href'))
            page_list.append(link)

        for page in page_list:
            self.proxies_got.update(self.__parse_by_re(page))
            print('zhima collected, total IPs in pool:', len(self.proxies_got))
            time.sleep(0.5)

    def __kaixin(self):
        # Kaixin proxy publishes one post of domestic proxy IPs per day.
        # Selector note: div.cont_list > ul > li:nth-child(1) > a.title
        page_list = []
        soup = self.request_page('http://www.kxdaili.com/daili.html')
        if soup is None:
            return
        for info in soup.select('div.cont_list a.title')[:1]:  # Only the most recent post.
            link = 'http://www.kxdaili.com{}'.format(info.get('href'))
            page_list.append(link)

        for page in page_list:
            self.proxies_got.update(self.__parse_by_re(page))
            print('kaixin collected, total IPs in pool:', len(self.proxies_got))
            time.sleep(0.5)
--------------------------------------------------------------------------------
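
Adding another source is mostly mechanical: one more private method in Proxy_Spider that feeds __parse_by_re, plus a call in crawl() or crawl_for_init(). A minimal sketch, assuming a hypothetical site (the URL is a placeholder, not a real source):

    def __my_site(self):
        # Extract every "IP:port" on the page via the shared regex helper.
        self.proxies_got.update(self.__parse_by_re('http://example.com/free-proxies.html'))
        print('example.com collected, total IPs in pool:', len(self.proxies_got))
        time.sleep(0.5)  # Stay polite to the source site.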