├── LICENSE ├── README.md ├── test_proxy.py └── zdy.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 HongWei Deng 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # squid 代理池搭建 2 | 3 | ## 简介 4 | 5 | 使用squid以及收费代理搭建爬虫代理池,自动获取最新可用代理并写入squid配置文件。 6 | 7 | 具体介绍及思路参考文章:[自己搭建亿级爬虫IP代理池](http://www.xnathan.com/2017/03/02/squid-proxy-pool/) 8 | 9 | ## 运行 10 | 11 | 1. 备份原始squid配置文件 12 | `sudo cp /etc/squid/squid.conf /etc/squid/squid.conf.original` 13 | 14 | 2. 购买 [站大爷](http://ip.zdaye.com) 短效代理 API,修改 `zdy.py`,将 `api_url = 'http://s.zdaye.com/?api=YOUR_API&count=100&fitter=1&px=2'` 改为自己的 api 地址 15 | 16 | 3. 
运行 `sudo python zdy.py`,由于要写文件和重新加载配置,所以要使用 `sudo` 运行。 17 | 18 | ## 检测 19 | 20 | 修改 `test_proxy.py` 中 `139.xxx.xxx.66:3188` 为自己的 squid 服务器地址,每次运行 `test_proxy.py` 都会有不同的ip,表明代理搭建成功。 21 | 22 | ## TODO 23 | 24 | - [ ] 重构代码,避免使用全局变量 25 | - [ ] 支持多种类型的代理 API 接口,不只限定于站大爷 -------------------------------------------------------------------------------- /test_proxy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """ 4 | 代理IP池实例演示 5 | """ 6 | from __future__ import print_function 7 | 8 | import requests 9 | 10 | s = requests.Session() 11 | s.proxies.update({"http": "139.xxx.xxx.66:3188"}) 12 | print(s.get("http://httpbin.org/ip")) 13 | -------------------------------------------------------------------------------- /zdy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # zdy.py 4 | """ 5 | Squid+站大爷搭建代理IP池 6 | Author: xNathan 7 | Blog: https://xnathan.com 8 | Github: https://github.com/xNathan 9 | """ 10 | from gevent import monkey # isort:skip 11 | 12 | monkey.patch_all() # isort:skip 13 | 14 | import logging 15 | import os 16 | import time 17 | 18 | import requests 19 | from gevent.pool import Pool 20 | 21 | logger = logging.getLogger(__name__) 22 | logger.setLevel(logging.INFO) 23 | formatter = logging.Formatter( 24 | "%(asctime)s - %(name)s - %(levelname)s: - %(message)s", datefmt="%Y-%m-%d %H:%M:%S" 25 | ) 26 | 27 | # 使用StreamHandler输出到屏幕 28 | ch = logging.StreamHandler() 29 | ch.setLevel(logging.INFO) 30 | ch.setFormatter(formatter) 31 | 32 | logger.addHandler(ch) 33 | 34 | # Squid的配置文件语法 35 | # 将请求转发到父代理 36 | PEER_CONF = "cache_peer %s parent %s 0 no-query weighted-round-robin weight=1 connect-fail-limit=2 allow-miss max-conn=5\n" 37 | 38 | # 可用代理 39 | GOOD_PROXIES = [] 40 | 41 | pool = Pool(50) 42 | 43 | 44 | def check_proxy(proxy): 45 | """验证代理是否可用 46 | :param proxy list:[ip, port]""" 47 | global 
GOOD_PROXIES 48 | ip, port = proxy 49 | _proxies = {"http": "{}:{}".format(ip, port)} 50 | try: 51 | ip_url = "http://2019.ip138.com/ic.asp" 52 | res = requests.get(ip_url, proxies=_proxies, timeout=10) 53 | assert ip in res.content 54 | logger.info("[GOOD] - {}:{}".format(ip, port)) 55 | GOOD_PROXIES.append(proxy) 56 | except Exception as e: 57 | logger.error("[BAD] - {}:{}, {}".format(ip, port, e)) 58 | 59 | 60 | def update_conf(): 61 | with open("/etc/squid/squid.conf.original", "r") as F: 62 | squid_conf = F.readlines() 63 | squid_conf.append("\n# Cache peer config\n") 64 | for proxy in GOOD_PROXIES: 65 | squid_conf.append(PEER_CONF % (proxy[0], proxy[1])) 66 | with open("/etc/squid/squid.conf", "w") as F: 67 | F.writelines(squid_conf) 68 | 69 | 70 | def get_proxy(): 71 | global GOOD_PROXIES 72 | GOOD_PROXIES = [] 73 | # 1. 获取代理IP资源 74 | api_url = "http://s.zdaye.com/?api=YOUR_API&count=100&fitter=1&px=2" 75 | res = requests.get(api_url).content 76 | if len(res) == 0: 77 | logger.error("no data") 78 | elif "bad" in res: 79 | logger.error("bad request") 80 | else: 81 | logger.info("get all proxies") 82 | proxies = [] 83 | for line in res.split(): 84 | proxies.append(line.strip().split(":")) 85 | pool.map(check_proxy, proxies) 86 | pool.join() 87 | # 2. 写入Squid配置文件 88 | update_conf() 89 | # 3. 重新加载配置文件 90 | os.system("squid -k reconfigure") 91 | logger.info(">>>> DONE! <<<<") 92 | 93 | 94 | def main(): 95 | start = time.time() 96 | while True: 97 | # 每30秒获取一批新IP 98 | if time.time() - start >= 30: 99 | get_proxy() 100 | start = time.time() 101 | time.sleep(5) 102 | 103 | 104 | if __name__ == "__main__": 105 | main() 106 | --------------------------------------------------------------------------------