├── .gitignore ├── LICENSE ├── README.md ├── demo.png ├── get-proxy.py ├── proxy_ip.json ├── requirements.txt └── rules.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Mario 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FreeProxyPool 免费代理IP池 2 | 多线程爬取多个免费代理IP网站,获取并验证可用代理IP,构建免费代理池,内含多种爬取规则,支持定时爬取,对接 10+ 免费代理源,返回JSON格式,方便对接其他程序 3 | 4 | ``` 5 | _____ _____ _____ _____ _____ _____ _____ __ __ __ __ _____ _____ _____ _ 6 | | ___| | _ \ | ____| | ____| | _ \ | _ \ / _ \ \ \ / / \ \ / / | _ \ / _ \ / _ \ | | 7 | | |__ | |_| | | |__ | |__ | |_| | | |_| | | | | | \ \/ / \ \/ / | |_| | | | | | | | | | | | 8 | | __| | _ / | __| | __| | ___/ | _ / | | | | } { \ / | ___/ | | | | | | | | | | 9 | | | | | \ \ | |___ | |___ | | | | \ \ | |_| | / /\ \ / / | | | |_| | | |_| | | |___ 10 | |_| |_| \_\ |_____| |_____| |_| |_| \_\ \_____/ /_/ \_\ /_/ |_| \_____/ \_____/ |_____| 11 | ``` 12 | 13 | 14 | ### 已支持的免费代理源 15 | 如果有新的质量不错的免费代理网站,欢迎反馈或者根据规则尝试自行适配! 16 | 17 | | 代理名称 | 状态 | 地址 | 18 | | ------- | ---- | ---- | 19 | | 66免费代理 | √ | | 20 | | 快代理 | √ | | 21 | | 云代理 | √ | | 22 | | 89免费代理 | √ | | 23 | | 泥马代理 | √ | | 24 | | 西拉代理 | √ | | 25 | | 站大爷 | √ | | 26 | | 开心代理 | √ | | 27 | | 高可用全球免费代理库 | √ | | 28 | | 小舒代理 | √ | | 29 | | 太阳HTTP | √ | | 30 | | 小幻HTTP代理 | √ | | 31 | | 齐云代理 | √ | | 32 | 33 | 34 | ### 使用说明 35 | 1. 安装需要的支持库 `pip install -r requirements.txt` 36 | 37 | 2. 运行 `get-proxy.py`,根据提示输入即可 38 | 39 | 40 | ### 运行结果 41 | 42 | ![jieguo](https://raw.githubusercontent.com/Fog-Forest/free-proxy-pool/main/demo.png) 43 | 44 | 45 | ### 注意事项 46 | 本项目仅仅是一个简单的脚本,请勿用作非法用途,通常免费代理质量都较差,不建议用于爬虫,本项目依然不够完善,后续随缘更新,如果发现 bug 或有新的功能添加,欢迎在 [Issues](https://github.com/Fog-Forest/free-proxy-pool/issues) 中反馈,同时也可以到我的 [博客](https://m1314.cn/) 中留言。 47 | -------------------------------------------------------------------------------- /demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fog-Forest/free-proxy-pool/9bde06edc5e6904fec3644fc859721be1e304501/demo.png -------------------------------------------------------------------------------- /get-proxy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # @Author: 蘑菇君 4 | # @Date : 2021/01/03 5 | # @Desc : 自动获取免费代理IP 6 | 7 | import threading 8 | 9 | from rules import * 10 | 11 | proxy_ok_ip = [] # 验证后的代理IP列表 12 | 13 | 14 | # 复写Thread类 15 | class MyThread(threading.Thread): 16 | def __init__(self, func, args): 17 | threading.Thread.__init__(self) 18 | self.func = func 19 | self.args = args 20 | 21 | def run(self): 22 | self.func(*self.args) 23 | 24 | 25 | # 获取普通代理IP函数 26 | def get_ip(): 27 | # 定义一个获取IP的线程池,如果你有其他接口可以往里加 28 | threads_ip = [MyThread(ip_api, args=( 29 | "http://www.66ip.cn/mo.php?sxb=&tqsl=7000&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=", "66免费代理")), 30 | MyThread(ip_html2, args=("https://www.kuaidaili.com/free/intr/", "快代理", 20, 2)), 31 | MyThread(ip_html2, args=("http://www.ip3366.net/free/?stype=2&page=", "云代理", 7, 1)), 32 | MyThread(ip_api, args=( 33 | "http://www.89ip.cn/tqdl.html?api=1&num=3000&port=&address=&isp=", "89免费代理(未知类型)")), 34 | MyThread(ip_html1, args=("http://www.nimadaili.com/putong/", "泥马代理", 100, 1)), 35 | MyThread(ip_html1, args=("http://www.xiladaili.com/putong/", "西拉代理", 100, 1)), 36 | MyThread(ip_article1, args=("https://www.zdaye.com/dayProxy.html", "站大爷(未知类型)", 14, 4)), 37 | MyThread(ip_html2, args=("http://www.kxdaili.com/dailiip/2/", "开心代理", 9, 2))] 38 | for b in threads_ip: 39 | b.start() 40 | for b in threads_ip: 41 | b.join() 42 | 43 | 44 | # 获取匿名代理IP函数 45 | def get_anonymous_ip(): 46 | # 定义一个获取IP的线程池,如果你有其他接口可以往里加 47 | threads_ip = [MyThread(ip_api, args=( 48 | "http://www.66ip.cn/nmtq.php?getnum=3000&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip", 49 | "66免费代理")), 50 | MyThread(ip_html2, args=("https://www.kuaidaili.com/free/inha/", "快代理", 20, 2)), 51 | MyThread(ip_html2, args=("http://www.ip3366.net/free/?stype=1&page=", "云代理", 7, 1)), 52 | MyThread(ip_api, args=( 53 | "http://www.89ip.cn/tqdl.html?api=1&num=3000&port=&address=&isp=", "89免费代理(未知类型)")), 54 | MyThread(ip_html1, args=("http://www.nimadaili.com/gaoni/", "泥马代理", 100, 1)), 55 | MyThread(ip_html1, args=("http://www.xiladaili.com/gaoni/", "西拉代理", 100, 1)), 56 | MyThread(ip_html2, args=("https://www.7yip.cn/free/?action=china&page=", "齐云代理", 90, 2)), 57 | MyThread(ip_html2, args=("https://ip.jiangxianli.com/?page=", "高可用全球免费代理库", 8, 0)), 58 | MyThread(ip_article1, args=("http://www.xsdaili.cn/", "小舒代理", 6, 2)), 59 | MyThread(ip_article1, args=("https://www.zdaye.com/dayProxy.html", "站大爷(未知类型)", 14, 4)), 60 | MyThread(ip_html2, args=("http://www.kxdaili.com/dailiip/1/", "开心代理", 9, 2)), 61 | MyThread(ip_html4, args=("http://http.taiyangruanjian.com/free/page", "太阳HTTP", 7, 2)), 62 | MyThread(ip_article2, args=("https://ip.ihuan.me/today.html", "小幻HTTP代理", 2))] 63 | for b in threads_ip: 64 | b.start() 65 | for b in threads_ip: 66 | b.join() 67 | 68 | 69 | # 验证代理函数 70 | def check_ip(ip, site, word, code): 71 | global proxy_ok_ip 72 | try: 73 | proxy_temp = {"http": ip, "https": ip} 74 | res = requests.get(site, headers=headers, proxies=proxy_temp, timeout=10) # 验证超时时间,默认10秒 75 | if code == "2": 76 | res.encoding = "gbk" 77 | else: 78 | res.encoding = "utf-8" 79 | if word in res.text: # 判断关键词是否在网站源码中 80 | # print(res, ip + " is OK") 81 | proxy_ok_ip.append(ip) 82 | else: 83 | # print(ip + " is BOOM") 84 | pass 85 | except: 86 | # print(ip + " is BOOM") 87 | pass 88 | 89 | 90 | # 列表写入json文件函数:filename为写入json文件的路径,data为要写入数据列表 91 | def text_save(filename, data): 92 | file = open(filename, "w+") 93 | content = { 94 | "total": len(data), 95 | "data": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), 96 | "ip": data 97 | } 98 | file.write(str(content).replace("'", '"')) 99 | file.close() 100 | print("代理IP信息保存文件成功,请查看当前运行目录!") 101 | 102 | 103 | # 列表去重函数 104 | def check_list(lists): 105 | temp = [] 106 | for i in lists: 107 | if i not in temp: 108 | temp.append(i) 109 | return temp 110 | 111 | 112 | if __name__ == "__main__": 113 | input_ip_type = input("请选择你要获取的代理IP类型(1.普通 2.高匿(默认)):") 114 | input_thread_num = 500 # 验证的线程数(默认500) 115 | input_wait_time = input("请输入每次重新获取的间隔时间,免费IP不要太快,单位(秒):") 116 | 117 | while True: 118 | proxy_ip.clear() # 代理IP列表 119 | proxy_ok_ip.clear() # 验证后的代理IP列表 120 | if input_ip_type == "1": 121 | print("\n正在获取[普通]代理IP中请稍等片刻,大概3min... _(:з」∠)_") 122 | get_ip() 123 | else: 124 | print("\n正在获取[高匿]代理IP中请稍等片刻,大概3min... _(:з」∠)_") 125 | get_anonymous_ip() 126 | ip_list = check_list(proxy_ip) # 去重 127 | 128 | # 验证可用性✔ 129 | demo_site = "https://www.baidu.com" 130 | demo_code = str(1) # 1.UTF-8(默认) 2.GBK 131 | demo_word = "百度一下" # 如 “https://www.baidu.com” 中有关键字符串 “百度一下” :") 132 | 133 | # 多线程验证开始,GO! GO! GO! 134 | k = 0 135 | threads = [] # 定义一个线程池 136 | thread_count = len(ip_list) // int(input_thread_num) + 1 # 分几段线程 137 | for i in range(thread_count): 138 | for j in range(int(input_thread_num)): # 一段几线程 139 | try: 140 | # 创建新线程,添加到线程池 141 | threads.append(MyThread(check_ip, args=(ip_list[k], demo_site, demo_word, demo_code))) 142 | k += 1 143 | except: 144 | break 145 | # 开启所有线程 146 | for t in threads: 147 | t.start() 148 | # 等待所有线程完成 149 | for t in threads: 150 | t.join() 151 | print("共获取到" + str(len(ip_list)) + "个代理IP,可用IP总数为" + str(len(proxy_ok_ip)) + "个") 152 | text_save("proxy_ip.json", proxy_ok_ip) 153 | 154 | time.sleep(int(input_wait_time)) 155 | print("\n循环获取ing...\n") 156 | -------------------------------------------------------------------------------- /proxy_ip.json: -------------------------------------------------------------------------------- 1 | { 2 | "total": 421, 3 | "data": "2021-01-21 01:52:48", 4 | "ip": ["58.220.95.90:9401", "183.166.132.74:9999", "221.122.91.59:80", "110.243.10.59:9999", "58.220.95.54:9400", "115.221.240.40:9999", "58.220.95.44:10174", "221.122.91.34:80", "171.35.173.90:9999", "221.122.91.76:9480", "124.94.250.119:9999", "117.95.162.112:9999", "117.95.200.232:9999", "110.243.12.155:9999", "58.220.95.78:9401", "58.220.95.116:10122", "58.220.95.34:10174", "221.122.91.60:80", "113.124.86.33:9999", "58.220.95.79:10000", "58.220.95.30:10174", "183.220.145.3:80", "58.220.95.114:10053", "110.243.21.29:9999", "49.75.59.242:3128", "221.122.91.65:80", "58.220.95.86:9401", "221.122.91.61:80", "218.60.8.99:3129", "218.60.8.83:3129", "211.144.213.145:80", "221.122.91.64:80", "123.54.46.86:9999", "221.122.91.66:80", "115.211.188.155:9999", "171.35.212.30:9999", "183.6.183.35:3128", "171.35.212.99:9999", "221.122.91.64:9401", "120.83.98.159:9999", "120.83.111.81:9999", "60.191.11.241:3128", "39.84.126.132:9999", "221.5.80.66:3128", "122.224.65.197:3128", "120.83.103.49:9999", "223.247.169.40:9999", "58.253.159.244:9999", "223.247.171.6:9999", "120.83.96.139:9999", "59.36.10.52:3128", "51.75.147.40:3128", "45.77.32.34:3128", "116.196.85.150:3128", "171.35.213.109:9999", "150.242.182.98:80", "115.221.240.126:9999", "120.83.100.235:9999", "106.110.195.28:9999", "115.221.241.84:9999", "51.75.147.41:3128", "221.122.91.74:9401", "171.35.212.174:9999", "107.191.63.234:8888", "89.250.152.76:8080", "190.83.31.16:8080", "114.239.147.47:9999", "61.160.210.223:808", "171.35.215.53:9999", "171.35.167.242:9999", "113.195.171.207:9999", "171.35.163.120:9999", "88.255.102.106:8080", "150.138.253.72:808", "171.35.214.46:9999", "113.195.153.248:9999", "150.138.253.70:808", "113.195.170.25:9999", "150.138.253.71:808", "112.47.3.53:3128", "110.243.3.187:9999", "114.249.112.214:9000", "110.243.10.231:9999", "171.35.214.79:9999", "39.156.3.66:80", "167.71.40.51:3128", "190.242.45.124:999", "58.18.72.149:9999", "116.112.250.14:9999", "139.180.202.227:3128", "201.20.39.235:8080", "122.5.109.42:9999", "59.29.245.151:3128", "178.128.117.81:3128", "178.209.51.218:9999", "220.174.236.211:8091", "171.35.222.156:9999", "200.73.129.128:8080", "61.160.210.234:808", "113.195.224.181:9999", "103.224.195.41:3128", "36.248.132.170:9999", "171.35.223.127:9999", "49.70.17.234:9999", "171.35.166.247:9999", "88.82.95.146:3128", "185.198.188.55:8080", "95.0.219.201:8080", "45.82.245.34:3128", "45.76.111.235:3128", "60.216.20.211:8001", "155.138.156.161:8888", "201.44.1.36:3128", "45.186.144.214:999", "171.35.167.36:9999", "113.194.30.226:9999", "110.243.30.182:9999", "118.70.12.171:53281", "178.128.117.81:8888", "221.122.91.75:10286", "187.243.240.54:8080", "113.194.29.10:9999", "202.154.180.53:46717", "171.35.166.80:9999", "95.181.49.26:8080", "217.6.21.170:8080", "115.221.247.80:9999", "45.32.100.224:3128", "104.248.123.76:18080", "107.178.9.186:8080", "79.110.52.243:3128", "150.107.75.82:1347", "223.27.194.68:80", "200.116.226.210:43049", "113.195.225.136:9999", "41.191.228.10:8080", "192.53.117.107:8080", "125.25.45.181:80", "41.180.47.42:8080", "110.243.22.94:9999", "103.36.11.161:14571", "14.20.235.223:9797", "128.14.178.94:3128", "113.195.225.25:9999", "139.180.154.253:8888", "188.186.180.135:8080", "49.70.95.49:9999", "54.89.151.62:17802", "110.189.152.86:40698", "207.74.82.103:3128", "171.35.222.142:9999", "102.33.21.34:8080", "201.142.225.244:8080", "113.194.30.136:9999", "171.35.168.200:9999", "58.255.207.247:9999", "51.158.180.179:8811", "113.194.30.239:9999", "113.195.224.168:9999", "115.221.247.151:9999", "91.194.247.247:3333", "193.178.50.49:3128", "34.68.180.189:3128", "122.5.107.145:9999", "45.251.228.217:8080", "217.195.203.28:3130", "89.175.188.58:56263", "181.129.98.146:8080", "195.60.174.123:39635", "139.5.153.177:3888", "113.195.153.155:9999", "103.22.248.59:61661", "51.79.144.52:3128", "113.194.31.73:9999", "60.13.42.135:9999", "85.173.165.36:46330", "113.195.157.80:9999", "110.74.203.249:8080", "41.220.134.88:8080", "192.46.229.111:8080", "103.110.90.250:8080", "120.83.98.26:9999", "171.35.167.24:9999", "46.151.145.4:53281", "144.52.197.24:9999", "58.253.156.43:9999", "203.81.75.37:8080", "78.111.106.34:3128", "51.158.123.35:9999", "163.172.125.147:8080", "51.75.147.44:3128", "181.78.19.82:999", "186.226.172.165:57783", "51.79.144.52:8080", "177.37.161.4:41819", "176.236.157.154:8080", "171.35.163.92:9999", "159.65.112.248:8888", "193.56.255.131:3128", "187.216.90.46:53281", "182.46.216.99:9999", "171.35.172.97:9999", "201.16.224.201:80", "68.183.202.76:80", "45.168.82.22:8088", "171.35.169.185:9999", "113.194.29.12:9999", "185.198.188.51:8080", "217.88.77.66:8080", "113.195.154.159:9999", "178.128.63.189:3128", "50.203.182.243:8080", "183.247.152.98:53281", "36.89.218.67:8889", "87.249.217.57:3128", "223.247.171.9:9999", "189.84.48.122:8080", "171.35.173.38:9999", "110.243.13.171:9999", "58.253.153.176:9999", "171.35.163.20:9999", "113.120.32.104:9999", "119.176.175.30:9999", "113.194.29.99:9999", "210.26.49.89:3128", "124.41.240.203:55948", "221.182.31.54:8080", "223.247.169.176:9999", "36.72.5.114:80", "113.128.123.182:9999", "113.194.29.153:9999", "171.35.223.224:9999", "78.111.97.181:3141", "123.169.100.52:9999", "93.153.95.186:56603", "41.72.203.66:38057", "45.7.200.98:8080", "186.148.184.132:999", "203.210.84.50:8089", "36.92.85.66:8080", "171.35.214.253:9999", "110.243.4.208:9999", "190.12.95.170:47029", "112.91.78.66:9999", "62.213.14.166:8080", "192.53.117.107:9999", "37.77.128.162:8080", "223.247.168.121:9999", "113.194.29.238:9999", "124.41.211.211:43979", "78.111.106.35:3128", "113.195.152.179:9999", "189.204.242.178:8080", "113.194.148.26:9999", "209.126.4.134:3128", "171.35.169.113:9999", "103.151.226.133:8080", "183.89.64.13:8080", "110.243.22.223:9999", "103.134.213.55:8080", "121.40.138.182:8080", "187.45.147.193:3131", "36.91.163.10:8089", "171.35.173.112:9999", "113.194.48.17:9999", "114.239.198.158:9999", "223.247.168.79:9999", "113.195.224.110:9999", "113.195.224.203:9999", "87.103.202.246:3128", "202.47.66.60:80", "218.66.253.145:80", "218.62.125.195:9999", "122.50.5.148:10000", "1.20.169.206:8080", "14.102.152.158:8080", "212.126.107.2:31475", "77.70.35.87:37475", "78.186.236.210:9090", "171.35.169.3:9999", "222.189.191.186:9999", "212.66.61.118:37141", "185.198.188.48:8080", "37.120.168.223:8888", "182.34.36.64:9999", "123.169.116.2:9999", "181.64.107.96:999", "51.79.173.168:8080", "171.35.169.37:9999", "219.131.243.165:9797", "60.167.132.87:8888", "54.254.24.192:3128", "185.67.95.179:3128", "190.120.249.249:999", "13.76.38.173:3128", "41.229.253.214:8080", "195.248.242.77:8080", "45.174.78.33:999", "37.29.91.178:8080", "36.90.119.224:8080", "122.152.55.71:55443", "110.243.9.28:9999", "113.195.169.97:9999", "113.194.148.248:9999", "110.243.15.54:9999", "171.35.215.164:9999", "41.90.245.23:8080", "181.209.97.75:999", "115.221.244.207:9999", "190.110.219.130:8080", "202.162.199.67:8080", "113.128.36.128:47535", "110.36.181.125:8080", "180.183.133.10:8080", "5.58.81.19:8080", "110.38.74.58:8080", "103.124.136.73:8080", "114.101.253.117:9999", "94.25.104.250:8080", "42.238.82.203:9999", "171.35.173.124:9999", "222.189.191.216:9999", "187.111.176.193:8080", "181.112.164.137:999", "61.145.49.240:9999", "37.120.192.154:8080", "112.111.77.95:9999", "49.89.67.124:9999", "113.195.168.28:9999", "171.35.221.14:9999", "121.226.214.117:9999", "171.35.215.22:9999", "110.243.15.196:9999", "36.67.168.117:8080", "110.243.9.41:9999", "61.145.48.8:9999", "182.46.214.237:9999", "141.105.174.47:80", "113.194.29.221:9999", "45.7.205.103:39750", "37.230.147.206:8080", "110.243.28.154:9999", "113.195.224.164:9999", "113.195.155.204:9999", "202.70.84.1:8080", "134.249.156.228:4469", "118.99.113.131:8080", "171.35.170.30:9999", "221.6.201.74:9999", "202.180.54.211:8080", "187.72.42.88:8080", "45.174.92.3:999", "83.175.238.170:55443", "171.35.173.11:9999", "81.163.57.147:41258", "201.91.82.155:3128", "192.162.192.148:55443", "203.189.89.1:53281", "128.199.162.224:8888", "110.243.4.105:9999", "190.184.144.170:58975", "36.248.133.152:9999", "171.35.213.135:9999", "113.194.29.137:9999", "63.249.67.70:53281", "125.26.99.186:41358", "42.7.31.57:9999", "85.175.227.3:7012", "113.128.121.179:9999", "186.225.63.241:8080", "41.220.138.235:8080", "176.9.166.50:3128", "36.67.57.45:30066", "190.53.38.98:46340", "110.243.9.20:9999", "114.239.199.181:9999", "190.90.18.22:999", "110.243.12.126:9999", "189.193.198.79:8080", "113.194.29.25:9999", "113.194.28.158:9999", "171.35.213.123:9999", "113.194.150.73:9999", "202.166.211.48:30753", "122.138.139.21:9999", "113.121.248.73:9999", "113.195.224.208:9999", "110.76.128.53:42670", "96.9.77.71:8080", "113.194.31.70:9999", "106.110.212.122:9999", "182.46.252.216:9999", "110.243.10.35:9999", "49.89.103.212:9999", "24.172.34.114:49920", "110.243.26.64:9999", "171.35.173.91:9999", "110.243.26.36:9999", "183.166.70.72:9999", "113.195.145.226:9999", "182.32.163.174:9999", "114.239.0.81:9999", "114.239.150.189:9999", "101.75.166.167:9999", "49.89.103.204:9999", "116.112.254.73:9999", "103.89.152.190:8080", "122.5.107.137:9999", "171.35.213.181:9999", "123.169.114.13:9999", "183.166.103.101:9999", "36.37.81.135:8080", "13.212.167.205:80", "103.12.161.38:55443", "128.14.163.92:3128", "190.7.141.66:47576", "110.243.23.184:9999", "182.34.20.189:9999", "109.195.194.79:60992", "188.168.27.71:36733", "36.37.177.186:8080"] 5 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.9.3 2 | certifi==2020.12.5 3 | chardet==4.0.0 4 | html5lib==1.1 5 | idna==2.10 6 | requests==2.25.1 7 | six==1.15.0 8 | soupsieve==2.1 9 | urllib3==1.26.2 10 | webencodings==0.5.1 11 | -------------------------------------------------------------------------------- /rules.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # @Author: 蘑菇君 4 | # @Date : 2021/01/03 5 | # @Desc : 网站爬取规则 6 | 7 | import re 8 | import time 9 | 10 | import requests 11 | from bs4 import BeautifulSoup 12 | 13 | # 全局变量 14 | headers = { 15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36" 16 | } 17 | 18 | proxy_ip = [] # 代理IP列表 19 | 20 | 21 | # 1. 使用正则获取代理IP函数(接口提取形式):提取接口URL、备注名称 22 | def ip_api(link, text): 23 | global proxy_ip 24 | ip_num = 0 25 | try: 26 | url = str(link) 27 | response = requests.get(url, headers=headers, timeout=8) 28 | temp = re.findall(r'(\d+.\d+.\d+.\d+:\d+)', response.text) 29 | for ip in temp: 30 | ip_num += 1 # 自增计算IP数 31 | proxy_ip.append(ip) 32 | except Exception as e: 33 | print(e) 34 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 35 | 36 | 37 | # 2. 使用HTML解析获取代理IP函数【一】(源码IP和端口在一起且在第1个标签内):网页链接、备注名称、爬取页数、爬取速度 38 | def ip_html1(link, text, page, speed): 39 | global proxy_ip 40 | ip_num = 0 41 | try: 42 | for a in range(1, page): # 爬取多少页 43 | url = str(link) + str(a) 44 | response = requests.get(url, headers=headers, timeout=8) 45 | # print(response.text) 46 | soupIP = BeautifulSoup(response.text, 'html5lib') 47 | trs = soupIP.find_all('tr') 48 | for tr in trs[1:]: 49 | ip_num += 1 # 自增计算IP数 50 | tds = tr.find_all('td') 51 | # print(tds) 52 | ip = tds[0].text.strip() 53 | proxy_ip.append(ip) 54 | time.sleep(speed) # 控制访问速度(很重要,如果访问太快被封IP就不能继续爬了) 55 | except Exception as e: 56 | print(e) 57 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 58 | 59 | 60 | # 3. 使用HTML解析获取代理IP函数【二】(源码IP和端口不在一起,IP在第1个标签内,端口在第2个标签内):网页链接、备注名称、爬取页数、爬取速度 61 | def ip_html2(link, text, page, speed): 62 | global proxy_ip 63 | ip_num = 0 64 | try: 65 | for a in range(1, page): # 爬取多少页 66 | 67 | # 部分URL特殊处理 68 | if "www.kxdaili.com" in link.split("/"): 69 | url = str(link) + str(a) + ".html" 70 | else: 71 | url = str(link) + str(a) 72 | 73 | response = requests.get(url, headers=headers, timeout=8) 74 | soupIP = BeautifulSoup(response.text, 'html5lib') 75 | trs = soupIP.find_all('tr') 76 | for tr in trs[1:]: 77 | ip_num += 1 # 自增计算IP数 78 | tds = tr.find_all('td') 79 | # print(tds) 80 | ip = tds[0].text.strip() 81 | port = tds[1].text.strip() 82 | proxy_ip.append(ip + ':' + port) 83 | time.sleep(speed) # 控制访问速度(很重要,如果访问太快被封IP就不能继续爬了) 84 | except Exception as e: 85 | print(e) 86 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 87 | 88 | 89 | # 4. 使用HTML解析获取代理IP函数【三】(源码IP和端口不在一起,IP在第2个标签内,端口在第3个标签内):网页链接、备注名称、爬取页数、爬取速度 90 | def ip_html3(link, text, page, speed): 91 | global proxy_ip 92 | ip_num = 0 93 | try: 94 | for a in range(1, page): # 爬取多少页 95 | url = str(link) + str(a) 96 | response = requests.get(url, headers=headers, timeout=8) 97 | # print(response.text) 98 | soupIP = BeautifulSoup(response.text, 'html5lib') 99 | trs = soupIP.find_all('tr') 100 | for tr in trs[1:]: 101 | ip_num += 1 # 自增计算IP数 102 | tds = tr.find_all('td') 103 | # print(tds) 104 | ip = tds[1].text.strip() 105 | port = tds[2].text.strip() 106 | proxy_ip.append(ip + ':' + port) 107 | time.sleep(speed) # 控制访问速度(很重要,如果访问太快被封IP就不能继续爬了) 108 | except Exception as e: 109 | print(e) 110 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 111 | 112 | 113 | # 5. 使用HTML解析获取代理IP函数【四】(源码IP和端口不在一起,IP在第1个
标签内,端口在第2个
标签内):网页链接、备注名称、爬取页数、爬取速度 114 | def ip_html4(link, text, page, speed): 115 | global proxy_ip 116 | ip_num = 0 117 | try: 118 | for a in range(1, page): # 爬取多少页 119 | url = str(link) + str(a) 120 | response = requests.get(url, headers=headers, timeout=8) 121 | soupIP = BeautifulSoup(response.text, 'html5lib') 122 | divs = soupIP.find_all('div', class_="tr") 123 | for div in divs[1:]: 124 | ip_num += 1 # 自增计算IP数 125 | tds = div.find_all('div', class_="td") 126 | # print(tds) 127 | ip = tds[0].text.strip() 128 | port = tds[1].text.strip() 129 | proxy_ip.append(ip + ':' + port) 130 | time.sleep(speed) # 控制访问速度(很重要,如果访问太快被封IP就不能继续爬了) 131 | except Exception as e: 132 | print(e) 133 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 134 | 135 | 136 | # 6. 小舒代理函数(发布代理文章这类的网站):网页链接、备注名称、爬取页数、爬取速度 137 | def ip_article1(link, text, page, speed): 138 | global proxy_ip 139 | ip_num = 0 140 | try: 141 | url = str(link) 142 | response = requests.get(url, headers=headers, timeout=8) 143 | page_urls = re.findall(r'(?<=href="/dayProxy/ip/).*?(?=">20)', response.text) 144 | for a in range(int(page)): # 爬取前几篇文章内容 145 | 146 | # 部分URL特殊处理 147 | if "www.zdaye.com" in link.split("/"): 148 | page_url = "https://www.zdaye.com/dayProxy/ip/" + page_urls[a] 149 | else: 150 | page_url = link + "dayProxy/ip/" + page_urls[a] 151 | 152 | # 爬取文章内容 153 | article = requests.get(page_url, headers=headers, timeout=8) 154 | temp = re.findall(r'(\d+.\d+.\d+.\d+:\d+)', article.text) 155 | for ip in temp: 156 | ip_num += 1 # 自增计算IP数 157 | proxy_ip.append(ip) 158 | time.sleep(speed) 159 | except Exception as e: 160 | print(e) 161 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 162 | 163 | 164 | # 7. 小幻HTTP代理函数(小幻HTTP代理专用):网页链接、备注名称、爬取速度 165 | def ip_article2(link, text, speed): 166 | global proxy_ip 167 | ip_num = 0 168 | try: 169 | response = requests.get(link, headers=headers, timeout=20) 170 | soupIP = BeautifulSoup(response.text, 'html5lib') 171 | divs = soupIP.find_all('div', class_='bs-callout bs-callout-info') 172 | for div in divs: 173 | ip_num += 1 # 自增计算IP数 174 | hrefs = div.find_all('a') 175 | url = hrefs[0].attrs['href'] 176 | 177 | # 爬取文章内容 178 | article = requests.get("https://ip.ihuan.me/today" + url, headers=headers, timeout=20) 179 | temp = re.findall(r'(\d+.\d+.\d+.\d+:\d+)', article.text) 180 | for ip in temp: 181 | ip_num += 1 # 自增计算IP数 182 | proxy_ip.append(ip) 183 | time.sleep(speed) # 控制访问速度(很重要,如果访问太快被封IP就不能继续爬了) 184 | except Exception as e: 185 | print(e) 186 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 187 | --------------------------------------------------------------------------------