├── src ├── __init__.py ├── entity │ ├── __init__.py │ └── proxy_entity.py ├── enum │ ├── __init__.py │ └── common.py ├── log │ ├── __init__.py │ └── logger.py ├── spider │ ├── __init__.py │ ├── abs_spider.py │ └── spiders.py ├── web │ ├── __init__.py │ └── web_flask.py ├── database │ ├── __init__.py │ ├── abs_database.py │ └── sqlite_opt.py ├── validator │ ├── __init__.py │ ├── expiration_validator.py │ ├── validator.py │ └── anonymity_validator.py └── runner.py ├── test ├── __init__.py ├── spider │ ├── __init__.py │ ├── test_spider_66_ip.py │ ├── test_spider_xici_Ip.py │ ├── test_spider_ip_hai_ip.py │ ├── test_spider_quan_wang_ip.py │ ├── test_spider_yun_dai_li_ip.py │ ├── test_spider_kuai_dai_li_ip.py │ └── test_spider_mian_fei_dai_li_ip.py ├── database │ ├── __init__.py │ └── test_sqlite_opt.py └── validator │ ├── __init__.py │ ├── test_anonymity_validator.py │ └── test_validator.py ├── main.py ├── requirements.txt ├── Dockerfile ├── LICENSE ├── setting.py ├── .gitignore └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/entity/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/enum/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/log/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/web/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/database/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/validator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/database/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/validator/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from src.runner import run 2 | 3 | if __name__ == '__main__': 4 | run() 5 | -------------------------------------------------------------------------------- 
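main.py simply delegates to src.runner.run(), which starts the APScheduler jobs (crawl, validation, anonymity check, expiration cleanup) and the Flask API on the host/port from WEB_SERVER in setting.py (0.0.0.0:8080 by default). A minimal client sketch for the two JSON endpoints served by src/web/web_flask.py — the base URL below is an assumption for a locally running instance:

```python
# Hedged sketch: query a locally running instance (assumed reachable at 127.0.0.1:8080).
import requests  # already pinned in requirements.txt

BASE = 'http://127.0.0.1:8080'  # assumption: default WEB_SERVER host/port from setting.py

one = requests.get(f'{BASE}/get').json()
if one.get('code') == 200:
    proxy_url = one['proxy']  # e.g. 'http://1.2.3.4:8080'
    # route an outgoing request through the returned proxy
    requests.get('http://www.baidu.com', proxies={'http': proxy_url}, timeout=4)

everything = requests.get(f'{BASE}/get_all').json()
usable = everything.get('proxies', [])  # all proxies with reliability > 0
```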
/requirements.txt: -------------------------------------------------------------------------------- 1 | APScheduler==3.6.1 2 | aiohttp==3.6.0 3 | Flask==1.1.1 4 | SQLAlchemy==1.3.8 5 | requests==2.22.0 6 | beautifulsoup4==4.8.0 7 | typing==3.7.4.1 8 | lxml 9 | pytest -------------------------------------------------------------------------------- /src/enum/common.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, unique 2 | 3 | 4 | @unique 5 | class ProxyTypeEnum(Enum): 6 | UNKNOWN = 0 7 | HTTP = 1 8 | HTTPS = 2 9 | HTTP_AND_HTTPS = 3 10 | 11 | 12 | @unique 13 | class ProxyCoverEnum(Enum): 14 | UNKNOWN = 0 15 | TRANSPARENT = 1 16 | NORMAL_COVER = 2 17 | HIGH_COVER = 3 18 | -------------------------------------------------------------------------------- /src/validator/expiration_validator.py: -------------------------------------------------------------------------------- 1 | from src.database.sqlite_opt import sqlite_opt 2 | from src.log.logger import logger 3 | 4 | 5 | class ExpirationValidator(object): 6 | 7 | def run(self): 8 | logger.info('开始删除不可用代理') 9 | sqlite_opt.remove_all_zero_reliability() 10 | logger.info('不可用代理删除完毕') 11 | 12 | 13 | expiration_validator = ExpirationValidator() 14 | -------------------------------------------------------------------------------- /test/spider/test_spider_66_ip.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import unittest 3 | 4 | from src.spider.spiders import Spider66Ip 5 | 6 | 7 | class TestSpider66Ip(unittest.TestCase): 8 | 9 | def setUp(self) -> None: 10 | self._spider = Spider66Ip() 11 | 12 | def test_crawl(self): 13 | result = asyncio.run(self._spider.crawl()) 14 | assert result 15 | assert len(result) > 0 16 | -------------------------------------------------------------------------------- /test/spider/test_spider_xici_Ip.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import unittest 3 | 4 | from src.spider.spiders import SpiderXiciIp 5 | 6 | 7 | class TestSpiderXiciIp(unittest.TestCase): 8 | 9 | def setUp(self) -> None: 10 | self._spider = SpiderXiciIp() 11 | 12 | def test_crawl(self): 13 | result = asyncio.run(self._spider.crawl()) 14 | assert result 15 | assert len(result) > 0 16 | -------------------------------------------------------------------------------- /test/spider/test_spider_ip_hai_ip.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import unittest 3 | 4 | from src.spider.spiders import SpiderIpHaiIp 5 | 6 | 7 | class TestSpiderXiciIp(unittest.TestCase): 8 | 9 | def setUp(self) -> None: 10 | self._spider = SpiderIpHaiIp() 11 | 12 | def test_crawl(self): 13 | result = asyncio.run(self._spider.crawl()) 14 | assert result 15 | assert len(result) > 0 16 | -------------------------------------------------------------------------------- /test/spider/test_spider_quan_wang_ip.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import unittest 3 | 4 | from src.spider.spiders import SpiderQuanWangIp 5 | 6 | 7 | class TestSpiderQuanWangIp(unittest.TestCase): 8 | 9 | def setUp(self) -> None: 10 | self._spider = SpiderQuanWangIp() 11 | 12 | def test_crawl(self): 13 | result = asyncio.run(self._spider.crawl()) 14 | assert result 15 | assert len(result) > 0 16 | -------------------------------------------------------------------------------- 
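ProxyTypeEnum and ProxyCoverEnum from src/enum/common.py are persisted as their integer .value (the proxy_type / proxy_cover columns in SQLite), so readers of the table can map the stored integers back to enum members. A small illustrative sketch, using nothing beyond the two enums above:

```python
# Hedged sketch: round-tripping the integers stored in SQLite back to enum members.
from src.enum.common import ProxyCoverEnum, ProxyTypeEnum

stored_cover = ProxyCoverEnum.HIGH_COVER.value   # 3, as written by the spiders/validators
assert ProxyCoverEnum(stored_cover) is ProxyCoverEnum.HIGH_COVER

stored_type = ProxyTypeEnum.HTTPS.value          # 2
assert ProxyTypeEnum(stored_type) is ProxyTypeEnum.HTTPS
```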
/test/spider/test_spider_yun_dai_li_ip.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import unittest 3 | 4 | from src.spider.spiders import SpiderYunDaiLiIp 5 | 6 | 7 | class TestSpiderXiciIp(unittest.TestCase): 8 | 9 | def setUp(self) -> None: 10 | self._spider = SpiderYunDaiLiIp() 11 | 12 | def test_crawl(self): 13 | result = asyncio.run(self._spider.crawl()) 14 | assert result 15 | assert len(result) > 0 16 | -------------------------------------------------------------------------------- /test/spider/test_spider_kuai_dai_li_ip.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import unittest 3 | 4 | from src.spider.spiders import SpiderKuaiDaiLiIp 5 | 6 | 7 | class TestSpiderKuaiDaiLiIp(unittest.TestCase): 8 | 9 | def setUp(self) -> None: 10 | self._spider = SpiderKuaiDaiLiIp() 11 | 12 | def test_crawl(self): 13 | result = asyncio.run(self._spider.crawl()) 14 | assert result 15 | assert len(result) > 0 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | MAINTAINER cwjokaka 3 | RUN mkdir /code 4 | WORKDIR /code 5 | COPY requirements.txt /code/ 6 | RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple/ 7 | COPY . /code/ 8 | #WORKDIR /proxy_app 9 | #COPY . ./ 10 | #RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple/ 11 | EXPOSE 8080 12 | #CMD ["python", "main.py"] 13 | -------------------------------------------------------------------------------- /test/spider/test_spider_mian_fei_dai_li_ip.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import unittest 3 | 4 | from src.spider.spiders import SpiderMianFeiDaiLiIp 5 | 6 | 7 | class TestSpiderMianFeiDaiLiIp(unittest.TestCase): 8 | 9 | def setUp(self) -> None: 10 | self._spider = SpiderMianFeiDaiLiIp() 11 | 12 | def test_crawl(self): 13 | result = asyncio.run(self._spider.crawl()) 14 | assert result 15 | assert len(result) > 0 16 | -------------------------------------------------------------------------------- /test/validator/test_anonymity_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from src.database.sqlite_opt import sqlite_opt 4 | from src.validator.anonymity_validator import anonymity_validator 5 | 6 | 7 | class TestAnonymityValidator(unittest.TestCase): 8 | 9 | def setUp(self) -> None: 10 | self._opt = sqlite_opt 11 | self._validator = anonymity_validator 12 | 13 | # self._opt.clean() 14 | 15 | def test_valid_proxy(self): 16 | self._validator.run() 17 | pass 18 | 19 | -------------------------------------------------------------------------------- /src/log/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_logger(): 5 | """ 6 | 创建日志单例 7 | """ 8 | formatter = logging.Formatter("%(asctime)s %(name)s:%(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S") 9 | logger = logging.getLogger("monitor") 10 | logger.setLevel(logging.INFO) 11 | handler_stream = logging.StreamHandler() 12 | handler_stream.setLevel(logging.INFO) 13 | handler_stream.setFormatter(formatter) 14 | handler_error = logging.FileHandler(filename="error.log", encoding="utf-8") 15 | 
handler_error.setLevel(logging.ERROR) 16 | handler_error.setFormatter(formatter) 17 | logger.addHandler(handler_stream) 18 | logger.addHandler(handler_error) 19 | return logger 20 | 21 | 22 | logger = get_logger() 23 | -------------------------------------------------------------------------------- /src/database/abs_database.py: -------------------------------------------------------------------------------- 1 | class AbsDatabase(object): 2 | 3 | def add_proxy(self, proxy): 4 | raise NotImplementedError 5 | 6 | def get_all_proxies(self): 7 | raise NotImplementedError 8 | 9 | def get_unknown_anonymity_proxies(self): 10 | raise NotImplementedError 11 | 12 | def increase_reliability(self, url): 13 | raise NotImplementedError 14 | 15 | def reduce_reliability(self, url): 16 | raise NotImplementedError 17 | 18 | def update_anonymity(self, url, value): 19 | raise NotImplementedError 20 | 21 | def remove(self, key): 22 | raise NotImplementedError 23 | 24 | def remove_all_zero_reliability(self): 25 | raise NotImplementedError 26 | 27 | def init_db(self): 28 | return 29 | -------------------------------------------------------------------------------- /test/validator/test_validator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from src.database.sqlite_opt import sqlite_opt 4 | from src.entity.proxy_entity import ProxyEntity 5 | from src.enum.common import ProxyTypeEnum 6 | from src.validator.validator import validator 7 | 8 | 9 | class TestValidator(unittest.TestCase): 10 | 11 | def setUp(self) -> None: 12 | self._opt = sqlite_opt 13 | self._validator = validator 14 | # self._opt.init_db() 15 | # proxy = ProxyEntity('127.0.0.1', '8080', source='66ip网', supplier='中国电信', proxy_type=ProxyTypeEnum.HTTPS.value) 16 | # assert self._opt.add_proxy(proxy) == 1, '插入proxy表失败' 17 | # proxy = ProxyEntity('127.0.0.2', '8081', source='66ip网', supplier='中国电信', proxy_type=ProxyTypeEnum.HTTPS.value) 18 | # assert self._opt.add_proxy(proxy) == 1, '插入proxy表失败' 19 | 20 | # self._opt.clean() 21 | 22 | def test_valid_proxy(self): 23 | self._validator.run() 24 | pass 25 | 26 | -------------------------------------------------------------------------------- /test/database/test_sqlite_opt.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from src.database.sqlite_opt import sqlite_opt 4 | from src.entity.proxy_entity import ProxyEntity 5 | from src.enum.common import ProxyTypeEnum 6 | 7 | 8 | class TestSqliteOpt(unittest.TestCase): 9 | 10 | def setUp(self) -> None: 11 | self._opt = sqlite_opt 12 | self._opt.init_db() 13 | # self._opt.clean() 14 | 15 | def test_add_proxy(self): 16 | proxy = ProxyEntity('127.0.0.1', '8080', source='66ip网', supplier='中国电信', proxy_type=ProxyTypeEnum.HTTPS.value) 17 | assert self._opt.add_proxy(proxy) == 1, '插入proxy表失败' 18 | proxy = ProxyEntity('127.0.0.2', '8081', source='66ip网', supplier='中国电信', proxy_type=ProxyTypeEnum.HTTPS.value) 19 | assert self._opt.add_proxy(proxy) == 1, '插入proxy表失败' 20 | 21 | def test_get_all_proxies(self): 22 | proxy_list = self._opt.get_all_proxies() 23 | assert len(proxy_list) > 0 24 | 25 | def test_remove_all_zero_reliability(self): 26 | self._opt.remove_all_zero_reliability() 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 cwjokaka 4 | 5 | Permission is hereby granted, 
free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/validator/validator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aiohttp 3 | 4 | from setting import VALIDATOR 5 | from src.database.sqlite_opt import sqlite_opt 6 | 7 | 8 | class Validator(object): 9 | 10 | def run(self): 11 | # 获取proxy列表 12 | proxy_list = sqlite_opt.get_all_proxies() 13 | if len(proxy_list) > 0: 14 | tasks = [self.valid_proxy(proxy.url) for proxy in proxy_list] 15 | asyncio.run(asyncio.wait(tasks)) 16 | 17 | async def valid_proxy(self, proxy_url): 18 | async with aiohttp.ClientSession() as session: 19 | try: 20 | async with session.get(VALIDATOR['test_url'], proxy=proxy_url, 21 | timeout=VALIDATOR['request_timeout']) as resp: 22 | if resp.status == 200: 23 | # print(f'{proxy_url}可靠') 24 | sqlite_opt.increase_reliability(proxy_url) 25 | else: 26 | # print(f'{proxy_url}不可靠') 27 | sqlite_opt.reduce_reliability(proxy_url) 28 | except: 29 | sqlite_opt.reduce_reliability(proxy_url) 30 | # print(f'{proxy_url}不可靠') 31 | 32 | 33 | validator = Validator() 34 | -------------------------------------------------------------------------------- /src/web/web_flask.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, jsonify 2 | 3 | from src.database.sqlite_opt import sqlite_opt 4 | 5 | app = Flask(__name__) 6 | 7 | 8 | @app.route('/') 9 | def index(): 10 | """主页 11 | """ 12 | return ''' 13 |

😘Welcome to Home Page😄 14 | 🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️🙆‍♂️🤷‍♀️ 15 | APIs: 16 | Get a usable proxy: 17 | /get 18 | Get all usable proxies: 19 | /get_all

20 | ''' 21 | 22 | 23 | @app.route('/get') 24 | def get_proxy(): 25 | """获取单个代理 26 | """ 27 | proxy = sqlite_opt.get_one_in_page() 28 | if proxy: 29 | return jsonify({ 30 | 'code': 200, 31 | 'proxy': proxy.url 32 | }) 33 | else: 34 | return jsonify({'code': 500, 'msg': 'server error'}) 35 | 36 | 37 | @app.route('/get_all') 38 | def get_all_proxy(): 39 | """获取全部(可用的)代理 40 | """ 41 | proxy_list = sqlite_opt.get_all_in_page() 42 | if proxy_list: 43 | return jsonify({ 44 | 'code': 200, 45 | 'proxies': [proxy.url for proxy in proxy_list] 46 | }) 47 | else: 48 | return jsonify({'code': 500, 'msg': 'server error'}) 49 | -------------------------------------------------------------------------------- /setting.py: -------------------------------------------------------------------------------- 1 | # 代理爬虫配置 2 | SPIDER = { 3 | 'crawl_interval': 120, # 爬取IP代理的间隔(秒) 4 | 'list': [ # 使用的代理爬虫(类名) 5 | 'Spider66Ip', 6 | 'SpiderQuanWangIp', 7 | 'SpiderXiciIp', 8 | 'SpiderKuaiDaiLiIp', 9 | 'SpiderYunDaiLiIp', 10 | 'SpiderIpHaiIp', 11 | 'SpiderMianFeiDaiLiIp' 12 | ] 13 | } 14 | 15 | # 校验器配置 16 | VALIDATOR = { 17 | 'test_url': 'http://www.baidu.com', # 可用校验url 18 | 'request_timeout': 4, # 校验超时时间 19 | 'validate_interval': 60 # 校验间隔(秒) 20 | } 21 | 22 | # 匿名性校验配置 23 | ANONYMITY_VALIDATOR = { 24 | 'http_test_url': 'http://httpbin.org/get', # 匿名校验url 25 | 'https_test_url': 'https://httpbin.org/get', 26 | 'request_timeout': 4, # 校验最大超时时间 27 | 'interval': 180 # 校验间隔(秒) 28 | } 29 | 30 | # 清除不可用代理配置 31 | EXPIRATION_VALIDATOR = { 32 | 'interval': 60 * 30 33 | } 34 | 35 | # 数据库配置 36 | DB = { 37 | 'db_name': 'proxy.db', 38 | 'table_name': 'proxy' 39 | } 40 | 41 | # WEB配置(Flask) 42 | WEB_SERVER = { 43 | 'host': '0.0.0.0', 44 | 'port': '8080' 45 | } 46 | 47 | # 爬虫请求头 48 | HEADERS = { 49 | "X-Requested-With": "XMLHttpRequest", 50 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 " 51 | "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36", 52 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea 107 | idea/ 108 | 109 | *.db 110 | test/*.db-journal -------------------------------------------------------------------------------- /src/runner.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import typing 3 | 4 | from apscheduler.schedulers.background import BackgroundScheduler 5 | 6 | from src.database.sqlite_opt import sqlite_opt 7 | from src.entity.proxy_entity import ProxyEntity 8 | from src.log.logger import logger 9 | from src.spider.spiders import spider_collection 10 | from setting import WEB_SERVER, VALIDATOR, SPIDER, ANONYMITY_VALIDATOR, EXPIRATION_VALIDATOR 11 | from src.validator.expiration_validator import expiration_validator 12 | from src.validator.validator import validator 13 | from src.validator.anonymity_validator import anonymity_validator 14 | from src.web.web_flask import app 15 | 16 | 17 | def crawl(): 18 | proxies = [] 19 | tasks = [] 20 | for spider_name in SPIDER['list']: 21 | tasks.append(spider_collection[spider_name].crawl()) 22 | loop = asyncio.new_event_loop() 23 | asyncio.set_event_loop(loop) 24 | results = loop.run_until_complete(asyncio.gather(*tasks)) 25 | loop.close() 26 | for proxies_list in results: 27 | proxies.extend(proxies_list) 28 | # proxies = loop.run_until_complete(asyncio.gather(*tasks)) 29 | # 持久化 30 | save(proxies) 31 | 32 | 33 | def save(proxies: typing.List[ProxyEntity]): 34 | for proxy in proxies: 35 | sqlite_opt.add_proxy(proxy) 36 | 37 | 38 | def run(): 39 | logger.info('初始化sqlite数据库...') 40 | sqlite_opt.init_db() 41 | scheduler = BackgroundScheduler() 42 | scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval']) 43 | scheduler.add_job(validator.run, 'interval', seconds=VALIDATOR['validate_interval']) 44 | scheduler.add_job(anonymity_validator.run, 'interval', seconds=ANONYMITY_VALIDATOR['interval']) 45 | scheduler.add_job(expiration_validator.run, 'interval', seconds=EXPIRATION_VALIDATOR['interval']) 46 | scheduler.start() 47 | app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port']) 48 | -------------------------------------------------------------------------------- /src/entity/proxy_entity.py: -------------------------------------------------------------------------------- 1 | from src.enum.common import ProxyTypeEnum, ProxyCoverEnum 2 | from sqlalchemy.ext.declarative import declarative_base 3 | from sqlalchemy import Column, 
Integer, String 4 | from setting import DB 5 | Base = declarative_base() 6 | 7 | 8 | class ProxyEntity(Base): 9 | __tablename__ = DB['table_name'] 10 | url = Column(String(36), primary_key=True) 11 | # ip = Column(String(20)) 12 | # port = Column(String(5)) 13 | source = Column(String(16)) 14 | # protocol = Column(String(5)) 15 | supplier = Column(String(16)) 16 | proxy_type = Column(Integer()) 17 | proxy_cover = Column(Integer()) 18 | check_count = Column(Integer()) 19 | region = Column(String(32)) 20 | last_check_time = Column(String(32)) 21 | reliability = Column(Integer()) 22 | 23 | """ 24 | ip代理对象 25 | :param url url地址 26 | :param ip ip地址 27 | :param port 端口 28 | :param protocol 协议 29 | :param source 代理源头网站名 30 | :param proxy_type 代理类型 {@link ProxyType} 31 | :param proxy_cover 代理隐蔽性 {@link CoverOfProxy} 32 | :param check_count 有效性检验的次数 33 | :param last_check_time 最后进行有效性检验的时间 34 | :param reliability 代理可靠性, 默认为5 35 | """ 36 | def __init__(self, url: str, 37 | # ip: str, 38 | # port: str, 39 | # protocol: str = 'http', 40 | source: str = 'unknown', 41 | supplier='unknown', 42 | proxy_type: int = ProxyTypeEnum.UNKNOWN.value, 43 | proxy_cover: int = ProxyCoverEnum.UNKNOWN.value, 44 | check_count=0, region='', last_check_time=None, reliability=5): 45 | self.url = url 46 | # self.ip = ip 47 | # self.port = port 48 | # self.protocol = protocol 49 | self.source = source 50 | self.supplier = supplier 51 | self.proxy_type = proxy_type 52 | self.proxy_cover = proxy_cover 53 | self.check_count = check_count 54 | self.region = region 55 | self.last_check_time = last_check_time 56 | self.reliability = reliability 57 | -------------------------------------------------------------------------------- /src/spider/abs_spider.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import List, Iterable 3 | 4 | import aiohttp 5 | 6 | from setting import HEADERS 7 | from src.entity.proxy_entity import ProxyEntity 8 | from src.log.logger import logger 9 | 10 | 11 | class AbsSpider(object): 12 | 13 | def __init__(self, name='unknown') -> None: 14 | self._name = name 15 | self._urls = self.get_urls() 16 | 17 | async def crawl(self): 18 | logger.info(f'{self._name}开始爬取...') 19 | res = [] 20 | for url in self._urls: 21 | try: 22 | for page in self.get_page_range(): 23 | async with aiohttp.ClientSession() as session: 24 | async with session.get(self.get_page_url(url, page), headers=HEADERS) as resp: 25 | resp.encoding = self.get_encoding() 26 | temp = self.do_crawl(await resp.text()) 27 | res.extend(temp) 28 | await asyncio.sleep(self.get_interval()) 29 | except Exception as e: 30 | logger.exception(f'{self._name}爬取失败url: {url}, :e:{e}') 31 | return res 32 | 33 | def do_crawl(self, resp: str) -> List[ProxyEntity]: 34 | """ 35 | 子类重写此方法解析网页内容 36 | :param resp: 返回内容字符串 37 | :return: 代理列表 38 | """ 39 | raise NotImplementedError 40 | 41 | def get_urls(self) -> List[str]: 42 | """ 43 | 子类从写此方法返回获取 44 | :return: 45 | """ 46 | raise NotImplementedError 47 | 48 | def get_page_range(self) -> Iterable: 49 | """ 50 | 默认只获取第一页内容 51 | :return: 52 | """ 53 | return range(1, 2) 54 | 55 | def get_page_url(self, url, page) -> str: 56 | """ 57 | 格式化页数url 58 | :param url: url 59 | :param page: 60 | :return: 61 | """ 62 | return f'{url}/{page}' 63 | 64 | def get_encoding(self): 65 | """ 66 | 默认页面编码 67 | :return: 68 | """ 69 | return 'utf-8' 70 | 71 | def get_interval(self) -> int: 72 | """ 73 | 代理网站爬取间隔(秒) 74 | :return: 75 | """ 76 | return 0 77 | 
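# Added summary (not part of the original module): AbsSpider.crawl() is a template method —
# it iterates get_urls() x get_page_range(), builds each request URL with get_page_url(),
# waits get_interval() seconds between pages, and hands every response body to do_crawl().
# Subclasses must implement do_crawl() and get_urls(); the remaining hooks keep the defaults
# shown above (first page only, '{url}/{page}' paging, utf-8 encoding, no delay).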
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ok_ip_proxy_pool😁 2 | 一个还ok的IP代理池,先做给自己用着~ 3 | 4 | 5 | 6 | ## 运行环境 7 | 8 | - python 3.7 9 | 10 | 11 | 12 | ## 特点 13 | 14 | - 异步爬取&验证代理🚀 15 | - 用权重加减来衡量代理的可用性(可用性:通过验证则+1,否则-1)🎭 16 | - 使用Sqlite,无需安装数据库环境🛴 17 | - 目前支持的免费代理有: 免费代理/全网/66/西刺/快代理/云代理/IP海 18 | 19 | 20 | 21 | ## 下载&安装 22 | 23 | - 源码下载: 24 | 25 | ``` 26 | git clone git@github.com:cwjokaka/ok_ip_proxy_pool.git 27 | ``` 28 | 29 | 30 | 31 | - 安装依赖: 32 | 33 | ``` 34 | pip install -r requirements.txt 35 | ``` 36 | 37 | 38 | 39 | ## 配置文件 40 | ```python 41 | # 代理爬虫配置 42 | SPIDER = { 43 | 'crawl_interval': 120, # 爬取IP代理的间隔(秒) 44 | 'list': [ # 使用的代理爬虫(类名) 45 | 'Spider66Ip', 46 | 'SpiderQuanWangIp', 47 | 'SpiderXiciIp', 48 | 'SpiderKuaiDaiLiIp', 49 | 'SpiderYunDaiLiIp', 50 | 'SpiderIpHaiIp', 51 | 'SpiderMianFeiDaiLiIp' 52 | ] 53 | } 54 | 55 | # 校验器配置 56 | VALIDATOR = { 57 | 'test_url': 'http://www.baidu.com', # 可用校验url 58 | 'request_timeout': 4, # 校验超时时间 59 | 'validate_interval': 60 # 校验间隔(秒) 60 | } 61 | 62 | # 匿名性校验配置 63 | ANONYMITY_VALIDATOR = { 64 | 'http_test_url': 'http://httpbin.org/get', # 匿名校验url 65 | 'https_test_url': 'https://httpbin.org/get', 66 | 'request_timeout': 4, # 校验最大超时时间 67 | 'interval': 180 # 校验间隔(秒) 68 | } 69 | 70 | # 数据库配置 71 | DB = { 72 | 'db_name': 'proxy.db', 73 | 'table_name': 'proxy' 74 | } 75 | 76 | # WEB配置(Flask) 77 | WEB_SERVER = { 78 | 'host': '0.0.0.0', 79 | 'port': '8080' 80 | } 81 | 82 | # 爬虫请求头 83 | HEADERS = { 84 | "X-Requested-With": "XMLHttpRequest", 85 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 " 86 | "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36", 87 | } 88 | ``` 89 | 90 | 91 | 92 | 93 | 94 | ## 运行 95 | ``` 96 | python main.py 97 | ``` 98 | 99 | 100 | 101 | 102 | 103 | ## API使用 104 | 105 | | API | method | description | 106 | | :------: | :----: | :----------: | 107 | | / | GET | 首页介绍 | 108 | | /get | GET | 获取一个代理 | 109 | | /get_all | GET | 获取所有代理 | 110 | 111 | 112 | 113 | ## 代理爬虫扩展 114 | 如果需要添加自定义代理爬虫,可通过以下步骤添加: 115 | 116 | 1. 进入src/spider/spiders.py 117 | 2. 添加自己的爬虫类,继承AbsSpider,实现它的do_crawl & get_page_range & get_urls方法,按需重写其他方法。 118 | 3. 用@spider_register修饰此类 119 | 4. 
在配置文件setting.py的SPIDER['list']中添加此类名 120 | 121 | 122 | 123 | ## LAST 124 | 125 | 欢迎Fork|Star|Issue 三连😘 126 | -------------------------------------------------------------------------------- /src/validator/anonymity_validator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | 4 | import aiohttp 5 | 6 | from setting import ANONYMITY_VALIDATOR, HEADERS 7 | from src.database.sqlite_opt import sqlite_opt 8 | from src.enum.common import ProxyCoverEnum, ProxyTypeEnum 9 | from src.log.logger import logger 10 | 11 | 12 | class AnonymityValidator(object): 13 | 14 | urls = { 15 | ProxyTypeEnum.UNKNOWN.value: ANONYMITY_VALIDATOR['http_test_url'], 16 | ProxyTypeEnum.HTTP.value: ANONYMITY_VALIDATOR['http_test_url'], 17 | ProxyTypeEnum.HTTPS.value: ANONYMITY_VALIDATOR['https_test_url'], 18 | ProxyTypeEnum.HTTP_AND_HTTPS.value: ANONYMITY_VALIDATOR['https_test_url'], 19 | } 20 | 21 | def run(self): 22 | # 获取proxy列表 23 | proxy_list = sqlite_opt.get_unknown_anonymity_proxies() 24 | if len(proxy_list) > 0: 25 | tasks = [self.valid_proxy(proxy.url, proxy.proxy_type) for proxy in proxy_list] 26 | asyncio.run(asyncio.wait(tasks)) 27 | 28 | async def valid_proxy(self, proxy_url, proxy_type): 29 | async with aiohttp.ClientSession() as session: 30 | try: 31 | async with session.get(self.urls[proxy_type], 32 | proxy=proxy_url, 33 | headers=HEADERS, 34 | timeout=ANONYMITY_VALIDATOR['request_timeout']) as resp: 35 | if resp.status == 200: 36 | # 检验其匿名性 37 | r_dict = json.loads(await resp.text()) 38 | headers = r_dict.get('headers', '') 39 | ip = r_dict.get('origin') 40 | proxy_connection = headers.get('Proxy-Connection', None) 41 | flag = True 42 | if ',' in ip: 43 | ips = str.split(ip, ',') 44 | first = ips[0] 45 | for p in ips: 46 | if first != p.lstrip(): 47 | proxy_cover = ProxyCoverEnum.TRANSPARENT.value # 透明 48 | flag = False 49 | break 50 | if flag: 51 | if proxy_connection: 52 | proxy_cover = ProxyCoverEnum.NORMAL_COVER.value # 普匿 53 | else: 54 | proxy_cover = ProxyCoverEnum.HIGH_COVER.value # 高匿 55 | # 更新匿名性 56 | sqlite_opt.update_anonymity(proxy_url, proxy_cover) 57 | logger.info(f'验证匿名性成功: url:{proxy_url}, coverValue:{proxy_cover}') 58 | else: 59 | logger.warn(f'验证匿名性失败, proxy_url:{proxy_url}, 返回码:{resp.status}') 60 | except asyncio.TimeoutError: 61 | logger.warn(f'验证匿名性请求超时, proxy_url:{proxy_url}') 62 | except ConnectionRefusedError: 63 | logger.warn(f'验证匿名性请求被拒绝, proxy_url:{proxy_url}') 64 | except Exception as e: 65 | # logger.exception(e) 66 | logger.warn(f'验证匿名性失败, proxy_url:{proxy_url}, e:{e}') 67 | 68 | 69 | anonymity_validator = AnonymityValidator() 70 | -------------------------------------------------------------------------------- /src/database/sqlite_opt.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.exc import IntegrityError 2 | 3 | from setting import DB 4 | from src.database.abs_database import AbsDatabase 5 | from sqlalchemy import create_engine, desc 6 | from sqlalchemy.orm import sessionmaker 7 | from src.entity.proxy_entity import ProxyEntity 8 | from src.enum.common import ProxyCoverEnum 9 | from src.log.logger import logger 10 | import sqlite3 11 | 12 | class SqliteOpt(AbsDatabase): 13 | 14 | def __init__(self) -> None: 15 | engine = create_engine(f'sqlite:///{DB["db_name"]}?check_same_thread=False', echo=True) 16 | self._DBSession = sessionmaker(bind=engine) 17 | 18 | def add_proxy(self, proxy): 19 | session = self._DBSession() 20 | session.add(proxy) 21 | 
result = 0 22 | # 提交即保存到数据库: 23 | try: 24 | session.commit() 25 | result = 1 26 | except IntegrityError as e: 27 | logger.info(f'ip: {proxy.url} 已存在') 28 | finally: 29 | # 关闭session: 30 | session.close() 31 | return result 32 | 33 | def get_all_proxies(self): 34 | session = self._DBSession() 35 | try: 36 | return session.query(ProxyEntity).all() 37 | except Exception as e: 38 | logger.exception(e) 39 | finally: 40 | session.close() 41 | return [] 42 | 43 | def get_unknown_anonymity_proxies(self): 44 | session = self._DBSession() 45 | try: 46 | return (session.query(ProxyEntity) 47 | .filter(ProxyEntity.reliability > 0) 48 | .filter(ProxyEntity.proxy_cover == ProxyCoverEnum.UNKNOWN.value) 49 | .all()) 50 | except Exception as e: 51 | logger.exception(e) 52 | finally: 53 | session.close() 54 | return [] 55 | 56 | def increase_reliability(self, url): 57 | conn = self._get_connect() 58 | cursor = conn.cursor() 59 | try: 60 | cursor.execute(f""" 61 | UPDATE {DB["table_name"]} SET reliability = reliability + 1, 62 | last_check_time=datetime(CURRENT_TIMESTAMP,'localtime'), 63 | check_count = check_count + 1 64 | WHERE url='{url}' 65 | """) 66 | conn.commit() 67 | except Exception as e: 68 | pass 69 | # logger.exception(e) 70 | finally: 71 | cursor.close() 72 | conn.close() 73 | 74 | def reduce_reliability(self, url): 75 | conn = self._get_connect() 76 | cursor = conn.cursor() 77 | try: 78 | cursor.execute(f""" 79 | UPDATE {DB["table_name"]} SET reliability = reliability - 1, 80 | last_check_time=datetime(CURRENT_TIMESTAMP, 'localtime'), 81 | check_count = check_count + 1 82 | WHERE url='{url}' 83 | """) 84 | conn.commit() 85 | except Exception as e: 86 | pass 87 | # logger.exception(e) 88 | finally: 89 | cursor.close() 90 | conn.close() 91 | 92 | def remove(self, key): 93 | return super().remove(key) 94 | 95 | def update_anonymity(self, url, value): 96 | conn = self._get_connect() 97 | cursor = conn.cursor() 98 | try: 99 | cursor.execute(f""" 100 | UPDATE {DB["table_name"]} SET proxy_cover = {value} 101 | WHERE url='{url}' 102 | """) 103 | conn.commit() 104 | except Exception as e: 105 | logger.exception(e) 106 | finally: 107 | cursor.close() 108 | conn.close() 109 | 110 | def init_db(self): 111 | conn = self._get_connect() 112 | cursor = conn.cursor() 113 | try: 114 | cursor.execute(f""" 115 | create table {DB["table_name"]}( 116 | url varchar(36) not null, 117 | source varchar(16), 118 | supplier varchar(32), 119 | proxy_type tinyint(3), 120 | proxy_cover tinyint(3), 121 | check_count int(10), 122 | region varchar(36), 123 | last_check_time text, 124 | create_time text default (datetime(CURRENT_TIMESTAMP,'localtime')), 125 | reliability integer not null default 0 check(reliability >= 0) check(reliability <= 15), 126 | PRIMARY KEY ("url") 127 | ) 128 | """) 129 | except sqlite3.OperationalError as e: 130 | logger.warn(e) 131 | finally: 132 | cursor.close() 133 | conn.close() 134 | 135 | def clean(self): 136 | conn = self._get_connect() 137 | cursor = conn.cursor() 138 | try: 139 | cursor.execute(f'DELETE FROM {DB["table_name"]}') 140 | conn.commit() 141 | finally: 142 | cursor.close() 143 | conn.close() 144 | 145 | def get_one_in_page(self): 146 | session = self._DBSession() 147 | try: 148 | return session.query(ProxyEntity).order_by(desc(ProxyEntity.reliability)).first() 149 | except Exception as e: 150 | logger.exception(e) 151 | finally: 152 | session.close() 153 | return None 154 | 155 | def get_all_in_page(self): 156 | session = self._DBSession() 157 | try: 158 | return 
session.query(ProxyEntity).filter(ProxyEntity.reliability > 0).all() 159 | except Exception as e: 160 | logger.exception(e) 161 | finally: 162 | session.close() 163 | return None 164 | 165 | def remove_all_zero_reliability(self): 166 | conn = self._get_connect() 167 | cursor = conn.cursor() 168 | try: 169 | cursor.execute(f""" 170 | DELETE FROM {DB["table_name"]} 171 | WHERE reliability = 0 172 | """) 173 | conn.commit() 174 | except sqlite3.OperationalError as e: 175 | logger.warn(e) 176 | finally: 177 | cursor.close() 178 | conn.close() 179 | 180 | @staticmethod 181 | def _get_connect(): 182 | return sqlite3.connect(DB['db_name']) 183 | 184 | 185 | sqlite_opt = SqliteOpt() 186 | -------------------------------------------------------------------------------- /src/spider/spiders.py: -------------------------------------------------------------------------------- 1 | from typing import List, Iterable 2 | from src.entity.proxy_entity import ProxyEntity 3 | from src.enum.common import ProxyCoverEnum, ProxyTypeEnum 4 | from src.log.logger import logger 5 | from src.spider.abs_spider import AbsSpider 6 | from bs4 import BeautifulSoup, Tag 7 | 8 | 9 | spider_collection = {} 10 | 11 | 12 | def spider_register(cls): 13 | spider_collection.update({cls.__name__: cls()}) 14 | logger.info(f'注册{cls.__name__}') 15 | return cls 16 | 17 | 18 | @spider_register 19 | class Spider66Ip(AbsSpider): 20 | """ 21 | 66IP代理爬虫 刷新速度:🐌慢 22 | http://www.66ip.cn/ 23 | """ 24 | def __init__(self) -> None: 25 | super().__init__('66IP代理爬虫') 26 | 27 | def do_crawl(self, resp) -> List[ProxyEntity]: 28 | result = [] 29 | soup = BeautifulSoup(resp, 'lxml') 30 | tr_list = soup.find('table', attrs={'width': '100%', 'bordercolor': '#6699ff'}).find_all('tr') 31 | for i, tr in enumerate(tr_list): 32 | if i == 0: 33 | continue 34 | contents = tr.contents 35 | ip = contents[0].text 36 | port = contents[1].text 37 | region = contents[2].text 38 | proxy_cover = contents[3].text 39 | result.append(ProxyEntity(f'http://{ip}:{port}', 40 | source=self._name, 41 | proxy_cover=self._judge_proxy_cover(proxy_cover), 42 | region=region)) 43 | return result 44 | 45 | def get_urls(self) -> List[str]: 46 | return ['http://www.66ip.cn'] 47 | 48 | def get_page_range(self) -> Iterable: 49 | return range(1, 6) 50 | 51 | def get_page_url(self, url, page) -> str: 52 | return f'{url}/{page}.html' 53 | 54 | def get_encoding(self) -> str: 55 | return 'gb2312' 56 | 57 | @staticmethod 58 | def _judge_proxy_cover(cover_str: str): 59 | if cover_str == '高匿代理': 60 | return ProxyCoverEnum.HIGH_COVER.value 61 | else: 62 | return ProxyCoverEnum.UNKNOWN.value 63 | 64 | 65 | @spider_register 66 | class SpiderQuanWangIp(AbsSpider): 67 | """ 68 | 全网IP代理爬虫 刷新速度:极快 69 | http://www.goubanjia.com/ 70 | """ 71 | def __init__(self) -> None: 72 | super().__init__('全网IP代理爬虫') 73 | 74 | def do_crawl(self, resp) -> List[ProxyEntity]: 75 | result = [] 76 | soup = BeautifulSoup(resp, 'lxml') 77 | tr_list = soup.find('tbody').find_all('tr') 78 | for i, tr in enumerate(tr_list): 79 | tds = tr.find_all('td') 80 | id_and_port = tds[0] 81 | ip, port = self._parse_ip_and_port(id_and_port) 82 | proxy_cover = tds[1].text 83 | proxy_type = tds[2].text 84 | region = tds[3].contents[1].text 85 | supplier = tds[4].text 86 | result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}', 87 | source=self._name, 88 | supplier=supplier, 89 | proxy_type=self._judge_proxy_type(proxy_type), 90 | proxy_cover=self._judge_proxy_cover(proxy_cover), 91 | region=region 92 | ) 93 | ) 94 | return result 
95 | 96 | def get_urls(self) -> List[str]: 97 | return ['http://www.goubanjia.com'] 98 | 99 | def get_page_url(self, url, page) -> str: 100 | return url 101 | 102 | def _parse_ip_and_port(self, ip_td: Tag): 103 | res = [] 104 | contents = ip_td.find_all(['div', 'span']) 105 | for content in contents: 106 | res.append(content.text) 107 | res.pop() 108 | ip = ''.join(res) 109 | 110 | port_tag = contents[-1] 111 | port_ori_str = port_tag.get('class')[1] 112 | # 解码真实的端口 113 | port = 0 114 | for c in port_ori_str: 115 | port *= 10 116 | port += (ord(c) - ord('A')) 117 | port /= 8 118 | port = int(port) 119 | return ip, str(port) 120 | 121 | def _judge_proxy_type(self, type_str: str): 122 | type_low = type_str.lower() 123 | if type_low == 'http': 124 | return ProxyTypeEnum.HTTP.value 125 | elif type_low == 'https': 126 | return ProxyTypeEnum.HTTPS.value 127 | else: 128 | return ProxyTypeEnum.UNKNOWN.value 129 | 130 | def _judge_proxy_cover(self, cover_str: str): 131 | if cover_str == '透明': 132 | return ProxyCoverEnum.TRANSPARENT.value 133 | elif cover_str == '高匿': 134 | return ProxyCoverEnum.HIGH_COVER.value 135 | else: 136 | return ProxyCoverEnum.UNKNOWN.value 137 | 138 | 139 | @spider_register 140 | class SpiderXiciIp(AbsSpider): 141 | """ 142 | 西刺代理爬虫 刷新速度:🐌慢 143 | 基本上没几个代理个能用🆒 144 | https://www.xicidaili.com/ 145 | """ 146 | def __init__(self) -> None: 147 | super().__init__('西刺IP代理爬虫') 148 | 149 | def do_crawl(self, resp) -> List[ProxyEntity]: 150 | result = [] 151 | soup = BeautifulSoup(resp, 'lxml') 152 | tab = soup.find('table', attrs={'id': 'ip_list'}) 153 | if tab is None: 154 | return [] 155 | tr_list = tab.find_all('tr')[1: -1] 156 | for tr in tr_list: 157 | tds = tr.find_all('td') 158 | ip = tds[1].text 159 | port = tds[2].text 160 | proxy_cover = tds[4].text 161 | proxy_type = tds[5].text 162 | result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}', 163 | source=self._name, 164 | proxy_cover=self._judge_proxy_cover(proxy_cover), 165 | proxy_type=self._judge_proxy_type(proxy_type), 166 | )) 167 | return result 168 | 169 | def get_urls(self) -> List[str]: 170 | return [ 171 | 'https://www.xicidaili.com/nn', # 高匿 172 | 'https://www.xicidaili.com/nt' # 透明 173 | ] 174 | 175 | def get_page_range(self) -> Iterable: 176 | return range(1, 3) 177 | 178 | @staticmethod 179 | def _judge_proxy_cover(cover_str: str): 180 | if cover_str == '高匿': 181 | return ProxyCoverEnum.HIGH_COVER.value 182 | if cover_str == '透明': 183 | return ProxyCoverEnum.TRANSPARENT.value 184 | else: 185 | return ProxyCoverEnum.UNKNOWN.value 186 | 187 | @staticmethod 188 | def _judge_proxy_type(type_str: str): 189 | if type_str == 'HTTPS': 190 | return ProxyTypeEnum.HTTPS.value 191 | if type_str == 'HTTP': 192 | return ProxyTypeEnum.HTTP.value 193 | else: 194 | return ProxyTypeEnum.UNKNOWN.value 195 | 196 | 197 | @spider_register 198 | class SpiderKuaiDaiLiIp(AbsSpider): 199 | """ 200 | 快代理IP 刷新速度: 极快 201 | https://www.kuaidaili.com/free 202 | """ 203 | def __init__(self) -> None: 204 | super().__init__('快代理IP代理爬虫') 205 | 206 | def do_crawl(self, resp) -> List[ProxyEntity]: 207 | result = [] 208 | soup = BeautifulSoup(resp, 'lxml') 209 | trs = soup.find('table').find('tbody').find_all('tr') 210 | for tr in trs: 211 | tds = tr.find_all('td') 212 | ip = tds[0].text 213 | port = tds[1].text 214 | proxy_cover = tds[2].text 215 | proxy_type = tds[3].text 216 | region = tds[4].text 217 | result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}', 218 | # ip, port, protocol=proxy_type.lower(), 219 | 
source=self._name, 220 | proxy_type=self._judge_proxy_type(proxy_type), 221 | proxy_cover=self._judge_proxy_cover(proxy_cover), 222 | region=region)) 223 | return result 224 | 225 | def get_urls(self) -> List[str]: 226 | return [ 227 | 'https://www.kuaidaili.com/free/inha', # 高匿 228 | 'https://www.kuaidaili.com/free/intr' # 透明 229 | ] 230 | 231 | def get_page_range(self) -> Iterable: 232 | return range(1, 3) 233 | 234 | # 爬太快会被封 235 | def get_interval(self) -> int: 236 | return 3 237 | 238 | def _judge_proxy_type(self, type_str: str): 239 | type_low = type_str.lower() 240 | if type_low == 'http': 241 | return ProxyTypeEnum.HTTP.value 242 | elif type_low == 'https': 243 | return ProxyTypeEnum.HTTPS.value 244 | else: 245 | return ProxyTypeEnum.UNKNOWN.value 246 | 247 | def _judge_proxy_cover(self, cover_str: str): 248 | if cover_str == '透明': 249 | return ProxyCoverEnum.TRANSPARENT.value 250 | elif cover_str == '高匿名': 251 | return ProxyCoverEnum.HIGH_COVER.value 252 | else: 253 | return ProxyCoverEnum.UNKNOWN.value 254 | 255 | 256 | @spider_register 257 | class SpiderYunDaiLiIp(AbsSpider): 258 | """ 259 | 云代理IP 刷新速度: 快 260 | http://www.ip3366.net/free 261 | """ 262 | def __init__(self) -> None: 263 | super().__init__('云代理IP爬虫') 264 | 265 | def do_crawl(self, resp) -> List[ProxyEntity]: 266 | result = [] 267 | soup = BeautifulSoup(resp, 'lxml') 268 | trs = soup.find('table').find('tbody').find_all('tr') 269 | for tr in trs: 270 | tds = tr.find_all('td') 271 | ip = tds[0].text 272 | port = tds[1].text 273 | proxy_cover = tds[2].text 274 | proxy_type = tds[3].text 275 | region = tds[4].text 276 | result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}', 277 | source=self._name, 278 | proxy_type=self._judge_proxy_type(proxy_type), 279 | proxy_cover=self._judge_proxy_cover(proxy_cover), 280 | region=region)) 281 | return result 282 | 283 | def get_urls(self) -> List[str]: 284 | return [ 285 | 'http://www.ip3366.net/free/?stype=1', # 高匿 286 | 'http://www.ip3366.net/free/?stype=2' # 透明 or 普匿 287 | ] 288 | 289 | def get_page_range(self) -> Iterable: 290 | return range(1, 3) 291 | 292 | def get_page_url(self, url, page) -> str: 293 | return f'{url}&page={page}' 294 | 295 | 296 | def _judge_proxy_type(self, type_str: str): 297 | type_low = type_str.lower() 298 | if type_low == 'http': 299 | return ProxyTypeEnum.HTTP.value 300 | elif type_low == 'https': 301 | return ProxyTypeEnum.HTTPS.value 302 | else: 303 | return ProxyTypeEnum.UNKNOWN.value 304 | 305 | def _judge_proxy_cover(self, cover_str: str): 306 | if cover_str == '透明代理IP': 307 | return ProxyCoverEnum.TRANSPARENT.value 308 | elif cover_str == '高匿代理IP': 309 | return ProxyCoverEnum.HIGH_COVER.value 310 | elif cover_str == '普通代理IP': 311 | return ProxyCoverEnum.NORMAL_COVER.value 312 | else: 313 | return ProxyCoverEnum.UNKNOWN.value 314 | 315 | 316 | @spider_register 317 | class SpiderIpHaiIp(AbsSpider): 318 | """ 319 | IP海代理IP 刷新速度: 8分钟/1个 320 | 有时会连不上 321 | http://www.iphai.com 322 | """ 323 | def __init__(self) -> None: 324 | super().__init__('IP海代理IP爬虫') 325 | 326 | def do_crawl(self, resp) -> List[ProxyEntity]: 327 | result = [] 328 | soup = BeautifulSoup(resp, 'lxml') 329 | table = soup.find('table') 330 | if table is None: 331 | return [] 332 | tbody = soup.find('tbody') 333 | if tbody is None: 334 | return [] 335 | trs = tbody.find_all('tr') 336 | for i, tr in enumerate(trs): 337 | if i == 0: 338 | continue 339 | tds = tr.find_all('td') 340 | ip = tds[0].text 341 | port = tds[1].text 342 | proxy_cover = tds[2].text 343 | proxy_type = 
tds[3].text if tds[3].text != '' else 'http' 344 | region = tds[4].text 345 | result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}', 346 | source=self._name, 347 | proxy_type=self._judge_proxy_type(proxy_type), 348 | proxy_cover=self._judge_proxy_cover(proxy_cover), 349 | region=region)) 350 | return result 351 | 352 | def get_urls(self) -> List[str]: 353 | return [ 354 | 'http://www.iphai.com/free/ng', # 国内高匿 355 | 'http://www.iphai.com/free/np', # 国内普通 356 | 'http://www.iphai.com/free/wg', # 国外高匿 357 | 'http://www.iphai.com/free/wp', # 国外普通 358 | ] 359 | 360 | # 爬太快会被封 361 | def get_interval(self) -> int: 362 | return 2 363 | 364 | def get_page_url(self, url, page) -> str: 365 | return url 366 | 367 | @staticmethod 368 | def _judge_proxy_type(type_str: str): 369 | type_low = type_str.lower() 370 | if type_low == 'http': 371 | return ProxyTypeEnum.HTTP.value 372 | elif type_low == 'https': 373 | return ProxyTypeEnum.HTTPS.value 374 | else: 375 | return ProxyTypeEnum.UNKNOWN.value 376 | 377 | @staticmethod 378 | def _judge_proxy_cover(cover_str: str): 379 | if cover_str == '透明': 380 | return ProxyCoverEnum.TRANSPARENT.value 381 | elif cover_str == '高匿': 382 | return ProxyCoverEnum.HIGH_COVER.value 383 | elif cover_str == '普匿': 384 | return ProxyCoverEnum.NORMAL_COVER.value 385 | else: 386 | return ProxyCoverEnum.UNKNOWN.value 387 | 388 | 389 | @spider_register 390 | class SpiderMianFeiDaiLiIp(AbsSpider): 391 | """ 392 | 免费代理IP库 393 | http://ip.jiangxianli.com/ 394 | """ 395 | def __init__(self) -> None: 396 | super().__init__('免费代理IP爬虫') 397 | 398 | def do_crawl(self, resp) -> List[ProxyEntity]: 399 | result = [] 400 | soup = BeautifulSoup(resp, 'lxml') 401 | table = soup.find('table') 402 | if table is None: 403 | return [] 404 | tbody = soup.find('tbody') 405 | if tbody is None: 406 | return [] 407 | trs = tbody.find_all('tr') 408 | for i, tr in enumerate(trs): 409 | if i == 0: 410 | continue 411 | tds = tr.find_all('td') 412 | ip = tds[0].text 413 | port = tds[1].text 414 | proxy_cover = tds[2].text 415 | proxy_type = tds[3].text if tds[3].text != '' else 'http' 416 | region = tds[5].text 417 | supplier = tds[6].text 418 | result.append(ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}', 419 | source=self._name, 420 | supplier=supplier, 421 | proxy_type=self._judge_proxy_type(proxy_type), 422 | proxy_cover=self._judge_proxy_cover(proxy_cover), 423 | region=region)) 424 | return result 425 | 426 | def get_interval(self) -> int: 427 | return 2 428 | 429 | def get_page_range(self) -> Iterable: 430 | return range(1, 4) 431 | 432 | def get_urls(self) -> List[str]: 433 | return ['http://ip.jiangxianli.com/?page={}'] 434 | 435 | def get_page_url(self, url, page) -> str: 436 | return url.format(page) 437 | 438 | @staticmethod 439 | def _judge_proxy_type(type_str: str): 440 | type_low = type_str.lower() 441 | if type_low == 'http': 442 | return ProxyTypeEnum.HTTP.value 443 | elif type_low == 'https': 444 | return ProxyTypeEnum.HTTPS.value 445 | else: 446 | return ProxyTypeEnum.UNKNOWN.value 447 | 448 | @staticmethod 449 | def _judge_proxy_cover(cover_str: str): 450 | if cover_str == '透明': 451 | return ProxyCoverEnum.TRANSPARENT.value 452 | elif cover_str == '高匿': 453 | return ProxyCoverEnum.HIGH_COVER.value 454 | elif cover_str == '普匿': 455 | return ProxyCoverEnum.NORMAL_COVER.value 456 | else: 457 | return ProxyCoverEnum.UNKNOWN.value 458 | --------------------------------------------------------------------------------
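The README's 代理爬虫扩展 section describes adding a spider in four steps (subclass AbsSpider inside src/spider/spiders.py, implement its hooks, decorate it with @spider_register, and list the class name in SPIDER['list'] in setting.py). Below is a hedged sketch of such a subclass — the source site, URL, paging scheme, and table layout are hypothetical placeholders, and it is shown as a standalone snippet only for readability:

```python
# Illustrative sketch only: a hypothetical proxy source following the README extension steps.
# In the real project this class would live in src/spider/spiders.py (where spider_register
# and spider_collection are defined) and 'SpiderExampleIp' would be added to SPIDER['list'].
from typing import Iterable, List

from bs4 import BeautifulSoup

from src.entity.proxy_entity import ProxyEntity
from src.spider.abs_spider import AbsSpider
from src.spider.spiders import spider_register


@spider_register
class SpiderExampleIp(AbsSpider):
    """Hypothetical example spider, not a real proxy source."""

    def __init__(self) -> None:
        super().__init__('Example proxy spider')

    def get_urls(self) -> List[str]:
        return ['http://proxy.example.com/free']   # placeholder listing URL

    def get_page_range(self) -> Iterable:
        return range(1, 3)                         # pages 1..2

    def get_page_url(self, url, page) -> str:
        return f'{url}?page={page}'                # placeholder paging scheme

    def do_crawl(self, resp: str) -> List[ProxyEntity]:
        # Parse a hypothetical <table><tr><td>ip</td><td>port</td>... layout.
        result = []
        soup = BeautifulSoup(resp, 'lxml')
        table = soup.find('table')
        if table is None:
            return result
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) < 2:
                continue
            ip, port = tds[0].text.strip(), tds[1].text.strip()
            result.append(ProxyEntity(f'http://{ip}:{port}', source=self._name))
        return result
```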