├── proxy_spider ├── __init__.py ├── spiders │ ├── __init__.py │ └── proxy_spider.py ├── pipelines.py ├── items.py ├── settings.py └── middlewares.py ├── requirements.txt ├── Dockerfile ├── scrapy.cfg ├── LICENSE ├── hq-proxies.yml ├── .gitignore ├── README.md ├── config.yml └── start.py /proxy_spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | redis==2.10.5 2 | requests==2.13.0 3 | Scrapy==1.3.0 4 | Twisted==16.6.0 5 | PyYAML==3.12 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM daocloud.io/python:3.4-onbuild 2 | 3 | # Setting timezone 4 | RUN rm -f /etc/localtime 5 | RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime 6 | 7 | CMD [ "python", "./start.py"] -------------------------------------------------------------------------------- /proxy_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = proxy_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = proxy_spider 12 | -------------------------------------------------------------------------------- /proxy_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ProxySpiderPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /proxy_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ProxySpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 arthurmmm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to 
permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/hq-proxies.yml:
--------------------------------------------------------------------------------
---
# Fill in your Redis connection details
# Place this config file at /etc/hq-proxies.yml
REDIS_HOST: your_redis_host
REDIS_PORT: 6379
REDIS_PASSWORD: your_redis_password
REDIS_DB: your_redis_db

# Redis keys
PROXY_COUNT: hq-proxies:proxy_count
PROXY_SET: hq-proxies:proxy_pool
PROXY_PROTECT: hq-proxies:proxy_protect
PROXY_REFRESH: hq-proxies:proxy_refresh

# Refresh the pool when the proxy count drops below PROXY_LOW
PROXY_LOW: 5
# Force a refresh when the proxy count drops below PROXY_EXHAUST
PROXY_EXHAUST: 2

# How often to check proxy quality (seconds)
CHECK_INTERVAL: 10
# How often to check the proxy count (seconds)
LOOP_DELAY: 20
# Protection window after each fetch, to avoid refreshing too frequently (seconds)
PROTECT_SEC: 600
# Forced refresh interval (seconds)
REFRESH_SEC: 86400

# Configure proxy sources and validation pages
PROXY_VENDORS:
  - parser: parse_xici
    url: http://www.xicidaili.com/nn/
  - parser: parse_kxdaili
    url: http://www.kxdaili.com/dailiip/1/1.html#ip
  - parser: parse_ip181
    url: http://www.ip181.com/
  - parser: parse_66ip
    url: http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip

# Multiple validation pages can be configured; one is picked at random for each check
PROXY_VALIDATORS:
  - url: http://olbllni9a.bkt.clouddn.com/text/helloworld.txt
    startstring: hello world! :)

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Privacy
dbsetting.py
dbsetting_test.py
start.sh
stop.sh

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# hq-proxies

A simple dynamic proxy pool that keeps its proxies highly reliable through relatively frequent self-checks.

# Code structure
The code has three parts:
* A Scrapy spider that crawls proxy sites, collects free proxies, validates them, and stores them in the pool (proxy_fetch)
* A Scrapy spider that re-validates every proxy already in the pool and removes any that fail (proxy_check)
* A scheduler that manages the two spiders above (start.py)

![hq-proxies.png](http://upload-images.jianshu.io/upload_images/4610828-edbea71e6ff36157.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

# Deployment
First edit the config file hq-proxies.yml, fill in the Redis host, password, and so on, then place it at /etc/hq-proxies.yml.
The config file also lets you adjust the thresholds, the free proxy sources, and the validation pages.
The validation page is requested frequently, so to save bandwidth I dropped a hello-world text file on a cloud storage bucket and use that as the validation page. The bucket has a traffic quota, so you should replace it with your own. The validation itself is crude: it simply compares the start of the response body against an expected string.

A Dockerfile is included so the project can be deployed straight to Docker (the Python 3 base image comes from DaoCloud). When running the container, remember to map hq-proxies.yml to /etc/hq-proxies.yml inside the container.
For a manual deployment, run `pip install -r requirements.txt` to install the dependencies.

# Usage
To use the proxy pool from Scrapy, just add a downloader middleware that picks a random proxy from the Redis set with srandmember on each request. A dead proxy is handled like an ordinary request timeout and retried; because the pool keeps checking itself, the chance of drawing another dead proxy on retry is low. Example middleware:

```python
class DynamicProxyMiddleware(object):
    def process_request(self, request, spider):
        redis_db = StrictRedis(
            host=LOCAL_CONFIG['REDIS_HOST'],
            port=LOCAL_CONFIG['REDIS_PORT'],
            password=LOCAL_CONFIG['REDIS_PASSWORD'],
            db=LOCAL_CONFIG['REDIS_DB']
        )
        # pick a random proxy from the pool (srandmember returns bytes)
        proxy = redis_db.srandmember(PROXY_SET).decode('utf-8')
        logger.debug('Using proxy [%s] for [%s]' % (proxy, request.url))
        request.meta['proxy'] = proxy
```


Blog: http://blog.arthurmao.me/2017/02/python-redis-hq-proxies

Jianshu: http://www.jianshu.com/p/6cd4f1876b31

Log screenshot:
![Paste_Image.png](http://upload-images.jianshu.io/upload_images/4610828-29e8d33a438a606f.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
--------------------------------------------------------------------------------
/proxy_spider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for proxy_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'proxy_spider' 13 | 14 | SPIDER_MODULES = ['proxy_spider.spiders'] 15 | NEWSPIDER_MODULE = 'proxy_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | # DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | # } 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'proxy_spider.middlewares.ProxySpiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'proxy_spider.middlewares.ProxyPoolDownloaderMiddleware': 542, 57 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 351, 58 | 'proxy_spider.middlewares.ProxyPoolUserAgentMiddleware': 543, 59 | } 60 | DOWNLOAD_TIMEOUT = 3 61 | RETRY_ENABLED = False 62 | 63 | # DUPEFILTER_CLASS = 'proxy_spider.' 
64 | DUPEFILTER_DEBUG = True 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | #ITEM_PIPELINES = { 75 | # 'proxy_spider.pipelines.SomePipeline': 300, 76 | #} 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' -------------------------------------------------------------------------------- /proxy_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import random 5 | from scrapy import signals 6 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 7 | from scrapy.downloadermiddlewares.downloadtimeout import DownloadTimeoutMiddleware 8 | from scrapy.http import TextResponse 9 | import logging 10 | from twisted.web._newclient import ResponseNeverReceived 11 | from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class ProxyPoolUserAgentMiddleware(UserAgentMiddleware): 16 | 17 | #the default user_agent_list composes chrome,I E,firefox,Mozilla,opera,netscape 18 | #for more user agent strings,you can find it in http://www.useragentstring.com/pages/useragentstring.php 19 | user_agent_list = [ 20 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " 21 | "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 22 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 " 23 | "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 24 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 " 25 | "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 26 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 " 27 | "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 28 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 " 29 | "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 30 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 " 31 | "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 32 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 " 33 | "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 34 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " 35 | "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 36 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/536.3 " 37 | "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 38 | "Mozilla/5.0 (Macintosh; Intel 
Mac OS X 10_8_0) AppleWebKit/536.3 " 39 | "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 40 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " 41 | "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 42 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " 43 | "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 44 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " 45 | "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 46 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " 47 | "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 48 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/536.3 " 49 | "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 50 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " 51 | "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 52 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 " 53 | "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 54 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 " 55 | "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 56 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36" 57 | ] 58 | 59 | def __init__(self, user_agent=''): 60 | self.user_agent = user_agent 61 | 62 | def process_request(self, request, spider): 63 | ua = random.choice(self.user_agent_list) 64 | if ua: 65 | request.headers.setdefault('User-Agent', ua) 66 | 67 | class ProxyPoolDownloaderMiddleware(DownloadTimeoutMiddleware): 68 | DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, ResponseNeverReceived, ConnectError, ValueError, TypeError) 69 | 70 | def process_exception(self, request, exception, spider): 71 | if isinstance(exception, self.DONT_RETRY_ERRORS): 72 | return TextResponse(url=request.meta['proxy']) 73 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import sys 5 | import time 6 | import random 7 | import unittest 8 | import yaml 9 | from redis import StrictRedis 10 | from threading import Thread 11 | from datetime import datetime 12 | import logging 13 | from logging.handlers import RotatingFileHandler 14 | 15 | if __name__ == '__main__': 16 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.yml' 17 | with open(LOCAL_CONFIG_YAML, 'r') as f: 18 | LOCAL_CONFIG = yaml.load(f) 19 | fetchcmd = 'scrapy crawl proxy_fetch' 20 | checkcmd = 'scrapy crawl proxy_check > /dev/null 2>&1' 21 | log_path = '/data/logs/hq-proxies.log' 22 | else: 23 | print('测试模式!') 24 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.test.yml' 25 | with open(LOCAL_CONFIG_YAML, 'r') as f: 26 | LOCAL_CONFIG = yaml.load(f) 27 | fetchcmd = 'scrapy crawl proxy_fetch -a mode=test' 28 | checkcmd = 'scrapy crawl proxy_check -a mode=test' 29 | log_path = '/data/logs/hq-proxies.test.log' 30 | 31 | FORMAT = '%(asctime)s %(levelno)s/%(lineno)d: %(message)s' 32 | logging.basicConfig(level=logging.DEBUG, format=FORMAT) 33 | logger = logging.getLogger(__name__) 34 | logger.setLevel(logging.DEBUG) 35 | formatter = logging.Formatter(fmt=FORMAT) 36 | rfh = RotatingFileHandler(log_path, maxBytes=1*1024*1024, backupCount=10) 37 | rfh.setFormatter(formatter) 38 | rfh.setLevel(logging.DEBUG) 39 | logger.addHandler(rfh) 40 | 41 | # redis keys 42 | PROXY_COUNT = 'hq-proxies:proxy_count' 43 | PROXY_SET = 'hq-proxies:proxy_pool' 44 | PROXY_PROTECT = 'hq-proxies:proxy_protect' 45 | PROXY_REFRESH = 
'hq-proxies:proxy_refresh' 46 | 47 | # mongo collections 48 | VENDORS = 'vendors' 49 | VALIDATORS = 'validators' 50 | 51 | redis_db = StrictRedis( 52 | host=LOCAL_CONFIG['REDIS_HOST'], 53 | port=LOCAL_CONFIG['REDIS_PORT'], 54 | password=LOCAL_CONFIG['REDIS_PASSWORD'], 55 | db=LOCAL_CONFIG['REDIS_DB'] 56 | ) 57 | 58 | PROXY_LOW = 5 59 | PROXY_EXHAUST = 2 60 | 61 | CHECK_INTERVAL = 10 62 | LOOP_DELAY = 20 63 | PROTECT_SEC = 600 64 | REFRESH_SEC = 3600 * 24 65 | 66 | def startFetch(reason=None, fetchcmd='scrapy crawl proxy_fetch > /dev/null 2>&1'): 67 | logger.info(reason) 68 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 69 | redis_db.setex(PROXY_REFRESH, REFRESH_SEC, True) 70 | os.system(fetchcmd) 71 | 72 | def proxyFetch(single_run=False): 73 | while True: 74 | protect_ttl = redis_db.ttl(PROXY_PROTECT) 75 | refresh_ttl = redis_db.ttl(PROXY_REFRESH) 76 | 77 | pcount = redis_db.get(PROXY_COUNT) 78 | if not pcount: 79 | pcount = 0 80 | else: 81 | pcount = int(pcount) 82 | logger.info('代理数量:%s' % pcount) 83 | if pcount < PROXY_LOW and protect_ttl <= 0: 84 | startFetch('代理池存量低了,需要补充些代理... (*゜ー゜*)', fetchcmd) 85 | elif pcount < PROXY_EXHAUST: 86 | startFetch('代理池即将耗尽啦,需要立即补充些代理... Σ( ° △ °|||)', fetchcmd) 87 | elif pcount < PROXY_LOW and protect_ttl > 0: 88 | logger.info('代理池存量有点低,但尚在保护期,让我们继续观察一会... O__O') 89 | elif not refresh_ttl: 90 | startFetch('代理池太久没更新啦,补充些新鲜代理... ლ(╹◡╹ლ)', fetchcmd) 91 | else: 92 | logger.info('当前可用代理数:%s 库存情况良好... (๑•̀ㅂ•́)و✧' % pcount) 93 | 94 | protect_ttl = redis_db.ttl(PROXY_PROTECT) 95 | refresh_ttl = redis_db.ttl(PROXY_REFRESH) 96 | if protect_ttl > 0: 97 | logger.info('代理池尚在保护期, 剩余保护时间:%s' % protect_ttl) 98 | if refresh_ttl > 0: 99 | logger.info('距离下次常规更新还剩%s秒' % refresh_ttl) 100 | logger.info('%s秒后开始下次检测...' % LOOP_DELAY) 101 | 102 | if single_run: 103 | break 104 | time.sleep(LOOP_DELAY) 105 | 106 | def proxyCheck(single_run=False): 107 | while True: 108 | logger.info('检查库存代理质量...') 109 | os.system(checkcmd) 110 | pcount = redis_db.get(PROXY_COUNT) 111 | if pcount: 112 | pcount = int(pcount) 113 | else: 114 | pcount = 0 115 | logger.info('检查完成,存活代理数%s..' 
% pcount) 116 | if single_run: 117 | break 118 | time.sleep(CHECK_INTERVAL) 119 | 120 | def main(): 121 | logger.info('启动进程中...') 122 | # reset 'protect' and 'refresh' tag 123 | redis_db.delete(PROXY_PROTECT) 124 | redis_db.setex(PROXY_REFRESH, REFRESH_SEC, True) 125 | # start proxy-check thread 126 | check_thd = Thread(target=proxyCheck) 127 | check_thd.daemon = True 128 | check_thd.start() 129 | # start proxy-fetch thread 130 | fetch_thd = Thread(target=proxyFetch) 131 | fetch_thd.daemon = True 132 | fetch_thd.start() 133 | 134 | while True: 135 | if not check_thd.is_alive(): 136 | logger.error('自检线程已挂..重启中..') 137 | check_thd.start() 138 | if not fetch_thd.is_alive(): 139 | logger.error('抓取线程已挂..重启中..') 140 | fetch_thd.start() 141 | time.sleep(60) 142 | 143 | class TestCases(unittest.TestCase): 144 | def proxyCheck(self): 145 | proxyCheck(True) 146 | 147 | def proxyFetch(self): 148 | proxyFetch(True) 149 | 150 | def proxyExhaust(self): 151 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 152 | redis_db.set(PROXY_COUNT, 0) 153 | proxyFetch(True) 154 | 155 | def proxyLow(self): 156 | redis_db.delete(PROXY_PROTECT) 157 | redis_db.set(PROXY_COUNT, 3) 158 | proxyFetch(True) 159 | 160 | def proxyLowProtect(self): 161 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 162 | redis_db.set(PROXY_COUNT, 3) 163 | proxyFetch(True) 164 | 165 | def proxyRefresh(self): 166 | redis_db.delete(PROXY_REFRESH) 167 | redis_db.set(PROXY_COUNT, 10) 168 | proxyFetch(True) 169 | 170 | def loop(self): 171 | main() 172 | 173 | if __name__ == '__main__': 174 | main() -------------------------------------------------------------------------------- /start.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import sys 5 | import time 6 | import random 7 | import unittest 8 | import yaml 9 | from redis import StrictRedis 10 | from threading import Thread 11 | from datetime import datetime 12 | import logging 13 | from logging.handlers import RotatingFileHandler 14 | 15 | if __name__ == '__main__': 16 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.yml' 17 | with open(LOCAL_CONFIG_YAML, 'r') as f: 18 | LOCAL_CONFIG = yaml.load(f) 19 | fetchcmd = 'scrapy crawl proxy_fetch' 20 | checkcmd = 'scrapy crawl proxy_check > /dev/null 2>&1' 21 | else: 22 | print('测试模式!') 23 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.test.yml' 24 | with open(LOCAL_CONFIG_YAML, 'r') as f: 25 | LOCAL_CONFIG = yaml.load(f) 26 | fetchcmd = 'scrapy crawl proxy_fetch -a mode=test' 27 | checkcmd = 'scrapy crawl proxy_check -a mode=test' 28 | 29 | FORMAT = '%(asctime)s %(levelno)s/%(lineno)d: %(message)s' 30 | logging.basicConfig(level=logging.DEBUG, format=FORMAT) 31 | logger = logging.getLogger(__name__) 32 | 33 | # redis keys 34 | PROXY_COUNT = LOCAL_CONFIG['PROXY_COUNT'] 35 | PROXY_SET = LOCAL_CONFIG['PROXY_SET'] 36 | PROXY_PROTECT = LOCAL_CONFIG['PROXY_PROTECT'] 37 | PROXY_REFRESH = LOCAL_CONFIG['PROXY_REFRESH'] 38 | 39 | redis_db = StrictRedis( 40 | host=LOCAL_CONFIG['REDIS_HOST'], 41 | port=LOCAL_CONFIG['REDIS_PORT'], 42 | password=LOCAL_CONFIG['REDIS_PASSWORD'], 43 | db=LOCAL_CONFIG['REDIS_DB'] 44 | ) 45 | 46 | # settings 47 | PROXY_LOW = LOCAL_CONFIG['PROXY_LOW'] 48 | PROXY_EXHAUST = LOCAL_CONFIG['PROXY_EXHAUST'] 49 | 50 | CHECK_INTERVAL = LOCAL_CONFIG['CHECK_INTERVAL'] 51 | LOOP_DELAY = LOCAL_CONFIG['LOOP_DELAY'] 52 | PROTECT_SEC = LOCAL_CONFIG['PROTECT_SEC'] 53 | REFRESH_SEC = LOCAL_CONFIG['REFRESH_SEC'] 54 | 55 | def startFetch(reason=None, 
fetchcmd='scrapy crawl proxy_fetch > /dev/null 2>&1'): 56 | logger.info(reason) 57 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 58 | redis_db.setex(PROXY_REFRESH, REFRESH_SEC, True) 59 | os.system(fetchcmd) 60 | 61 | def proxyFetch(single_run=False, fake=False): 62 | while True: 63 | protect_ttl = redis_db.ttl(PROXY_PROTECT) 64 | refresh_ttl = redis_db.ttl(PROXY_REFRESH) 65 | 66 | pcount = redis_db.get(PROXY_COUNT) 67 | if not pcount: 68 | pcount = 0 69 | else: 70 | pcount = int(pcount) 71 | logger.info('代理数量:%s' % pcount) 72 | if pcount < PROXY_LOW and protect_ttl <= 0: 73 | msg = '代理池存量低了,需要补充些代理... (*゜ー゜*)' 74 | if fake: 75 | logger.debug(msg) 76 | else: 77 | startFetch(msg, fetchcmd) 78 | elif pcount < PROXY_EXHAUST: 79 | msg = '代理池即将耗尽啦,需要立即补充些代理... Σ( ° △ °|||)' 80 | if fake: 81 | logger.debug(msg) 82 | else: 83 | startFetch(msg, fetchcmd) 84 | elif pcount < PROXY_LOW and protect_ttl > 0: 85 | msg = '代理池存量有点低,但尚在保护期,让我们继续观察一会... O__O' 86 | if fake: 87 | logger.debug(msg) 88 | else: 89 | logger.info(msg) 90 | elif not refresh_ttl: 91 | msg = '代理池太久没更新啦,补充些新鲜代理... ლ(╹◡╹ლ)' 92 | if fake: 93 | logger.debug(msg) 94 | else: 95 | startFetch(msg, fetchcmd) 96 | else: 97 | logger.info('当前可用代理数:%s 库存情况良好... (๑•̀ㅂ•́)و✧' % pcount) 98 | 99 | protect_ttl = redis_db.ttl(PROXY_PROTECT) 100 | refresh_ttl = redis_db.ttl(PROXY_REFRESH) 101 | if protect_ttl > 0: 102 | logger.info('代理池尚在保护期, 剩余保护时间:%s' % protect_ttl) 103 | if refresh_ttl > 0: 104 | logger.info('距离下次常规更新还剩%s秒' % refresh_ttl) 105 | logger.info('%s秒后开始下次检测...' % LOOP_DELAY) 106 | 107 | if single_run: 108 | break 109 | time.sleep(LOOP_DELAY) 110 | 111 | def proxyCheck(single_run=False, fake=False): 112 | while True: 113 | logger.info('检查库存代理质量...') 114 | os.system(checkcmd) 115 | pcount = redis_db.get(PROXY_COUNT) 116 | if pcount: 117 | pcount = int(pcount) 118 | else: 119 | pcount = 0 120 | logger.info('检查完成,存活代理数%s..' 
% pcount) 121 | if single_run: 122 | break 123 | time.sleep(CHECK_INTERVAL) 124 | 125 | def main(): 126 | logger.info('启动进程中...') 127 | # reset 'protect' and 'refresh' tag 128 | redis_db.delete(PROXY_PROTECT) 129 | redis_db.setex(PROXY_REFRESH, REFRESH_SEC, True) 130 | # start proxy-check thread 131 | check_thd = Thread(target=proxyCheck) 132 | check_thd.daemon = True 133 | check_thd.start() 134 | # start proxy-fetch thread 135 | fetch_thd = Thread(target=proxyFetch) 136 | fetch_thd.daemon = True 137 | fetch_thd.start() 138 | 139 | while True: 140 | if not check_thd.is_alive(): 141 | logger.error('自检线程已挂..重启中..') 142 | check_thd.start() 143 | if not fetch_thd.is_alive(): 144 | logger.error('抓取线程已挂..重启中..') 145 | fetch_thd.start() 146 | time.sleep(60) 147 | 148 | class TestCases(unittest.TestCase): 149 | def test_proxyFetch(self): 150 | proxyFetch(True) 151 | def test_proxyCheck(self): 152 | proxyCheck(True) 153 | 154 | def test_proxyExhaust(self): 155 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 156 | redis_db.set(PROXY_COUNT, 0) 157 | proxyFetch(True, True) 158 | 159 | def test_proxyLow(self): 160 | redis_db.delete(PROXY_PROTECT) 161 | redis_db.set(PROXY_COUNT, 3) 162 | proxyFetch(True, True) 163 | 164 | def test_proxyLowProtect(self): 165 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 166 | redis_db.set(PROXY_COUNT, 3) 167 | proxyFetch(True, True) 168 | 169 | def test_proxyRefresh(self): 170 | redis_db.delete(PROXY_REFRESH) 171 | redis_db.set(PROXY_COUNT, 10) 172 | proxyFetch(True, True) 173 | 174 | def loop(self): 175 | main() 176 | 177 | if __name__ == '__main__': 178 | main() -------------------------------------------------------------------------------- /proxy_spider/spiders/proxy_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import time 5 | from datetime import datetime 6 | from redis import StrictRedis 7 | import re 8 | import random 9 | import requests 10 | import yaml 11 | from scrapy import Spider, Request 12 | from scrapy.http import HtmlResponse 13 | from collections import defaultdict 14 | 15 | import logging 16 | from logging.handlers import RotatingFileHandler 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | class ProxyCheckSpider(Spider): 21 | ''' Spider to crawl free proxy servers for intern 22 | ''' 23 | name = 'proxy_check' 24 | 25 | def __init__(self, mode='prod', *args, **kwargs): 26 | if mode == 'prod': 27 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.yml' 28 | elif mode == 'test': 29 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.test.yml' 30 | with open(LOCAL_CONFIG_YAML, 'r') as f: 31 | LOCAL_CONFIG = yaml.load(f) 32 | 33 | self.redis_db = StrictRedis( 34 | host=LOCAL_CONFIG['REDIS_HOST'], 35 | port=LOCAL_CONFIG['REDIS_PORT'], 36 | password=LOCAL_CONFIG['REDIS_PASSWORD'], 37 | db=LOCAL_CONFIG['REDIS_DB'] 38 | ) 39 | 40 | self.validator_pool = set([]) 41 | for validator in LOCAL_CONFIG['PROXY_VALIDATORS']: 42 | self.validator_pool.add((validator['url'], validator['startstring'])) 43 | self.PROXY_COUNT = LOCAL_CONFIG['PROXY_COUNT'] 44 | self.PROXY_SET = LOCAL_CONFIG['PROXY_SET'] 45 | 46 | def start_requests(self): 47 | 48 | logger.info('测试代理池内代理质量...') 49 | self.redis_db.set(self.PROXY_COUNT, self.redis_db.scard(self.PROXY_SET)) 50 | for proxy in self.redis_db.smembers(self.PROXY_SET): 51 | proxy = proxy.decode('utf-8') 52 | vaurl, vastart = random.choice(list(self.validator_pool)) 53 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, 
callback=self.checkin, dont_filter=True) 54 | 55 | def checkin(self, response): 56 | res = response.body_as_unicode() 57 | if 'startstring' in response.meta and res.startswith(response.meta['startstring']): 58 | proxy = response.meta['proxy'] 59 | self.redis_db.sadd(self.PROXY_SET, proxy) 60 | logger.info('可用代理+1 %s' % proxy) 61 | yield None 62 | else: 63 | proxy = response.url if 'proxy' not in response.meta else response.meta['proxy'] 64 | self.redis_db.srem(self.PROXY_SET, proxy) 65 | logger.info('无效代理 %s' % proxy) 66 | yield None 67 | 68 | def closed(self, reason): 69 | pcount = self.redis_db.scard(self.PROXY_SET) 70 | logger.info('代理池测试完成,有效代理数: %s' % pcount) 71 | self.redis_db.set(self.PROXY_COUNT, pcount) 72 | 73 | class ProxyFetchSpider(Spider): 74 | name = 'proxy_fetch' 75 | loop_delay = 10 76 | protect_sec = 180 77 | 78 | def __init__(self, mode='prod', *args, **kwargs): 79 | if mode == 'prod': 80 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.yml' 81 | elif mode == 'test': 82 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.test.yml' 83 | with open(LOCAL_CONFIG_YAML, 'r') as f: 84 | LOCAL_CONFIG = yaml.load(f) 85 | 86 | self.redis_db = StrictRedis( 87 | host=LOCAL_CONFIG['REDIS_HOST'], 88 | port=LOCAL_CONFIG['REDIS_PORT'], 89 | password=LOCAL_CONFIG['REDIS_PASSWORD'], 90 | db=LOCAL_CONFIG['REDIS_DB'] 91 | ) 92 | self.PROXY_COUNT = LOCAL_CONFIG['PROXY_COUNT'] 93 | self.PROXY_SET = LOCAL_CONFIG['PROXY_SET'] 94 | 95 | self.validator_pool = set([]) 96 | for validator in LOCAL_CONFIG['PROXY_VALIDATORS']: 97 | self.validator_pool.add((validator['url'], validator['startstring'])) 98 | 99 | self.vendors = LOCAL_CONFIG['PROXY_VENDORS'] 100 | 101 | def start_requests(self): 102 | for vendor in self.vendors: 103 | logger.debug(vendor) 104 | callback = getattr(self, vendor['parser']) 105 | yield Request(url=vendor['url'], callback=callback) 106 | 107 | def checkin(self, response): 108 | res = response.body_as_unicode() 109 | if 'startstring' in response.meta and res.startswith(response.meta['startstring']): 110 | proxy = response.meta['proxy'] 111 | self.redis_db.sadd(self.PROXY_SET, proxy) 112 | logger.info('可用代理+1 %s' % proxy) 113 | yield None 114 | else: 115 | proxy = response.url if 'proxy' not in response.meta else response.meta['proxy'] 116 | logger.info('无效代理 %s' % proxy) 117 | yield None 118 | 119 | def parse_xici(self, response): 120 | ''' 121 | @url http://www.xicidaili.com/nn/ 122 | ''' 123 | logger.info('解析http://www.xicidaili.com/nn/') 124 | succ = 0 125 | fail = 0 126 | count = 0 127 | for tr in response.css('#ip_list tr'): 128 | td_list = tr.css('td::text') 129 | if len(td_list) < 3: 130 | continue 131 | ipaddr = td_list[0].extract() 132 | port = td_list[1].extract() 133 | proto = td_list[5].extract() 134 | latency = tr.css('div.bar::attr(title)').extract_first() 135 | latency = re.match('(\d+\.\d+)秒', latency).group(1) 136 | proxy = '%s://%s:%s' % (proto, ipaddr, port) 137 | proxies = {proto: '%s:%s' % (ipaddr, port)} 138 | if float(latency) > 3: 139 | logger.info('丢弃慢速代理: %s 延迟%s秒' % (proxy, latency)) 140 | continue 141 | logger.info('验证: %s' % proxy) 142 | if not self.redis_db.sismember(self.PROXY_SET, proxy): 143 | vaurl, vastart = random.choice(list(self.validator_pool)) 144 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True) 145 | else: 146 | logger.info('该代理已收录..') 147 | 148 | def parse_66ip(self, response): 149 | ''' 150 | @url 
http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip 151 | ''' 152 | logger.info('开始爬取66ip') 153 | if 'proxy' in response.meta: 154 | logger.info('=>使用代理%s' % response.meta['proxy']) 155 | res = response.body_as_unicode() 156 | for addr in re.findall('\d+\.\d+\.\d+\.\d+\:\d+', res): 157 | proxy = 'http://' + addr 158 | print(proxy) 159 | logger.info('验证: %s' % proxy) 160 | if not self.redis_db.sismember(self.PROXY_SET, proxy): 161 | vaurl, vastart = random.choice(list(self.validator_pool)) 162 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True) 163 | else: 164 | logger.info('该代理已收录..') 165 | 166 | def parse_ip181(self, response): 167 | ''' 168 | @url http://www.ip181.com/ 169 | ''' 170 | logger.info('开始爬取ip181') 171 | if 'proxy' in response.meta: 172 | logger.info('=>使用代理%s' % response.meta['proxy']) 173 | for tr in response.css('table tbody tr'): 174 | ip = tr.css('td::text').extract()[0] 175 | port = tr.css('td::text').extract()[1] 176 | type = tr.css('td::text').extract()[2] 177 | proxy = 'http://%s:%s' % (ip, port) 178 | if type != '高匿': 179 | logger.info('丢弃非高匿代理:%s' % proxy) 180 | continue 181 | logger.info('验证: %s' % proxy) 182 | if not self.redis_db.sismember(self.PROXY_SET, proxy): 183 | vaurl, vastart = random.choice(list(self.validator_pool)) 184 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True) 185 | else: 186 | logger.info('该代理已收录..') 187 | 188 | def parse_kxdaili(self, response): 189 | ''' 190 | @url http://www.kxdaili.com/dailiip/1/1.html#ip 191 | ''' 192 | logger.info('开始爬取kxdaili') 193 | if 'proxy' in response.meta: 194 | logger.info('=>使用代理%s' % response.meta['proxy']) 195 | url_pattern = 'http://www.kxdaili.com/dailiip/1/%s.html#ip' 196 | try: 197 | page = re.search('(\d)+\.html', response.url).group(1) 198 | page = int(page) 199 | except Exception as e: 200 | logger.exception(e) 201 | logger.error(response.url) 202 | for tr in response.css('table.ui.table.segment tbody tr'): 203 | ip = tr.css('td::text').extract()[0] 204 | port = tr.css('td::text').extract()[1] 205 | proxy = 'http://%s:%s' % (ip, port) 206 | logger.info('验证: %s' % proxy) 207 | if not self.redis_db.sismember(self.PROXY_SET, proxy): 208 | vaurl, vastart = random.choice(list(self.validator_pool)) 209 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True) 210 | else: 211 | logger.info('该代理已收录..') 212 | if page < 3: # 爬取前3页 213 | page += 1 214 | new_url = url_pattern % page 215 | new_meta = response.meta.copy() 216 | new_meta['page'] = page 217 | yield Request(url=new_url, meta=new_meta, callback=self.parse_kxdaili) 218 | 219 | def closed(self, reason): 220 | logger.info('代理池更新完成,有效代理数: %s' % self.redis_db.scard(self.PROXY_SET)) 221 | 222 | --------------------------------------------------------------------------------
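The middleware example in README.md covers use inside Scrapy. As a complement, below is a minimal, hypothetical sketch of how a non-Scrapy client could draw a proxy from the same Redis set and use it with `requests` (already pinned in requirements.txt). The config path and key names come from hq-proxies.yml; the target URL and the empty-pool handling are illustrative assumptions, not part of the project.

```python
# Hypothetical standalone client -- not part of the repo.
# Assumes /etc/hq-proxies.yml is filled in and proxy_fetch has populated the pool.
import yaml
import requests
from redis import StrictRedis

with open('/etc/hq-proxies.yml', 'r') as f:
    cfg = yaml.load(f)  # PyYAML 3.12, as pinned in requirements.txt

redis_db = StrictRedis(
    host=cfg['REDIS_HOST'],
    port=cfg['REDIS_PORT'],
    password=cfg['REDIS_PASSWORD'],
    db=cfg['REDIS_DB'],
)

# srandmember returns a random member as bytes, or None when the set is empty
proxy = redis_db.srandmember(cfg['PROXY_SET'])
if proxy is None:
    raise RuntimeError('proxy pool is empty -- wait for proxy_fetch to run')
proxy = proxy.decode('utf-8')  # proxies are stored as e.g. 'http://1.2.3.4:8080'

# Route a one-off request through the proxy; free proxies die quickly,
# so on failure simply draw another proxy and retry.
resp = requests.get('http://www.example.com/', proxies={'http': proxy}, timeout=5)
print(resp.status_code)
```

Inside Scrapy the same lookup happens in the README's DynamicProxyMiddleware, which sets request.meta['proxy'] instead of building a proxies dict.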