├── proxy_spider ├── __init__.py ├── spiders │ ├── __init__.py │ └── proxy_spider.py ├── pipelines.py ├── items.py ├── settings.py └── middlewares.py ├── requirements.txt ├── Dockerfile ├── scrapy.cfg ├── LICENSE ├── hq-proxies.yml ├── .gitignore ├── README.md ├── config.yml └── start.py /proxy_spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | redis==2.10.5 2 | requests==2.13.0 3 | Scrapy==1.3.0 4 | Twisted==16.6.0 5 | PyYAML==3.12 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM daocloud.io/python:3.4-onbuild 2 | 3 | # Setting timezone 4 | RUN rm -f /etc/localtime 5 | RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime 6 | 7 | CMD [ "python", "./start.py"] -------------------------------------------------------------------------------- /proxy_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = proxy_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = proxy_spider 12 | -------------------------------------------------------------------------------- /proxy_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ProxySpiderPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /proxy_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ProxySpiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 arthurmmm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to 
permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/hq-proxies.yml:
--------------------------------------------------------------------------------
---
# Fill in your Redis connection details
# Place this config file at /etc/hq-proxies.yml
REDIS_HOST: your_redis_host
REDIS_PORT: 6379
REDIS_PASSWORD: your_redis_password
REDIS_DB: your_redis_db

# Redis keys
PROXY_COUNT: hq-proxies:proxy_count
PROXY_SET: hq-proxies:proxy_pool
PROXY_PROTECT: hq-proxies:proxy_protect
PROXY_REFRESH: hq-proxies:proxy_refresh

# Refresh the pool when the proxy count drops below PROXY_LOW
PROXY_LOW: 5
# Force a refresh when the proxy count drops below PROXY_EXHAUST
PROXY_EXHAUST: 2

# How often to check proxy quality (seconds)
CHECK_INTERVAL: 10
# How often to check the proxy count (seconds)
LOOP_DELAY: 20
# Protection window after each fetch, to avoid refreshing too frequently (seconds)
PROTECT_SEC: 600
# Forced refresh interval (seconds)
REFRESH_SEC: 86400

# Configure proxy sources and validation pages
PROXY_VENDORS:
  - parser: parse_xici
    url: http://www.xicidaili.com/nn/
  - parser: parse_kxdaili
    url: http://www.kxdaili.com/dailiip/1/1.html#ip
  - parser: parse_ip181
    url: http://www.ip181.com/
  - parser: parse_66ip
    url: http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip

# Multiple validation pages can be configured; one is picked at random for each check
PROXY_VALIDATORS:
  - url: http://olbllni9a.bkt.clouddn.com/text/helloworld.txt
    startstring: hello world! :)

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Privacy
dbsetting.py
dbsetting_test.py
start.sh
stop.sh

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# hq-proxies

A simple dynamic proxy pool that keeps its proxies highly reliable through relatively frequent self-checks.

# Code structure
The code has three parts:
* A Scrapy spider that crawls proxy sites, collects free proxies, validates them, and stores them in the pool (proxy_fetch)
* A Scrapy spider that re-validates every proxy already in the pool and removes any that fail (proxy_check)
* A scheduler that manages the two spiders above (start.py)

![hq-proxies.png](http://upload-images.jianshu.io/upload_images/4610828-edbea71e6ff36157.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

# Deployment
First edit the config file hq-proxies.yml, fill in the Redis host, password, and so on, then place it at /etc/hq-proxies.yml.
The config file also lets you adjust the thresholds, the free proxy sources, and the validation pages.
The validation page is requested frequently, so to save bandwidth I dropped a hello-world text file on a cloud storage bucket and use that as the validation page. The bucket has a traffic quota, so you should replace it with your own. The validation itself is crude: it simply compares the start of the response body against an expected string.

A Dockerfile is included so the project can be deployed straight to Docker (the Python 3 base image comes from DaoCloud). When running the container, remember to map hq-proxies.yml to /etc/hq-proxies.yml inside the container.
For a manual deployment, run `pip install -r requirements.txt` to install the dependencies.

# Usage
To use the proxy pool from Scrapy, just add a downloader middleware that picks a random proxy from the Redis set with srandmember on each request. A dead proxy is handled like an ordinary request timeout and retried; because the pool keeps checking itself, the chance of drawing another dead proxy on retry is low. Example middleware:

```python
class DynamicProxyMiddleware(object):
    def process_request(self, request, spider):
        redis_db = StrictRedis(
            host=LOCAL_CONFIG['REDIS_HOST'],
            port=LOCAL_CONFIG['REDIS_PORT'],
            password=LOCAL_CONFIG['REDIS_PASSWORD'],
            db=LOCAL_CONFIG['REDIS_DB']
        )
        # pick a random proxy from the pool (srandmember returns bytes)
        proxy = redis_db.srandmember(PROXY_SET).decode('utf-8')
        logger.debug('Using proxy [%s] for [%s]' % (proxy, request.url))
        request.meta['proxy'] = proxy
```


Blog: http://blog.arthurmao.me/2017/02/python-redis-hq-proxies

Jianshu: http://www.jianshu.com/p/6cd4f1876b31

Log screenshot:
![Paste_Image.png](http://upload-images.jianshu.io/upload_images/4610828-29e8d33a438a606f.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
--------------------------------------------------------------------------------
/proxy_spider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for proxy_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'proxy_spider' 13 | 14 | SPIDER_MODULES = ['proxy_spider.spiders'] 15 | NEWSPIDER_MODULE = 'proxy_spider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | # DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | # } 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'proxy_spider.middlewares.ProxySpiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'proxy_spider.middlewares.ProxyPoolDownloaderMiddleware': 542, 57 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 351, 58 | 'proxy_spider.middlewares.ProxyPoolUserAgentMiddleware': 543, 59 | } 60 | DOWNLOAD_TIMEOUT = 3 61 | RETRY_ENABLED = False 62 | 63 | # DUPEFILTER_CLASS = 'proxy_spider.' 
64 | DUPEFILTER_DEBUG = True 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | #ITEM_PIPELINES = { 75 | # 'proxy_spider.pipelines.SomePipeline': 300, 76 | #} 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' -------------------------------------------------------------------------------- /proxy_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import random 5 | from scrapy import signals 6 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 7 | from scrapy.downloadermiddlewares.downloadtimeout import DownloadTimeoutMiddleware 8 | from scrapy.http import TextResponse 9 | import logging 10 | from twisted.web._newclient import ResponseNeverReceived 11 | from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class ProxyPoolUserAgentMiddleware(UserAgentMiddleware): 16 | 17 | #the default user_agent_list composes chrome,I E,firefox,Mozilla,opera,netscape 18 | #for more user agent strings,you can find it in http://www.useragentstring.com/pages/useragentstring.php 19 | user_agent_list = [ 20 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 " 21 | "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 22 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 " 23 | "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 24 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 " 25 | "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 26 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 " 27 | "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 28 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 " 29 | "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 30 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 " 31 | "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 32 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 " 33 | "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 34 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " 35 | "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 36 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/536.3 " 37 | "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 38 | "Mozilla/5.0 (Macintosh; Intel 
Mac OS X 10_8_0) AppleWebKit/536.3 " 39 | "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 40 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " 41 | "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 42 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " 43 | "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 44 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " 45 | "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 46 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 " 47 | "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 48 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/536.3 " 49 | "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 50 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 " 51 | "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 52 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 " 53 | "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 54 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 " 55 | "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 56 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36" 57 | ] 58 | 59 | def __init__(self, user_agent=''): 60 | self.user_agent = user_agent 61 | 62 | def process_request(self, request, spider): 63 | ua = random.choice(self.user_agent_list) 64 | if ua: 65 | request.headers.setdefault('User-Agent', ua) 66 | 67 | class ProxyPoolDownloaderMiddleware(DownloadTimeoutMiddleware): 68 | DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, ResponseNeverReceived, ConnectError, ValueError, TypeError) 69 | 70 | def process_exception(self, request, exception, spider): 71 | if isinstance(exception, self.DONT_RETRY_ERRORS): 72 | return TextResponse(url=request.meta['proxy']) 73 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import sys 5 | import time 6 | import random 7 | import unittest 8 | import yaml 9 | from redis import StrictRedis 10 | from threading import Thread 11 | from datetime import datetime 12 | import logging 13 | from logging.handlers import RotatingFileHandler 14 | 15 | if __name__ == '__main__': 16 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.yml' 17 | with open(LOCAL_CONFIG_YAML, 'r') as f: 18 | LOCAL_CONFIG = yaml.load(f) 19 | fetchcmd = 'scrapy crawl proxy_fetch' 20 | checkcmd = 'scrapy crawl proxy_check > /dev/null 2>&1' 21 | log_path = '/data/logs/hq-proxies.log' 22 | else: 23 | print('测试模式!') 24 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.test.yml' 25 | with open(LOCAL_CONFIG_YAML, 'r') as f: 26 | LOCAL_CONFIG = yaml.load(f) 27 | fetchcmd = 'scrapy crawl proxy_fetch -a mode=test' 28 | checkcmd = 'scrapy crawl proxy_check -a mode=test' 29 | log_path = '/data/logs/hq-proxies.test.log' 30 | 31 | FORMAT = '%(asctime)s %(levelno)s/%(lineno)d: %(message)s' 32 | logging.basicConfig(level=logging.DEBUG, format=FORMAT) 33 | logger = logging.getLogger(__name__) 34 | logger.setLevel(logging.DEBUG) 35 | formatter = logging.Formatter(fmt=FORMAT) 36 | rfh = RotatingFileHandler(log_path, maxBytes=1*1024*1024, backupCount=10) 37 | rfh.setFormatter(formatter) 38 | rfh.setLevel(logging.DEBUG) 39 | logger.addHandler(rfh) 40 | 41 | # redis keys 42 | PROXY_COUNT = 'hq-proxies:proxy_count' 43 | PROXY_SET = 'hq-proxies:proxy_pool' 44 | PROXY_PROTECT = 'hq-proxies:proxy_protect' 45 | PROXY_REFRESH = 
'hq-proxies:proxy_refresh' 46 | 47 | # mongo collections 48 | VENDORS = 'vendors' 49 | VALIDATORS = 'validators' 50 | 51 | redis_db = StrictRedis( 52 | host=LOCAL_CONFIG['REDIS_HOST'], 53 | port=LOCAL_CONFIG['REDIS_PORT'], 54 | password=LOCAL_CONFIG['REDIS_PASSWORD'], 55 | db=LOCAL_CONFIG['REDIS_DB'] 56 | ) 57 | 58 | PROXY_LOW = 5 59 | PROXY_EXHAUST = 2 60 | 61 | CHECK_INTERVAL = 10 62 | LOOP_DELAY = 20 63 | PROTECT_SEC = 600 64 | REFRESH_SEC = 3600 * 24 65 | 66 | def startFetch(reason=None, fetchcmd='scrapy crawl proxy_fetch > /dev/null 2>&1'): 67 | logger.info(reason) 68 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 69 | redis_db.setex(PROXY_REFRESH, REFRESH_SEC, True) 70 | os.system(fetchcmd) 71 | 72 | def proxyFetch(single_run=False): 73 | while True: 74 | protect_ttl = redis_db.ttl(PROXY_PROTECT) 75 | refresh_ttl = redis_db.ttl(PROXY_REFRESH) 76 | 77 | pcount = redis_db.get(PROXY_COUNT) 78 | if not pcount: 79 | pcount = 0 80 | else: 81 | pcount = int(pcount) 82 | logger.info('代理数量:%s' % pcount) 83 | if pcount < PROXY_LOW and protect_ttl <= 0: 84 | startFetch('代理池存量低了,需要补充些代理... (*゜ー゜*)', fetchcmd) 85 | elif pcount < PROXY_EXHAUST: 86 | startFetch('代理池即将耗尽啦,需要立即补充些代理... Σ( ° △ °|||)', fetchcmd) 87 | elif pcount < PROXY_LOW and protect_ttl > 0: 88 | logger.info('代理池存量有点低,但尚在保护期,让我们继续观察一会... O__O') 89 | elif not refresh_ttl: 90 | startFetch('代理池太久没更新啦,补充些新鲜代理... ლ(╹◡╹ლ)', fetchcmd) 91 | else: 92 | logger.info('当前可用代理数:%s 库存情况良好... (๑•̀ㅂ•́)و✧' % pcount) 93 | 94 | protect_ttl = redis_db.ttl(PROXY_PROTECT) 95 | refresh_ttl = redis_db.ttl(PROXY_REFRESH) 96 | if protect_ttl > 0: 97 | logger.info('代理池尚在保护期, 剩余保护时间:%s' % protect_ttl) 98 | if refresh_ttl > 0: 99 | logger.info('距离下次常规更新还剩%s秒' % refresh_ttl) 100 | logger.info('%s秒后开始下次检测...' % LOOP_DELAY) 101 | 102 | if single_run: 103 | break 104 | time.sleep(LOOP_DELAY) 105 | 106 | def proxyCheck(single_run=False): 107 | while True: 108 | logger.info('检查库存代理质量...') 109 | os.system(checkcmd) 110 | pcount = redis_db.get(PROXY_COUNT) 111 | if pcount: 112 | pcount = int(pcount) 113 | else: 114 | pcount = 0 115 | logger.info('检查完成,存活代理数%s..' 
% pcount) 116 | if single_run: 117 | break 118 | time.sleep(CHECK_INTERVAL) 119 | 120 | def main(): 121 | logger.info('启动进程中...') 122 | # reset 'protect' and 'refresh' tag 123 | redis_db.delete(PROXY_PROTECT) 124 | redis_db.setex(PROXY_REFRESH, REFRESH_SEC, True) 125 | # start proxy-check thread 126 | check_thd = Thread(target=proxyCheck) 127 | check_thd.daemon = True 128 | check_thd.start() 129 | # start proxy-fetch thread 130 | fetch_thd = Thread(target=proxyFetch) 131 | fetch_thd.daemon = True 132 | fetch_thd.start() 133 | 134 | while True: 135 | if not check_thd.is_alive(): 136 | logger.error('自检线程已挂..重启中..') 137 | check_thd.start() 138 | if not fetch_thd.is_alive(): 139 | logger.error('抓取线程已挂..重启中..') 140 | fetch_thd.start() 141 | time.sleep(60) 142 | 143 | class TestCases(unittest.TestCase): 144 | def proxyCheck(self): 145 | proxyCheck(True) 146 | 147 | def proxyFetch(self): 148 | proxyFetch(True) 149 | 150 | def proxyExhaust(self): 151 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 152 | redis_db.set(PROXY_COUNT, 0) 153 | proxyFetch(True) 154 | 155 | def proxyLow(self): 156 | redis_db.delete(PROXY_PROTECT) 157 | redis_db.set(PROXY_COUNT, 3) 158 | proxyFetch(True) 159 | 160 | def proxyLowProtect(self): 161 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 162 | redis_db.set(PROXY_COUNT, 3) 163 | proxyFetch(True) 164 | 165 | def proxyRefresh(self): 166 | redis_db.delete(PROXY_REFRESH) 167 | redis_db.set(PROXY_COUNT, 10) 168 | proxyFetch(True) 169 | 170 | def loop(self): 171 | main() 172 | 173 | if __name__ == '__main__': 174 | main() -------------------------------------------------------------------------------- /start.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import sys 5 | import time 6 | import random 7 | import unittest 8 | import yaml 9 | from redis import StrictRedis 10 | from threading import Thread 11 | from datetime import datetime 12 | import logging 13 | from logging.handlers import RotatingFileHandler 14 | 15 | if __name__ == '__main__': 16 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.yml' 17 | with open(LOCAL_CONFIG_YAML, 'r') as f: 18 | LOCAL_CONFIG = yaml.load(f) 19 | fetchcmd = 'scrapy crawl proxy_fetch' 20 | checkcmd = 'scrapy crawl proxy_check > /dev/null 2>&1' 21 | else: 22 | print('测试模式!') 23 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.test.yml' 24 | with open(LOCAL_CONFIG_YAML, 'r') as f: 25 | LOCAL_CONFIG = yaml.load(f) 26 | fetchcmd = 'scrapy crawl proxy_fetch -a mode=test' 27 | checkcmd = 'scrapy crawl proxy_check -a mode=test' 28 | 29 | FORMAT = '%(asctime)s %(levelno)s/%(lineno)d: %(message)s' 30 | logging.basicConfig(level=logging.DEBUG, format=FORMAT) 31 | logger = logging.getLogger(__name__) 32 | 33 | # redis keys 34 | PROXY_COUNT = LOCAL_CONFIG['PROXY_COUNT'] 35 | PROXY_SET = LOCAL_CONFIG['PROXY_SET'] 36 | PROXY_PROTECT = LOCAL_CONFIG['PROXY_PROTECT'] 37 | PROXY_REFRESH = LOCAL_CONFIG['PROXY_REFRESH'] 38 | 39 | redis_db = StrictRedis( 40 | host=LOCAL_CONFIG['REDIS_HOST'], 41 | port=LOCAL_CONFIG['REDIS_PORT'], 42 | password=LOCAL_CONFIG['REDIS_PASSWORD'], 43 | db=LOCAL_CONFIG['REDIS_DB'] 44 | ) 45 | 46 | # settings 47 | PROXY_LOW = LOCAL_CONFIG['PROXY_LOW'] 48 | PROXY_EXHAUST = LOCAL_CONFIG['PROXY_EXHAUST'] 49 | 50 | CHECK_INTERVAL = LOCAL_CONFIG['CHECK_INTERVAL'] 51 | LOOP_DELAY = LOCAL_CONFIG['LOOP_DELAY'] 52 | PROTECT_SEC = LOCAL_CONFIG['PROTECT_SEC'] 53 | REFRESH_SEC = LOCAL_CONFIG['REFRESH_SEC'] 54 | 55 | def startFetch(reason=None, 
fetchcmd='scrapy crawl proxy_fetch > /dev/null 2>&1'): 56 | logger.info(reason) 57 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 58 | redis_db.setex(PROXY_REFRESH, REFRESH_SEC, True) 59 | os.system(fetchcmd) 60 | 61 | def proxyFetch(single_run=False, fake=False): 62 | while True: 63 | protect_ttl = redis_db.ttl(PROXY_PROTECT) 64 | refresh_ttl = redis_db.ttl(PROXY_REFRESH) 65 | 66 | pcount = redis_db.get(PROXY_COUNT) 67 | if not pcount: 68 | pcount = 0 69 | else: 70 | pcount = int(pcount) 71 | logger.info('代理数量:%s' % pcount) 72 | if pcount < PROXY_LOW and protect_ttl <= 0: 73 | msg = '代理池存量低了,需要补充些代理... (*゜ー゜*)' 74 | if fake: 75 | logger.debug(msg) 76 | else: 77 | startFetch(msg, fetchcmd) 78 | elif pcount < PROXY_EXHAUST: 79 | msg = '代理池即将耗尽啦,需要立即补充些代理... Σ( ° △ °|||)' 80 | if fake: 81 | logger.debug(msg) 82 | else: 83 | startFetch(msg, fetchcmd) 84 | elif pcount < PROXY_LOW and protect_ttl > 0: 85 | msg = '代理池存量有点低,但尚在保护期,让我们继续观察一会... O__O' 86 | if fake: 87 | logger.debug(msg) 88 | else: 89 | logger.info(msg) 90 | elif not refresh_ttl: 91 | msg = '代理池太久没更新啦,补充些新鲜代理... ლ(╹◡╹ლ)' 92 | if fake: 93 | logger.debug(msg) 94 | else: 95 | startFetch(msg, fetchcmd) 96 | else: 97 | logger.info('当前可用代理数:%s 库存情况良好... (๑•̀ㅂ•́)و✧' % pcount) 98 | 99 | protect_ttl = redis_db.ttl(PROXY_PROTECT) 100 | refresh_ttl = redis_db.ttl(PROXY_REFRESH) 101 | if protect_ttl > 0: 102 | logger.info('代理池尚在保护期, 剩余保护时间:%s' % protect_ttl) 103 | if refresh_ttl > 0: 104 | logger.info('距离下次常规更新还剩%s秒' % refresh_ttl) 105 | logger.info('%s秒后开始下次检测...' % LOOP_DELAY) 106 | 107 | if single_run: 108 | break 109 | time.sleep(LOOP_DELAY) 110 | 111 | def proxyCheck(single_run=False, fake=False): 112 | while True: 113 | logger.info('检查库存代理质量...') 114 | os.system(checkcmd) 115 | pcount = redis_db.get(PROXY_COUNT) 116 | if pcount: 117 | pcount = int(pcount) 118 | else: 119 | pcount = 0 120 | logger.info('检查完成,存活代理数%s..' 
% pcount) 121 | if single_run: 122 | break 123 | time.sleep(CHECK_INTERVAL) 124 | 125 | def main(): 126 | logger.info('启动进程中...') 127 | # reset 'protect' and 'refresh' tag 128 | redis_db.delete(PROXY_PROTECT) 129 | redis_db.setex(PROXY_REFRESH, REFRESH_SEC, True) 130 | # start proxy-check thread 131 | check_thd = Thread(target=proxyCheck) 132 | check_thd.daemon = True 133 | check_thd.start() 134 | # start proxy-fetch thread 135 | fetch_thd = Thread(target=proxyFetch) 136 | fetch_thd.daemon = True 137 | fetch_thd.start() 138 | 139 | while True: 140 | if not check_thd.is_alive(): 141 | logger.error('自检线程已挂..重启中..') 142 | check_thd.start() 143 | if not fetch_thd.is_alive(): 144 | logger.error('抓取线程已挂..重启中..') 145 | fetch_thd.start() 146 | time.sleep(60) 147 | 148 | class TestCases(unittest.TestCase): 149 | def test_proxyFetch(self): 150 | proxyFetch(True) 151 | def test_proxyCheck(self): 152 | proxyCheck(True) 153 | 154 | def test_proxyExhaust(self): 155 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 156 | redis_db.set(PROXY_COUNT, 0) 157 | proxyFetch(True, True) 158 | 159 | def test_proxyLow(self): 160 | redis_db.delete(PROXY_PROTECT) 161 | redis_db.set(PROXY_COUNT, 3) 162 | proxyFetch(True, True) 163 | 164 | def test_proxyLowProtect(self): 165 | redis_db.setex(PROXY_PROTECT, PROTECT_SEC, True) 166 | redis_db.set(PROXY_COUNT, 3) 167 | proxyFetch(True, True) 168 | 169 | def test_proxyRefresh(self): 170 | redis_db.delete(PROXY_REFRESH) 171 | redis_db.set(PROXY_COUNT, 10) 172 | proxyFetch(True, True) 173 | 174 | def loop(self): 175 | main() 176 | 177 | if __name__ == '__main__': 178 | main() -------------------------------------------------------------------------------- /proxy_spider/spiders/proxy_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import time 5 | from datetime import datetime 6 | from redis import StrictRedis 7 | import re 8 | import random 9 | import requests 10 | import yaml 11 | from scrapy import Spider, Request 12 | from scrapy.http import HtmlResponse 13 | from collections import defaultdict 14 | 15 | import logging 16 | from logging.handlers import RotatingFileHandler 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | class ProxyCheckSpider(Spider): 21 | ''' Spider to crawl free proxy servers for intern 22 | ''' 23 | name = 'proxy_check' 24 | 25 | def __init__(self, mode='prod', *args, **kwargs): 26 | if mode == 'prod': 27 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.yml' 28 | elif mode == 'test': 29 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.test.yml' 30 | with open(LOCAL_CONFIG_YAML, 'r') as f: 31 | LOCAL_CONFIG = yaml.load(f) 32 | 33 | self.redis_db = StrictRedis( 34 | host=LOCAL_CONFIG['REDIS_HOST'], 35 | port=LOCAL_CONFIG['REDIS_PORT'], 36 | password=LOCAL_CONFIG['REDIS_PASSWORD'], 37 | db=LOCAL_CONFIG['REDIS_DB'] 38 | ) 39 | 40 | self.validator_pool = set([]) 41 | for validator in LOCAL_CONFIG['PROXY_VALIDATORS']: 42 | self.validator_pool.add((validator['url'], validator['startstring'])) 43 | self.PROXY_COUNT = LOCAL_CONFIG['PROXY_COUNT'] 44 | self.PROXY_SET = LOCAL_CONFIG['PROXY_SET'] 45 | 46 | def start_requests(self): 47 | 48 | logger.info('测试代理池内代理质量...') 49 | self.redis_db.set(self.PROXY_COUNT, self.redis_db.scard(self.PROXY_SET)) 50 | for proxy in self.redis_db.smembers(self.PROXY_SET): 51 | proxy = proxy.decode('utf-8') 52 | vaurl, vastart = random.choice(list(self.validator_pool)) 53 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, 
callback=self.checkin, dont_filter=True) 54 | 55 | def checkin(self, response): 56 | res = response.body_as_unicode() 57 | if 'startstring' in response.meta and res.startswith(response.meta['startstring']): 58 | proxy = response.meta['proxy'] 59 | self.redis_db.sadd(self.PROXY_SET, proxy) 60 | logger.info('可用代理+1 %s' % proxy) 61 | yield None 62 | else: 63 | proxy = response.url if 'proxy' not in response.meta else response.meta['proxy'] 64 | self.redis_db.srem(self.PROXY_SET, proxy) 65 | logger.info('无效代理 %s' % proxy) 66 | yield None 67 | 68 | def closed(self, reason): 69 | pcount = self.redis_db.scard(self.PROXY_SET) 70 | logger.info('代理池测试完成,有效代理数: %s' % pcount) 71 | self.redis_db.set(self.PROXY_COUNT, pcount) 72 | 73 | class ProxyFetchSpider(Spider): 74 | name = 'proxy_fetch' 75 | loop_delay = 10 76 | protect_sec = 180 77 | 78 | def __init__(self, mode='prod', *args, **kwargs): 79 | if mode == 'prod': 80 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.yml' 81 | elif mode == 'test': 82 | LOCAL_CONFIG_YAML = '/etc/hq-proxies.test.yml' 83 | with open(LOCAL_CONFIG_YAML, 'r') as f: 84 | LOCAL_CONFIG = yaml.load(f) 85 | 86 | self.redis_db = StrictRedis( 87 | host=LOCAL_CONFIG['REDIS_HOST'], 88 | port=LOCAL_CONFIG['REDIS_PORT'], 89 | password=LOCAL_CONFIG['REDIS_PASSWORD'], 90 | db=LOCAL_CONFIG['REDIS_DB'] 91 | ) 92 | self.PROXY_COUNT = LOCAL_CONFIG['PROXY_COUNT'] 93 | self.PROXY_SET = LOCAL_CONFIG['PROXY_SET'] 94 | 95 | self.validator_pool = set([]) 96 | for validator in LOCAL_CONFIG['PROXY_VALIDATORS']: 97 | self.validator_pool.add((validator['url'], validator['startstring'])) 98 | 99 | self.vendors = LOCAL_CONFIG['PROXY_VENDORS'] 100 | 101 | def start_requests(self): 102 | for vendor in self.vendors: 103 | logger.debug(vendor) 104 | callback = getattr(self, vendor['parser']) 105 | yield Request(url=vendor['url'], callback=callback) 106 | 107 | def checkin(self, response): 108 | res = response.body_as_unicode() 109 | if 'startstring' in response.meta and res.startswith(response.meta['startstring']): 110 | proxy = response.meta['proxy'] 111 | self.redis_db.sadd(self.PROXY_SET, proxy) 112 | logger.info('可用代理+1 %s' % proxy) 113 | yield None 114 | else: 115 | proxy = response.url if 'proxy' not in response.meta else response.meta['proxy'] 116 | logger.info('无效代理 %s' % proxy) 117 | yield None 118 | 119 | def parse_xici(self, response): 120 | ''' 121 | @url http://www.xicidaili.com/nn/ 122 | ''' 123 | logger.info('解析http://www.xicidaili.com/nn/') 124 | succ = 0 125 | fail = 0 126 | count = 0 127 | for tr in response.css('#ip_list tr'): 128 | td_list = tr.css('td::text') 129 | if len(td_list) < 3: 130 | continue 131 | ipaddr = td_list[0].extract() 132 | port = td_list[1].extract() 133 | proto = td_list[5].extract() 134 | latency = tr.css('div.bar::attr(title)').extract_first() 135 | latency = re.match('(\d+\.\d+)秒', latency).group(1) 136 | proxy = '%s://%s:%s' % (proto, ipaddr, port) 137 | proxies = {proto: '%s:%s' % (ipaddr, port)} 138 | if float(latency) > 3: 139 | logger.info('丢弃慢速代理: %s 延迟%s秒' % (proxy, latency)) 140 | continue 141 | logger.info('验证: %s' % proxy) 142 | if not self.redis_db.sismember(self.PROXY_SET, proxy): 143 | vaurl, vastart = random.choice(list(self.validator_pool)) 144 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True) 145 | else: 146 | logger.info('该代理已收录..') 147 | 148 | def parse_66ip(self, response): 149 | ''' 150 | @url 
http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip 151 | ''' 152 | logger.info('开始爬取66ip') 153 | if 'proxy' in response.meta: 154 | logger.info('=>使用代理%s' % response.meta['proxy']) 155 | res = response.body_as_unicode() 156 | for addr in re.findall('\d+\.\d+\.\d+\.\d+\:\d+', res): 157 | proxy = 'http://' + addr 158 | print(proxy) 159 | logger.info('验证: %s' % proxy) 160 | if not self.redis_db.sismember(self.PROXY_SET, proxy): 161 | vaurl, vastart = random.choice(list(self.validator_pool)) 162 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True) 163 | else: 164 | logger.info('该代理已收录..') 165 | 166 | def parse_ip181(self, response): 167 | ''' 168 | @url http://www.ip181.com/ 169 | ''' 170 | logger.info('开始爬取ip181') 171 | if 'proxy' in response.meta: 172 | logger.info('=>使用代理%s' % response.meta['proxy']) 173 | for tr in response.css('table tbody tr'): 174 | ip = tr.css('td::text').extract()[0] 175 | port = tr.css('td::text').extract()[1] 176 | type = tr.css('td::text').extract()[2] 177 | proxy = 'http://%s:%s' % (ip, port) 178 | if type != '高匿': 179 | logger.info('丢弃非高匿代理:%s' % proxy) 180 | continue 181 | logger.info('验证: %s' % proxy) 182 | if not self.redis_db.sismember(self.PROXY_SET, proxy): 183 | vaurl, vastart = random.choice(list(self.validator_pool)) 184 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True) 185 | else: 186 | logger.info('该代理已收录..') 187 | 188 | def parse_kxdaili(self, response): 189 | ''' 190 | @url http://www.kxdaili.com/dailiip/1/1.html#ip 191 | ''' 192 | logger.info('开始爬取kxdaili') 193 | if 'proxy' in response.meta: 194 | logger.info('=>使用代理%s' % response.meta['proxy']) 195 | url_pattern = 'http://www.kxdaili.com/dailiip/1/%s.html#ip' 196 | try: 197 | page = re.search('(\d)+\.html', response.url).group(1) 198 | page = int(page) 199 | except Exception as e: 200 | logger.exception(e) 201 | logger.error(response.url) 202 | for tr in response.css('table.ui.table.segment tbody tr'): 203 | ip = tr.css('td::text').extract()[0] 204 | port = tr.css('td::text').extract()[1] 205 | proxy = 'http://%s:%s' % (ip, port) 206 | logger.info('验证: %s' % proxy) 207 | if not self.redis_db.sismember(self.PROXY_SET, proxy): 208 | vaurl, vastart = random.choice(list(self.validator_pool)) 209 | yield Request(url=vaurl, meta={'proxy': proxy, 'startstring': vastart}, callback=self.checkin, dont_filter=True) 210 | else: 211 | logger.info('该代理已收录..') 212 | if page < 3: # 爬取前3页 213 | page += 1 214 | new_url = url_pattern % page 215 | new_meta = response.meta.copy() 216 | new_meta['page'] = page 217 | yield Request(url=new_url, meta=new_meta, callback=self.parse_kxdaili) 218 | 219 | def closed(self, reason): 220 | logger.info('代理池更新完成,有效代理数: %s' % self.redis_db.scard(self.PROXY_SET)) 221 | 222 | --------------------------------------------------------------------------------
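The middleware example in README.md covers use inside Scrapy. As a complement, below is a minimal, hypothetical sketch of how a non-Scrapy client could draw a proxy from the same Redis set and use it with `requests` (already pinned in requirements.txt). The config path and key names come from hq-proxies.yml; the target URL and the empty-pool handling are illustrative assumptions, not part of the project.

```python
# Hypothetical standalone client -- not part of the repo.
# Assumes /etc/hq-proxies.yml is filled in and proxy_fetch has populated the pool.
import yaml
import requests
from redis import StrictRedis

with open('/etc/hq-proxies.yml', 'r') as f:
    cfg = yaml.load(f)  # PyYAML 3.12, as pinned in requirements.txt

redis_db = StrictRedis(
    host=cfg['REDIS_HOST'],
    port=cfg['REDIS_PORT'],
    password=cfg['REDIS_PASSWORD'],
    db=cfg['REDIS_DB'],
)

# srandmember returns a random member as bytes, or None when the set is empty
proxy = redis_db.srandmember(cfg['PROXY_SET'])
if proxy is None:
    raise RuntimeError('proxy pool is empty -- wait for proxy_fetch to run')
proxy = proxy.decode('utf-8')  # proxies are stored as e.g. 'http://1.2.3.4:8080'

# Route a one-off request through the proxy; free proxies die quickly,
# so on failure simply draw another proxy and retry.
resp = requests.get('http://www.example.com/', proxies={'http': proxy}, timeout=5)
print(resp.status_code)
```

Inside Scrapy the same lookup happens in the README's DynamicProxyMiddleware, which sets request.meta['proxy'] instead of building a proxies dict.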