├── README.md
├── __init__.py
├── proxy_provider.txt
├── proxypool
│   ├── __init__.py
│   ├── crawler.py
│   ├── db.py
│   ├── error.py
│   ├── getter.py
│   ├── importer.py
│   ├── scheduler.py
│   ├── setting.py
│   ├── tester.py
│   └── utils.py
├── requirements.txt
└── run.py

/README.md:
--------------------------------------------------------------------------------
# Proxy Pool - Module Guide

##### Most of our crawling work targets e-commerce, industry data, and news/information sites. To defend against scraping, these sites usually apply basic anti-crawling measures, the most common being IP blocking and rate limiting.
##### To make crawling such sites easier, this IP pool module was built for everyone to use.

* Project structure:
```
proxypool
│   __init__.py
│   proxy_provider.txt
│   requirements.txt
│   run.py
│   README.md
└───proxypool
    │   __init__.py
    │   crawler.py
    │   db.py
    │   error.py
    │   getter.py
    │   importer.py
    │   scheduler.py
    │   setting.py
    │   tester.py
    │   utils.py
```

* Calling the IP proxy pool

  * Call `process_request` of `Get_Proxy()` to get a random proxy with a high score
```python
import logging
import redis
from random import choice

class Get_Proxy():
    def __init__(self, host, port):
        self.logger = logging.getLogger(__name__)
        self.REDIS_KEY = 'proxies'  # sorted-set key that stores the proxies
        self.MAX_SCORE = 100
        # connect to Redis
        pool = redis.ConnectionPool(host=host, port=port)
        self.db = redis.StrictRedis(connection_pool=pool)

    def get_random_proxy(self):
        # fetch proxies from Redis, preferring those with the highest score
        result = self.db.zrangebyscore(self.REDIS_KEY, self.MAX_SCORE, self.MAX_SCORE)
        if len(result):
            return choice(result)
        else:
            result = self.db.zrevrange(self.REDIS_KEY, 0, 100)
            if len(result):
                return choice(result)
            else:
                raise EOFError  # the pool is empty

    def process_request(self):
        proxy = self.get_random_proxy()
        if proxy:
            proxy_uri = 'http://{proxy}'.format(proxy=proxy.decode('utf-8'))
            self.logger.debug("Using proxy: " + proxy_uri)
            return proxy_uri
```
* Module highlights
  * A metaclass registers the crawl methods automatically [crawler.py]
  * Because the metaclass picks up methods whose names contain `crawl_`, define any new fetcher as `crawl_XX` [crawler.py]

```python
class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)
```

```python
class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        for proxy in eval("self.{}()".format(callback)):
            print('Got proxy', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_XX(self):
        # implement the fetch logic here and yield proxies as 'ip:port' strings
        ...
```

* Add test sites (test against whichever site you plan to crawl) [setting.py]

```python
# Test URLs: test against whichever site you plan to crawl
TEST_URL = [
    'https://tech.china.com/',
]
```

* Proxy sources that are crawled [proxy_provider.txt]


```
Proxy sources:
https://proxy.mimvp.com/free.php?proxy=in_hp
http://www.coobobo.com/free-http-proxy
http://ip.zdaye.com/
http://www.mayidaili.com/free/anonymous/%E9%AB%98%E5%8C%BF
http://http.taiyangruanjian.com/
http://http.zhimaruanjian.com/
http://ip.jiangxianli.com

66代理
云代理
快代理
西刺代理
无忧代理
免费IP代理
```
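
* Example usage (a minimal sketch, not part of the module: it reuses the `Get_Proxy` class shown above and assumes a local Redis at `127.0.0.1:6379` that `run.py` has already populated; the `fetch_with_proxy` helper name is only illustrative)

```python
import requests

# Point the pool reader at the local Redis instance (assumed host/port)
proxy_pool = Get_Proxy(host='127.0.0.1', port=6379)

def fetch_with_proxy(url):
    # Ask the pool for a random high-score proxy, e.g. 'http://1.2.3.4:8080'
    proxy_uri = proxy_pool.process_request()
    # Route both http and https traffic through that proxy
    return requests.get(url, proxies={'http': proxy_uri, 'https': proxy_uri}, timeout=10)

response = fetch_with_proxy('https://tech.china.com/')
print(response.status_code)
```

  In a Scrapy project the same `process_request()` call can be dropped into a downloader middleware by assigning the returned URI to `request.meta['proxy']`.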
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdcrgb/proxypool/2ba6c1c577722868b457dd4c56facdf2ecf3af73/__init__.py
--------------------------------------------------------------------------------
/proxy_provider.txt:
--------------------------------------------------------------------------------
Proxy sources:
https://proxy.mimvp.com/free.php?proxy=in_hp
http://www.coobobo.com/free-http-proxy
http://ip.zdaye.com/
http://www.mayidaili.com/free/anonymous/%E9%AB%98%E5%8C%BF
http://http.taiyangruanjian.com/
http://http.zhimaruanjian.com/
http://ip.jiangxianli.com

66代理
云代理
快代理
西刺代理
无忧代理
免费IP代理
--------------------------------------------------------------------------------
/proxypool/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdcrgb/proxypool/2ba6c1c577722868b457dd4c56facdf2ecf3af73/proxypool/__init__.py
--------------------------------------------------------------------------------
/proxypool/crawler.py:
--------------------------------------------------------------------------------
import json
import re
from .utils import get_page
from pyquery import PyQuery as pq


class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            # register every method whose name contains 'crawl_'
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        for proxy in eval("self.{}()".format(callback)):
            print('Got proxy', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_daili66(self, page_count=4):
        """
        Fetch proxies from daili66 (66ip.cn).
        :param page_count: number of pages to crawl
        :return: proxies as 'ip:port'
        """
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    def crawl_ip3366(self):
        for page in range(1, 4):
            start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
            html = get_page(start_url)
            if html:
                # \s* matches the whitespace between cells, so the pattern spans line breaks
                ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')

    def crawl_kuaidaili(self):
        for i in range(1, 4):
            start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
            html = get_page(start_url)
            if html:
                ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                port = re.compile('<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(html)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

    def crawl_xicidaili(self):
        for i in range(1, 3):
            start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
                'Host': 'www.xicidaili.com',
                'Referer': 'http://www.xicidaili.com/nn/3',
                'Upgrade-Insecure-Requests': '1',
            }
            html = get_page(start_url, options=headers)
            if html:
                find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(html)
                for tr in trs:
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')

    # NOTE: this redefines crawl_ip3366 and therefore replaces the earlier method of the same name
    def crawl_ip3366(self):
        for i in range(1, 4):
            start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
            html = get_page(start_url)
            if html:
                find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(html)
                for s in range(1, len(trs)):
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')

    def crawl_iphai(self):
        start_url = 'http://www.iphai.com/'
        html = get_page(start_url)
        if html:
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html)
            for s in range(1, len(trs)):
                find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

    def crawl_data5u(self):
        start_url = 'http://www.data5u.com/free/gngn/index.shtml'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = get_page(start_url, options=headers)
        if html:
            # the port <li> carries extra class text, hence the non-greedy match after "port"
            ip_address = re.compile('<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')

    # Free IP proxy list (ip.jiangxianli.com)
    def crawl_jiangxianli(self):
        start_url = 'http://ip.jiangxianli.com'
        html = get_page(start_url)
        if html:
            ip_address = re.compile('\