├── README.md
├── __init__.py
├── proxy_provider.txt
├── proxypool
│   ├── __init__.py
│   ├── crawler.py
│   ├── db.py
│   ├── error.py
│   ├── getter.py
│   ├── importer.py
│   ├── scheduler.py
│   ├── setting.py
│   ├── tester.py
│   └── utils.py
├── requirements.txt
└── run.py
/README.md:
--------------------------------------------------------------------------------
1 | # Proxy Pool - Module Guide
2 |
3 | ##### Most of our crawling work targets e-commerce sites, industry data, and news/information sources. To defend against scraping, the target sites usually deploy basic anti-crawling measures, the most common of which is IP-based blocking.
4 | ##### To make crawling those sites easier, this IP pool module was built for everyone to use.
5 |
6 | * Project structure:
7 | ```
8 | proxypool
9 | │ __init__.py
10 | │ proxy_provider.txt
11 | │ requirements.txt
12 | │ run.py
13 | │ README.md
14 | └───proxypool
15 | │ __init__.py
16 | │ crawler.py
17 | │ db.py
18 | │ error.py
19 | │ getter.py
20 | │ importer.py
21 | │ scheduler.py
22 | │ setting.py
23 | │ tester.py
24 | │ utils.py
25 | │
26 |
27 | ```
28 |
29 | * Using the IP proxy pool
30 |
31 |     * Call process_request() on Get_Proxy to fetch a random proxy with a high score (a usage sketch follows the class below)
32 | ```python
33 | import logging
34 | import redis
35 | from random import choice
36 |
37 | class Get_Proxy():
38 |     def __init__(self, host, port):
39 |         self.logger = logging.getLogger(__name__)
40 |         self.REDIS_KEY = 'proxies'  # Redis sorted-set key that stores the proxies
41 |         self.MAX_SCORE = 100
42 |         # connect to Redis
43 |         pool = redis.ConnectionPool(host=host, port=port)
44 |         self.db = redis.StrictRedis(connection_pool=pool)
45 |
46 |     def get_random_proxy(self):
47 |         # fetch a proxy from Redis, preferring proxies with the maximum score
48 |         result = self.db.zrangebyscore(self.REDIS_KEY, self.MAX_SCORE, self.MAX_SCORE)
49 |         if len(result):
50 |             return choice(result)
51 |         else:
52 |             result = self.db.zrevrange(self.REDIS_KEY, 0, 100)
53 |             if len(result):
54 |                 return choice(result)
55 |             else:
56 |                 raise EOFError  # the pool is empty
57 |     def process_request(self):
58 |         proxy = self.get_random_proxy()
59 |         if proxy:
60 |             proxy_uri = 'http://{proxy}'.format(proxy=proxy.decode('utf-8'))
61 |             self.logger.debug('Using proxy: ' + proxy_uri)
62 |             return proxy_uri
63 | ```
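* For example, fetching a proxy and using it for a request could look like the sketch below. This is a minimal illustration; the Redis host/port and the requests call are assumptions, adjust them to your deployment.
```python
import requests

# Hypothetical usage sketch -- host/port are placeholders for your Redis instance.
proxy_pool = Get_Proxy(host='127.0.0.1', port=6379)
proxy_uri = proxy_pool.process_request()  # e.g. 'http://1.2.3.4:8888'

resp = requests.get('https://tech.china.com/',
                    proxies={'http': proxy_uri, 'https': proxy_uri},
                    timeout=10)
print(resp.status_code)
```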
64 | * Highlights worth sharing
65 |     * A metaclass is used to auto-register the crawl methods so they can be invoked automatically [crawler.py] (see the toy illustration after the snippet below)
66 |     * Because the metaclass picks up methods whose names contain crawl_, extend the pool by defining new methods named crawl_XX [crawler.py] (a hypothetical example follows the Crawler snippet below)
67 |
68 | ```python
69 | class ProxyMetaclass(type):
70 |     def __new__(cls, name, bases, attrs):
71 |         count = 0
72 |         attrs['__CrawlFunc__'] = []
73 |         for k, v in attrs.items():
74 |             if 'crawl_' in k:
75 |                 attrs['__CrawlFunc__'].append(k)
76 |                 count += 1
77 |         attrs['__CrawlFuncCount__'] = count
78 |         return type.__new__(cls, name, bases, attrs)
79 | ```
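* In effect, any class built with this metaclass ends up with a __CrawlFunc__ list of its crawl_* method names plus a __CrawlFuncCount__ counter, so callers can discover every crawl method without listing them by hand. A toy illustration (the class and method names here are made up):
```python
class Demo(object, metaclass=ProxyMetaclass):
    def crawl_a(self): ...
    def crawl_b(self): ...
    def other(self): ...

print(Demo.__CrawlFunc__)       # ['crawl_a', 'crawl_b']
print(Demo.__CrawlFuncCount__)  # 2
```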
80 |
81 | ```python
82 | class Crawler(object, metaclass=ProxyMetaclass):
83 |     def get_proxies(self, callback):
84 |         proxies = []
85 |         for proxy in eval("self.{}()".format(callback)):
86 |             print('Got proxy', proxy)
87 |             proxies.append(proxy)
88 |         return proxies
89 |
90 |     def crawl_XX(self):
91 |         # fetch your source and yield proxies as 'ip:port' strings
92 |         ...
93 | ```
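* As a hypothetical example of such an extension, a source that serves a plain-text list of ip:port lines could be added inside the Crawler class like this. The method name and URL are placeholders, not a real provider; get_page is the helper already imported in crawler.py.
```python
    # Hypothetical crawl_ method -- paste it inside the Crawler class in crawler.py.
    # The URL below is a placeholder for a provider that returns one 'ip:port' per line.
    def crawl_plaintext_example(self):
        html = get_page('http://example.com/proxies.txt')
        if html:
            for line in html.splitlines():
                line = line.strip()
                if line:
                    yield line  # expected format: 'ip:port'
```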
94 |
95 | * Add a test site (whichever site you plan to crawl is the one you should add as the test target) [setting.py]; an illustrative stand-alone check follows the snippet below
96 |
97 | ```python
98 | # Test URLs: test against whichever site you plan to crawl
99 | TEST_URL = [
100 |     'https://tech.china.com/',
101 | ]
102 | ```
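* The idea is that a proxy only counts as usable if it can actually fetch the site you intend to crawl. A stand-alone check along those lines might look like the sketch below (an illustrative example using requests, not the project's tester.py implementation):
```python
import requests

TEST_URL = [
    'https://tech.china.com/',
]

def is_proxy_usable(proxy, timeout=10):
    """Return True if the proxy can fetch every TEST_URL without an error status."""
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        for url in TEST_URL:
            resp = requests.get(url, proxies=proxies, timeout=timeout)
            if resp.status_code >= 400:
                return False
        return True
    except requests.RequestException:
        return False

# e.g. is_proxy_usable('1.2.3.4:8888')
```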
103 |
104 | * Proxy sources that are crawled [proxy_provider.txt]
105 |
106 |
107 | ```
108 | Proxy sources:
109 | https://proxy.mimvp.com/free.php?proxy=in_hp
110 | http://www.coobobo.com/free-http-proxy
111 | http://ip.zdaye.com/
112 | http://www.mayidaili.com/free/anonymous/%E9%AB%98%E5%8C%BF
113 | http://http.taiyangruanjian.com/
114 | http://http.zhimaruanjian.com/
115 | http://ip.jiangxianli.com
116 |
117 | 66代理
118 | 云代理
119 | 快代理
120 | 西刺代理
121 | 无忧代理
122 | 免费IP代理
123 | ```
124 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdcrgb/proxypool/2ba6c1c577722868b457dd4c56facdf2ecf3af73/__init__.py
--------------------------------------------------------------------------------
/proxy_provider.txt:
--------------------------------------------------------------------------------
1 | Proxy sources:
2 | https://proxy.mimvp.com/free.php?proxy=in_hp
3 | http://www.coobobo.com/free-http-proxy
4 | http://ip.zdaye.com/
5 | http://www.mayidaili.com/free/anonymous/%E9%AB%98%E5%8C%BF
6 | http://http.taiyangruanjian.com/
7 | http://http.zhimaruanjian.com/
8 | http://ip.jiangxianli.com
9 |
10 | 66代理
11 | 云代理
12 | 快代理
13 | 西刺代理
14 | 无忧代理
15 | 免费IP代理
--------------------------------------------------------------------------------
/proxypool/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdcrgb/proxypool/2ba6c1c577722868b457dd4c56facdf2ecf3af73/proxypool/__init__.py
--------------------------------------------------------------------------------
/proxypool/crawler.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | from .utils import get_page
4 | from pyquery import PyQuery as pq
5 |
6 |
7 | class ProxyMetaclass(type):
8 |     def __new__(cls, name, bases, attrs):
9 |         count = 0
10 |         attrs['__CrawlFunc__'] = []
11 |         for k, v in attrs.items():
12 |             if 'crawl_' in k:
13 |                 attrs['__CrawlFunc__'].append(k)
14 |                 count += 1
15 |         attrs['__CrawlFuncCount__'] = count
16 |         return type.__new__(cls, name, bases, attrs)
17 |
18 |
19 | class Crawler(object, metaclass=ProxyMetaclass):
20 |     def get_proxies(self, callback):
21 |         proxies = []
22 |         for proxy in eval("self.{}()".format(callback)):
23 |             print('Got proxy', proxy)
24 |             proxies.append(proxy)
25 |         return proxies
26 |
27 |     def crawl_daili66(self, page_count=4):
28 |         """
29 |         Crawl proxies from daili66 (www.66ip.cn).
30 |         :param page_count: number of pages to crawl
31 |         :return: proxies
32 |         """
33 |         start_url = 'http://www.66ip.cn/{}.html'
34 |         urls = [start_url.format(page) for page in range(1, page_count + 1)]
35 |         for url in urls:
36 |             print('Crawling', url)
37 |             html = get_page(url)
38 |             if html:
39 |                 doc = pq(html)
40 |                 trs = doc('.containerbox table tr:gt(0)').items()
41 |                 for tr in trs:
42 |                     ip = tr.find('td:nth-child(1)').text()
43 |                     port = tr.find('td:nth-child(2)').text()
44 |                     yield ':'.join([ip, port])
45 |
46 |     def crawl_ip3366(self):
47 |         for page in range(1, 4):
48 |             start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
49 |             html = get_page(start_url)
50 |             ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
51 |             # \s* matches the whitespace (line breaks) between the table cells
52 |             re_ip_address = ip_address.findall(html)
53 |             for address, port in re_ip_address:
54 |                 result = address + ':' + port
55 |                 yield result.replace(' ', '')
56 |
57 |     def crawl_kuaidaili(self):
58 |         for i in range(1, 4):
59 |             start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
60 |             html = get_page(start_url)
61 |             if html:
62 |                 ip_address = re.compile('<td data-title="IP">(.*?)</td>')
63 |                 re_ip_address = ip_address.findall(html)
64 |                 port = re.compile('<td data-title="PORT">(.*?)</td>')
65 |                 re_port = port.findall(html)
66 |                 for address, port in zip(re_ip_address, re_port):
67 |                     address_port = address + ':' + port
68 |                     yield address_port.replace(' ', '')
69 |
70 |     def crawl_xicidaili(self):
71 |         for i in range(1, 3):
72 |             start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
73 |             headers = {
74 |                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
75 |                 'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
76 |                 'Host': 'www.xicidaili.com',
77 |                 'Referer': 'http://www.xicidaili.com/nn/3',
78 |                 'Upgrade-Insecure-Requests': '1',
79 |             }
80 |             html = get_page(start_url, options=headers)
81 |             if html:
82 |                 find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
83 |                 trs = find_trs.findall(html)
84 |                 for tr in trs:
85 |                     find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
86 |                     re_ip_address = find_ip.findall(tr)
87 |                     find_port = re.compile('<td>(\d+)</td>')
88 |                     re_port = find_port.findall(tr)
89 |                     for address, port in zip(re_ip_address, re_port):
90 |                         address_port = address + ':' + port
91 |                         yield address_port.replace(' ', '')
92 |
93 |     def crawl_ip3366(self):  # NOTE: redefines crawl_ip3366 above; only this definition is registered by the metaclass
94 |         for i in range(1, 4):
95 |             start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
96 |             html = get_page(start_url)
97 |             if html:
98 |                 find_tr = re.compile('<tr>(.*?)</tr>', re.S)
99 |                 trs = find_tr.findall(html)
100 |                 for s in range(1, len(trs)):
101 |                     find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
102 |                     re_ip_address = find_ip.findall(trs[s])
103 |                     find_port = re.compile('<td>(\d+)</td>')
104 |                     re_port = find_port.findall(trs[s])
105 |                     for address, port in zip(re_ip_address, re_port):
106 |                         address_port = address + ':' + port
107 |                         yield address_port.replace(' ', '')
108 |
109 |     def crawl_iphai(self):
110 |         start_url = 'http://www.iphai.com/'
111 |         html = get_page(start_url)
112 |         if html:
113 |             find_tr = re.compile('<tr>(.*?)</tr>', re.S)
114 |             trs = find_tr.findall(html)
115 |             for s in range(1, len(trs)):
116 |                 find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
117 |                 re_ip_address = find_ip.findall(trs[s])
118 |                 find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
119 |                 re_port = find_port.findall(trs[s])
120 |                 for address, port in zip(re_ip_address, re_port):
121 |                     address_port = address + ':' + port
122 |                     yield address_port.replace(' ', '')
123 |
124 |     def crawl_data5u(self):
125 |         start_url = 'http://www.data5u.com/free/gngn/index.shtml'
126 |         headers = {
127 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
128 |             'Accept-Encoding': 'gzip, deflate',
129 |             'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
130 |             'Cache-Control': 'max-age=0',
131 |             'Connection': 'keep-alive',
132 |             'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
133 |             'Host': 'www.data5u.com',
134 |             'Referer': 'http://www.data5u.com/free/index.shtml',
135 |             'Upgrade-Insecure-Requests': '1',
136 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
137 |         }
138 |         html = get_page(start_url, options=headers)
139 |         if html:
140 |             ip_address = re.compile('<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
141 |             re_ip_address = ip_address.findall(html)
142 |             for address, port in re_ip_address:
143 |                 result = address + ':' + port
144 |                 yield result.replace(' ', '')
145 |
146 |     # Free IP proxy library (ip.jiangxianli.com)
147 |     def crawl_jiangxianli(self):
148 |         start_url = 'http://ip.jiangxianli.com'
149 |         html = get_page(start_url)
150 |         if html:
151 |             ip_address = re.compile('\