├── .idea
│   ├── boss_scrapy.iml
│   └── modules.xml
├── README.md
├── boss_scrapy
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── boss_spider.py
│   └── try_to_getProxy.py
├── boss_start.py
└── scrapy.cfg
/.idea/boss_scrapy.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# boss_scrapy
Scrapes job listings from BOSS直聘 (zhipin.com) through proxy IPs.

Points to note:
1. In settings.py, fill in your own Proxy-Authorization and cookie values in DEFAULT_REQUEST_HEADERS.
2. Create a MySQL database named boss first, then create a table named company_detail inside it.
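A minimal sketch of that setup as a one-off script. The column names are taken from pipelines.py; the column types and this helper itself are assumptions, not part of the project:

```python
# Hypothetical setup script for the boss database and company_detail table.
import pymysql

client = pymysql.connect(host='localhost', port=3306, user='root',
                         password='123456', charset='utf8')
cur = client.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS boss DEFAULT CHARACTER SET utf8')
cur.execute('''
    CREATE TABLE IF NOT EXISTS boss.company_detail (
        `工作名称` VARCHAR(255), `工作城市` VARCHAR(255), `工作经验` VARCHAR(255),
        `学历要求` VARCHAR(255), `公司名字` VARCHAR(255), `公司所在地` VARCHAR(255),
        `公司介绍` TEXT, `职位描述` TEXT, `团队介绍` TEXT, `工商信息` TEXT
    ) DEFAULT CHARACTER SET utf8
''')
client.close()
```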
--------------------------------------------------------------------------------
/boss_scrapy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weakmaple/boss_scrapy/7877d00562171a2cd422a981cdaaeb916395df9d/boss_scrapy/__init__.py
--------------------------------------------------------------------------------
/boss_scrapy/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BossScrapyItem(scrapy.Item):
    # job title, city, experience required, education required,
    # company name, company location
    job_name = scrapy.Field()
    job_city = scrapy.Field()
    job_experience = scrapy.Field()
    job_education = scrapy.Field()
    company_name = scrapy.Field()
    loc_job = scrapy.Field()
    # company introduction, job description, team introduction,
    # business registration information
    company_describe_detail = scrapy.Field()
    job_describe_detail = scrapy.Field()
    team_describe_detail = scrapy.Field()
    business_information = scrapy.Field()
--------------------------------------------------------------------------------
/boss_scrapy/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random
import json

import requests
from twisted.internet.defer import DeferredLock

from boss_scrapy.settings import DEFAULT_REQUEST_HEADERS, USER_AGENT_LIST
from boss_scrapy.try_to_getProxy import ProxyModel


class RandomProxy(object):

    def __init__(self):
        self.current_proxy = None
        self.lock = DeferredLock()

    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = user_agent

        if 'proxy' not in request.meta or self.current_proxy is None or self.current_proxy.is_expiring:
            # Request a (new) proxy and attach it to this request.
            self.update_proxy()
            request.meta['proxy'] = self.current_proxy.proxy

    def process_response(self, request, response, spider):
        # If the site redirects (302) to a captcha page, swap out the proxy.
        # The captcha page sometimes comes back with status 200, so
        # 'captcha' in response.url is used as an extra tell.
        if response.status != 200 or 'captcha' in response.url:
            # Reaching this branch means BOSS直聘 has identified the request
            # as coming from a crawler, so nothing useful was fetched.
            # Return the request so it re-enters the scheduler and is sent
            # again later.
            if not self.current_proxy.blacked:
                print('proxy %s no longer works' % self.current_proxy.proxy)
                self.current_proxy.blacked = True
                self.update_proxy()
            request.meta['proxy'] = self.current_proxy.proxy
            # Mark the retry so the duplicate filter does not drop a request
            # whose fingerprint has already been seen.
            request.dont_filter = True
            return request

        # On the happy path, remember to return the response; otherwise it
        # never reaches the spider and never gets parsed.
        return response

    def update_proxy(self):
        # Scrapy downloads asynchronously (think of it as multithreaded), so
        # when the current proxy gets banned, several in-flight requests may
        # land here at once. Without a lock each of them would fetch (and pay
        # for) its own fresh proxy IP. With the lock, only one caller fetches
        # a new proxy; since that proxy is shared, the others simply reuse it,
        # which avoids wasting proxy IPs (money).
        self.lock.acquire()
        # Swap the proxy when any of the following holds:
        # 1. no proxy is in use yet
        # 2. the current proxy is about to expire
        # 3. the current proxy has been banned by the site
        if not self.current_proxy or self.current_proxy.is_expiring or self.current_proxy.blacked:
            url = r'https://h.wandouip.com/get/ip-list?pack=%s&num=1&xy=1&type=2&lb=\r\n&mr=1&' % random.randint(100, 1000)
            response = requests.get(url=url, headers=DEFAULT_REQUEST_HEADERS)
            text = json.loads(response.text)
            print(text)
            data = text['data'][0]
            proxy_model = ProxyModel(data)
            print('fetched a new proxy: %s' % proxy_model.proxy)
            self.current_proxy = proxy_model
        self.lock.release()
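
# For reference, the wandouip API response consumed above is assumed to look
# roughly like this (only the ip / port / expire_time fields are used, via
# text['data'][0] and ProxyModel):
#   {"code": 200, "data": [{"ip": "1.2.3.4", "port": 8888,
#                           "expire_time": "2019-01-01 12:00:00"}]}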
--------------------------------------------------------------------------------
/boss_scrapy/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class BossScrapyPipeline(object):

    def __init__(self):
        self.client = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='123456',
            database='boss',
            charset='utf8'
        )
        self.cur = self.client.cursor()

    def process_item(self, item, spider):
        # job title, city, experience, education, company name, job location
        job_name = item['job_name']
        job_city = item['job_city']
        job_experience = item['job_experience']
        job_education = item['job_education']
        company_name = item['company_name']
        loc_job = item['loc_job']
        # company introduction, job description, team introduction,
        # business registration information
        company_describe_detail = item['company_describe_detail']
        job_describe_detail = item['job_describe_detail']
        team_describe_detail = item['team_describe_detail']
        business_information = item['business_information']

        lis = [
            job_name, job_city, job_experience, job_education, company_name, loc_job,
            company_describe_detail, job_describe_detail, team_describe_detail,
            business_information
        ]

        # The Chinese column names must match the company_detail table
        # described in the README.
        sql = 'insert into company_detail(工作名称,工作城市,工作经验,学历要求,' \
              '公司名字,公司所在地,公司介绍,职位描述,团队介绍,工商信息) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        self.cur.execute(sql, lis)
        self.client.commit()

        return item
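
    # Hedged addition: Scrapy's standard close_spider hook, used here to
    # release the cursor and MySQL connection when the crawl finishes.
    def close_spider(self, spider):
        self.cur.close()
        self.client.close()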
--------------------------------------------------------------------------------
/boss_scrapy/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for boss_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'boss_scrapy'

SPIDER_MODULES = ['boss_scrapy.spiders']
NEWSPIDER_MODULE = 'boss_scrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'boss_scrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # Proxy-Authorization must be the base64 string generated from your own
    # proxy account's username and password.
    'Proxy-Authorization': 'Basic xxxxxxxxxxxxxxxxx',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # cookie must be the cookie taken from the proxy provider's page.
    'cookie': 'advanced-frontend=xxxxxxxxxxxxxxxxx',
    'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
}
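# One hypothetical way to build the Proxy-Authorization value above, assuming
# the proxy service uses plain HTTP Basic auth:
#   import base64
#   'Basic ' + base64.b64encode(b'username:password').decode()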

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'boss_scrapy.middlewares.randomIPMiddleware': 100,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'boss_scrapy.middlewares.RandomProxy': 100,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'boss_scrapy.pipelines.BossScrapyPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

USER_AGENT_LIST = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]
--------------------------------------------------------------------------------
/boss_scrapy/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/boss_scrapy/spiders/boss_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from boss_scrapy.items import BossScrapyItem


class BossSpiderSpider(CrawlSpider):
    name = 'boss_spider'
    # allowed_domains = ['zhipin.com']
    # start_urls = ['http://httpbin.org/']
    start_urls = ['https://www.zhipin.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=1&ka=page-1']

    rules = (
        # Follow the paginated listing pages. \d+ matches the page number;
        # the original pattern used %d, which is not valid regex syntax and
        # would never match.
        Rule(LinkExtractor(allow=r'.+page=\d+&ka=page-\d+'), follow=True),
        # Job detail pages are the ones actually parsed.
        Rule(LinkExtractor(allow=r'.+/job_detail/.+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        job_name = response.xpath('//div[@class="name"]/h1/text()').get()
        # The info line lists city, experience and education, in that order.
        job_detail = response.xpath('//div[@class="info-primary"]/p//text()').getall()
        job_city = job_detail[0]
        job_experience = job_detail[1]
        job_education = job_detail[2]
        company_name = response.xpath('//div[@class="info-company"]/h3/a/text()').get()
        loc_job = response.xpath('//div[@class="location-address"]/text()').get()

        # The Chinese literals below are section headings matched against the
        # page content: '公司介绍' = company introduction, '职位描述' = job
        # description, '团队介绍' = team introduction, '工商信息' = business
        # registration information; '无' stands for "none".
        company_sec_title = response.xpath('//div[@class="job-sec company-info"]/h3/text()').get()
        if company_sec_title == '公司介绍':
            company_describe_detail = response.xpath('//div[@class="job-sec company-info"]/div[@class="text"]/text()').getall()
            company_describe_detail = '\n'.join(company_describe_detail).strip()
        else:
            company_describe_detail = '无'

        job_secs = response.xpath('//div[@class="job-sec"]')
        job_describe_detail, team_describe_detail, business_information = '无', '无', '无'
        for job_sec in job_secs:
            sec_describe = job_sec.xpath('./h3/text()').get()
            if sec_describe == '职位描述':
                job_describe_detail = job_sec.xpath('./div[@class="text"]/text()').getall()
                job_describe_detail = '\n'.join(job_describe_detail).strip()

            if sec_describe == '团队介绍':
                team_describe_detail = job_sec.xpath('./div[@class="text"]/text()').getall()
                team_describe_detail = '\n'.join(team_describe_detail).strip()

            if sec_describe == '工商信息':
                business_information_1 = job_sec.xpath('./div[@class="name"]/text()').getall()
                business_information_1 = ''.join(business_information_1).strip()
                business_information_2 = job_sec.xpath('./div[@class="level-list"]/li//text()').getall()
                business_information_2 = '\n'.join(business_information_2).strip()
                business_information = business_information_1 + '\n' + business_information_2

        item = BossScrapyItem(
            job_name=job_name,
            job_city=job_city,
            job_experience=job_experience,
            job_education=job_education,
            company_name=company_name,
            loc_job=loc_job,
            company_describe_detail=company_describe_detail,
            job_describe_detail=job_describe_detail,
            team_describe_detail=team_describe_detail,
            business_information=business_information
        )

        yield item
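
# A hypothetical launcher for this spider (equivalent to running
# `scrapy crawl boss_spider` from the project root):
#   from scrapy import cmdline
#   cmdline.execute('scrapy crawl boss_spider'.split())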
--------------------------------------------------------------------------------
/boss_scrapy/try_to_getProxy.py:
--------------------------------------------------------------------------------
from datetime import datetime, timedelta


class ProxyModel(object):
    def __init__(self, data):
        self.ip = data['ip']
        self.port = data['port']
        self.expire_str = data['expire_time']
        self.proxy = 'http://' + '%s:%s' % (self.ip, self.port)
        self.expire_time = self.detail_time
        # Whether this proxy has already been blacklisted by the target site.
        self.blacked = False

    # Convert the expiry time string (expire_time) into a datetime, so the
    # expiry can be used to decide when to swap in a new proxy.
    @property
    def detail_time(self):
        date_str, time_str = self.expire_str.split(' ')
        year, month, day = date_str.split('-')
        hour, minute, second = time_str.split(':')
        expire_time = datetime(
            year=int(year),
            month=int(month),
            day=int(day),
            hour=int(hour),
            minute=int(minute),
            second=int(second),
        )
        return expire_time

    # Compare the proxy's expiry time with the current time: if less than 10
    # seconds of lifetime remain, it is time to prepare a replacement proxy IP
    # ("seconds" as the unit of the threshold is an assumption; the source
    # only says "less than 10").
    @property
    def is_expiring(self):
        now = datetime.now()
        return (self.expire_time - now) < timedelta(seconds=10)
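
# A minimal smoke test with made-up sample data; the field names mirror what
# the middleware extracts from the proxy API response.
if __name__ == '__main__':
    sample = {'ip': '127.0.0.1', 'port': 8888,
              'expire_time': '2030-01-01 00:00:00'}
    model = ProxyModel(sample)
    print(model.proxy)        # http://127.0.0.1:8888
    print(model.is_expiring)  # False until 10 seconds before expiry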