├── .idea
│   ├── boss_scrapy.iml
│   └── modules.xml
├── README.md
├── boss_scrapy
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── boss_spider.py
│   └── try_to_getProxy.py
├── boss_start.py
└── scrapy.cfg

--------------------------------------------------------------------------------
/.idea/boss_scrapy.iml:
--------------------------------------------------------------------------------
(IntelliJ module file; XML content not preserved in this export)

--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(IntelliJ project file; XML content not preserved in this export)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# boss_scrapy
Crawl job postings from BOSS直聘 (zhipin.com) through rotating proxy IPs.
Points to note:
1. In settings.py, replace the Proxy-Authorization and cookie values in DEFAULT_REQUEST_HEADERS with your own.
2. Create a MySQL database named boss first, then create a table named company_detail inside it — a setup sketch follows below.
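The repo does not ship a schema for company_detail, but the pipeline (boss_scrapy/pipelines.py) inserts into ten Chinese-named columns, so one way to set it up is sketched below. The column types and the root/123456 credentials (mirroring the hard-coded values in pipelines.py) are assumptions — adjust them to your environment.

```python
# One-off setup sketch: create the `boss` database and the `company_detail` table
# that BossScrapyPipeline inserts into. Column names come from pipelines.py;
# the column types are assumptions.
import pymysql

client = pymysql.connect(host='localhost', port=3306, user='root',
                         password='123456', charset='utf8')
cur = client.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS boss DEFAULT CHARACTER SET utf8')
cur.execute('USE boss')
cur.execute("""
    CREATE TABLE IF NOT EXISTS company_detail (
        工作名称 VARCHAR(255), 工作城市 VARCHAR(64), 工作经验 VARCHAR(64),
        学历要求 VARCHAR(64), 公司名字 VARCHAR(255), 公司所在地 VARCHAR(255),
        公司介绍 TEXT, 职位描述 TEXT, 团队介绍 TEXT, 工商信息 TEXT
    ) DEFAULT CHARACTER SET utf8
""")
client.commit()
client.close()
```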
--------------------------------------------------------------------------------
/boss_scrapy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weakmaple/boss_scrapy/7877d00562171a2cd422a981cdaaeb916395df9d/boss_scrapy/__init__.py

--------------------------------------------------------------------------------
/boss_scrapy/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class BossScrapyItem(scrapy.Item):
    # job title, job city, work experience, education requirement, company name, company address
    job_name = scrapy.Field()
    job_city = scrapy.Field()
    job_experience = scrapy.Field()
    job_education = scrapy.Field()
    company_name = scrapy.Field()
    loc_job = scrapy.Field()
    # company introduction, job description, team introduction, business registration info
    company_describe_detail = scrapy.Field()
    job_describe_detail = scrapy.Field()
    team_describe_detail = scrapy.Field()
    business_information = scrapy.Field()

--------------------------------------------------------------------------------
/boss_scrapy/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import base64
from twisted.internet.defer import DeferredLock
import requests
import random
import json
from boss_scrapy.settings import DEFAULT_REQUEST_HEADERS, USER_AGENT_LIST
from boss_scrapy.try_to_getProxy import ProxyModel

class RandomProxy(object):

    def __init__(self):
        self.current_proxy = None
        self.lock = DeferredLock()

    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = user_agent

        if 'proxy' not in request.meta or self.current_proxy.is_expiring:
            # request a proxy
            self.update_proxy()
            request.meta['proxy'] = self.current_proxy.proxy

    def process_response(self, request, response, spider):
        # If the site redirects (302) to a captcha page, switch to a new proxy IP.
        # The 'captcha' in response.url check covers the case where the captcha page
        # itself returns a 200 status code, so the URL is used as the tell-tale sign.
        if response.status != 200 or 'captcha' in response.url:
            # Reaching this branch means BOSS直聘 has flagged the request as a crawler,
            # so effectively nothing was fetched. Return the request so it goes back
            # into the scheduler and is sent again later.
            if not self.current_proxy.blacked:
                self.current_proxy.blacked = True
                self.update_proxy()
                print('%s proxy has failed' % self.current_proxy.proxy)
            request.meta['proxy'] = self.current_proxy.proxy
            return request

        # On a normal response, remember to return it at the end; otherwise the
        # response never reaches the spider and never gets parsed.
        return response

    def update_proxy(self):
        # The lock is a multithreading concept; Scrapy is asynchronous, which behaves
        # much like multithreading here. The moment the current IP gets banned, several
        # in-flight requests may all arrive here wanting a fresh IP, which would waste
        # paid proxy requests. The lock ensures that only one caller at a time runs the
        # code below; the proxy it fetches is then shared by everyone else, so the other
        # callers can carry on and fewer proxy IPs (i.e. less money) are wasted.
        self.lock.acquire()
        # Conditions for switching the proxy:
        # 1. no proxy is in use yet
        # 2. the current proxy is about to expire
        # 3. the current proxy has been banned by the site
        # Any one of these is enough to fetch a new proxy IP.
        if not self.current_proxy or self.current_proxy.is_expiring or self.current_proxy.blacked:
            url = r'https://h.wandouip.com/get/ip-list?pack=%s&num=1&xy=1&type=2&lb=\r\n&mr=1&' % random.randint(100, 1000)
            response = requests.get(url=url, headers=DEFAULT_REQUEST_HEADERS)
            text = json.loads(response.text)
            print(text)
            data = text['data'][0]
            proxy_model = ProxyModel(data)
            print('Fetched a new proxy: %s' % proxy_model.proxy)
            self.current_proxy = proxy_model
            # return proxy_model
        self.lock.release()

--------------------------------------------------------------------------------
/boss_scrapy/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql

class BossScrapyPipeline(object):

    def __init__(self):
        self.client = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='123456',
            database='boss',
            charset='utf8'
        )
        self.cur = self.client.cursor()

    def process_item(self, item, spider):
        # job title, job city, work experience, education requirement, company name, company address
        job_name = item['job_name']
        job_city = item['job_city']
        job_experience = item['job_experience']
        job_education = item['job_education']
        company_name = item['company_name']
        loc_job = item['loc_job']
        # company introduction, job description, team introduction, business registration info
        company_describe_detail = item['company_describe_detail']
        job_describe_detail = item['job_describe_detail']
        team_describe_detail = item['team_describe_detail']
        business_information = item['business_information']

        lis = [
            job_name, job_city, job_experience, job_education, company_name, loc_job,
            company_describe_detail, job_describe_detail, team_describe_detail,
            business_information
        ]

        sql = 'insert into company_detail(工作名称,工作城市,工作经验,学历要求,' \
              '公司名字,公司所在地,公司介绍,职位描述,团队介绍,工商信息) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        self.cur.execute(sql, lis)
        self.client.commit()

        return item
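    # Optional addition, not in the original pipeline: close the MySQL connection
    # when the crawl finishes. Scrapy calls close_spider() on item pipelines
    # automatically when the spider is closed.
    def close_spider(self, spider):
        self.cur.close()
        self.client.close()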
--------------------------------------------------------------------------------
/boss_scrapy/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for boss_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'boss_scrapy'

SPIDER_MODULES = ['boss_scrapy.spiders']
NEWSPIDER_MODULE = 'boss_scrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'boss_scrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # Proxy-Authorization: the Base64 string generated from your own proxy account and password
    'Proxy-Authorization': 'Basic xxxxxxxxxxxxxxxxx',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # cookie: the cookie from the proxy provider's page where the proxy is obtained
    'cookie': 'advanced-frontend=xxxxxxxxxxxxxxxxx',
    'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
}
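# How the Proxy-Authorization value above is typically produced: Base64-encode the
# proxy account's "username:password" pair (the credentials here are placeholders;
# confirm the exact scheme with the proxy provider):
#   import base64
#   'Basic ' + base64.b64encode(b'username:password').decode('ascii')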
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'boss_scrapy.middlewares.randomIPMiddleware': 100,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'boss_scrapy.middlewares.RandomProxy': 100,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'boss_scrapy.pipelines.BossScrapyPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

USER_AGENT_LIST = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]

--------------------------------------------------------------------------------
/boss_scrapy/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/boss_scrapy/spiders/boss_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from boss_scrapy.items import BossScrapyItem


class BossSpiderSpider(CrawlSpider):
    name = 'boss_spider'
    # allowed_domains = ['zhipin.com']
    # start_urls = ['http://httpbin.org/']
    start_urls = ['https://www.zhipin.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=1&ka=page-1']

    rules = (
        # Follow pagination links (page=N&ka=page-N)
        Rule(LinkExtractor(allow=r'.+page=\d+&ka=page-\d+'), follow=True),
        # Parse individual job-detail pages
        Rule(LinkExtractor(allow=r'.+/job_detail/.+\.html'), callback='parse_item', follow=False),
        # Rule(LinkExtractor(allow=r'http://httpbin.org/ip'), callback='parse_item'),
    )
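    # A quick, standalone sanity check of the pagination pattern above (the sample
    # URL is made up for illustration; run it outside the spider):
    #   import re
    #   re.search(r'.+page=\d+&ka=page-\d+',
    #             'https://www.zhipin.com/c101010100/?query=python&page=2&ka=page-2')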
    def parse_item(self, response):
        # print("="*40)
        # print(response.body)
        # print("=" * 40)
        job_name = response.xpath('//div[@class="name"]/h1/text()').get()
        job_detail = response.xpath('//div[@class="info-primary"]/p//text()').getall()
        job_city = job_detail[0]
        job_experience = job_detail[1]
        job_education = job_detail[2]
        company_name = response.xpath('//div[@class="info-company"]/h3/a/text()').get()
        loc_job = response.xpath('//div[@class="location-address"]/text()').get()

        # Company introduction block ('公司介绍'); '无' means "none"
        job_secs = response.xpath('//div[@class="job-sec company-info"]/h3/text()').get()
        if job_secs == '公司介绍':
            company_describe_detail = response.xpath('//div[@class="job-sec company-info"]/div[@class="text"]/text()').getall()
            company_describe_detail = '\n'.join(company_describe_detail).strip()
        else:
            company_describe_detail = '无'

        # Job description ('职位描述'), team introduction ('团队介绍') and
        # business registration info ('工商信息') sections
        job_secs = response.xpath('//div[@class="job-sec"]')
        job_describe_detail, team_describe_detail, business_information = '无', '无', '无'
        for job_sec in job_secs:
            sec_describe = job_sec.xpath('./h3/text()').get()
            if sec_describe == '职位描述':
                job_describe_detail = job_sec.xpath('./div[@class="text"]/text()').getall()
                job_describe_detail = '\n'.join(job_describe_detail).strip()

            if sec_describe == '团队介绍':
                team_describe_detail = job_sec.xpath('./div[@class="text"]/text()').getall()
                team_describe_detail = '\n'.join(team_describe_detail).strip()

            if sec_describe == '工商信息':
                business_information_1 = job_sec.xpath('./div[@class="name"]/text()').getall()
                business_information_1 = ''.join(business_information_1).strip()
                business_information_2 = job_sec.xpath('./div[@class="level-list"]/li//text()').getall()
                business_information_2 = '\n'.join(business_information_2).strip()
                business_information = business_information_1 + '\n' + business_information_2

        item = BossScrapyItem(
            job_name=job_name,
            job_city=job_city,
            job_experience=job_experience,
            job_education=job_education,
            company_name=company_name,
            loc_job=loc_job,
            company_describe_detail=company_describe_detail,
            job_describe_detail=job_describe_detail,
            team_describe_detail=team_describe_detail,
            business_information=business_information
        )

        yield item

--------------------------------------------------------------------------------
/boss_scrapy/try_to_getProxy.py:
--------------------------------------------------------------------------------
from datetime import datetime, timedelta

class ProxyModel(object):
    def __init__(self, data):
        self.ip = data['ip']
        self.port = data['port']
        self.expire_str = data['expire_time']
        self.proxy = 'http://' + '%s:%s' % (self.ip, self.port)
        self.expire_time = self.detail_time
        # whether this proxy has already been blacklisted
        self.blacked = False

    # Converts the str-format expiry time (expire_time) into a datetime object,
    # so the expiry time can be used to decide when to switch to a new proxy.
    @property
    def detail_time(self):
        date_str, time_str = self.expire_str.split(" ")
        year, month, day = date_str.split('-')
        hour, minute, second = time_str.split(':')
        expire_time = datetime(
            year=int(year),
            month=int(month),
            day=int(day),
            hour=int(hour),
            minute=int(minute),
            second=int(second),
        )
        return expire_time

    # Compares the proxy's expiry time with the current time.
    # If the proxy has less than 10 seconds of life left, it is time to get a new proxy IP.
    @property
    def is_expiring(self):
        now = datetime.now()
        if (self.expire_time - now) < timedelta(seconds=10):
            return True
        else:
            return False
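A small standalone check of ProxyModel, using a made-up payload in the shape the middleware passes in (ip / port / expire_time keys); the real wandouip response may carry additional fields.

```python
from datetime import datetime, timedelta

from boss_scrapy.try_to_getProxy import ProxyModel

# Hypothetical proxy entry that expires five minutes from now
expire = (datetime.now() + timedelta(minutes=5)).strftime('%Y-%m-%d %H:%M:%S')
data = {'ip': '127.0.0.1', 'port': 8888, 'expire_time': expire}

proxy = ProxyModel(data)
print(proxy.proxy)        # http://127.0.0.1:8888
print(proxy.is_expiring)  # False while more than 10 seconds of lifetime remain
```

The hand-rolled parsing in detail_time could equally be written as datetime.strptime(self.expire_str, '%Y-%m-%d %H:%M:%S'), which produces the same datetime for this timestamp format.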