├── .idea
│   ├── boss_scrapy.iml
│   └── modules.xml
├── README.md
├── boss_scrapy
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── boss_spider.py
│   └── try_to_getProxy.py
├── boss_start.py
└── scrapy.cfg
/.idea/boss_scrapy.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# boss_scrapy
Scrapes job listings from BOSS直聘 (zhipin.com) through proxy IPs.

Points to note:
1. In settings.py, fill in your own Proxy-Authorization and cookie values in DEFAULT_REQUEST_HEADERS.
2. Create a MySQL database named boss first, then create a table named company_detail inside it.
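A minimal sketch of that setup as a one-off script. The column names are taken from pipelines.py; the column types and this helper itself are assumptions, not part of the project:

```python
# Hypothetical setup script for the boss database and company_detail table.
import pymysql

client = pymysql.connect(host='localhost', port=3306, user='root',
                         password='123456', charset='utf8')
cur = client.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS boss DEFAULT CHARACTER SET utf8')
cur.execute('''
    CREATE TABLE IF NOT EXISTS boss.company_detail (
        `工作名称` VARCHAR(255), `工作城市` VARCHAR(255), `工作经验` VARCHAR(255),
        `学历要求` VARCHAR(255), `公司名字` VARCHAR(255), `公司所在地` VARCHAR(255),
        `公司介绍` TEXT, `职位描述` TEXT, `团队介绍` TEXT, `工商信息` TEXT
    ) DEFAULT CHARACTER SET utf8
''')
client.close()
```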
--------------------------------------------------------------------------------
/boss_scrapy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weakmaple/boss_scrapy/7877d00562171a2cd422a981cdaaeb916395df9d/boss_scrapy/__init__.py
--------------------------------------------------------------------------------
/boss_scrapy/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BossScrapyItem(scrapy.Item):
    # job title, city, experience required, education required,
    # company name, company location
    job_name = scrapy.Field()
    job_city = scrapy.Field()
    job_experience = scrapy.Field()
    job_education = scrapy.Field()
    company_name = scrapy.Field()
    loc_job = scrapy.Field()
    # company introduction, job description, team introduction,
    # business registration information
    company_describe_detail = scrapy.Field()
    job_describe_detail = scrapy.Field()
    team_describe_detail = scrapy.Field()
    business_information = scrapy.Field()
--------------------------------------------------------------------------------
/boss_scrapy/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random
import json

import requests
from twisted.internet.defer import DeferredLock

from boss_scrapy.settings import DEFAULT_REQUEST_HEADERS, USER_AGENT_LIST
from boss_scrapy.try_to_getProxy import ProxyModel


class RandomProxy(object):

    def __init__(self):
        self.current_proxy = None
        self.lock = DeferredLock()

    def process_request(self, request, spider):
        user_agent = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = user_agent

        if 'proxy' not in request.meta or self.current_proxy is None or self.current_proxy.is_expiring:
            # Request a (new) proxy and attach it to this request.
            self.update_proxy()
            request.meta['proxy'] = self.current_proxy.proxy

    def process_response(self, request, response, spider):
        # If the site redirects (302) to a captcha page, swap out the proxy.
        # The captcha page sometimes comes back with status 200, so
        # 'captcha' in response.url is used as an extra tell.
        if response.status != 200 or 'captcha' in response.url:
            # Reaching this branch means BOSS直聘 has identified the request
            # as coming from a crawler, so nothing useful was fetched.
            # Return the request so it re-enters the scheduler and is sent
            # again later.
            if not self.current_proxy.blacked:
                print('proxy %s no longer works' % self.current_proxy.proxy)
                self.current_proxy.blacked = True
                self.update_proxy()
            request.meta['proxy'] = self.current_proxy.proxy
            # Mark the retry so the duplicate filter does not drop a request
            # whose fingerprint has already been seen.
            request.dont_filter = True
            return request

        # On the happy path, remember to return the response; otherwise it
        # never reaches the spider and never gets parsed.
        return response

    def update_proxy(self):
        # Scrapy downloads asynchronously (think of it as multithreaded), so
        # when the current proxy gets banned, several in-flight requests may
        # land here at once. Without a lock each of them would fetch (and pay
        # for) its own fresh proxy IP. With the lock, only one caller fetches
        # a new proxy; since that proxy is shared, the others simply reuse it,
        # which avoids wasting proxy IPs (money).
        self.lock.acquire()
        # Swap the proxy when any of the following holds:
        # 1. no proxy is in use yet
        # 2. the current proxy is about to expire
        # 3. the current proxy has been banned by the site
        if not self.current_proxy or self.current_proxy.is_expiring or self.current_proxy.blacked:
            url = r'https://h.wandouip.com/get/ip-list?pack=%s&num=1&xy=1&type=2&lb=\r\n&mr=1&' % random.randint(100, 1000)
            response = requests.get(url=url, headers=DEFAULT_REQUEST_HEADERS)
            text = json.loads(response.text)
            print(text)
            data = text['data'][0]
            proxy_model = ProxyModel(data)
            print('fetched a new proxy: %s' % proxy_model.proxy)
            self.current_proxy = proxy_model
        self.lock.release()
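
# For reference, the wandouip API response consumed above is assumed to look
# roughly like this (only the ip / port / expire_time fields are used, via
# text['data'][0] and ProxyModel):
#   {"code": 200, "data": [{"ip": "1.2.3.4", "port": 8888,
#                           "expire_time": "2019-01-01 12:00:00"}]}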
--------------------------------------------------------------------------------
/boss_scrapy/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class BossScrapyPipeline(object):

    def __init__(self):
        self.client = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='123456',
            database='boss',
            charset='utf8'
        )
        self.cur = self.client.cursor()

    def process_item(self, item, spider):
        # job title, city, experience, education, company name, job location
        job_name = item['job_name']
        job_city = item['job_city']
        job_experience = item['job_experience']
        job_education = item['job_education']
        company_name = item['company_name']
        loc_job = item['loc_job']
        # company introduction, job description, team introduction,
        # business registration information
        company_describe_detail = item['company_describe_detail']
        job_describe_detail = item['job_describe_detail']
        team_describe_detail = item['team_describe_detail']
        business_information = item['business_information']

        lis = [
            job_name, job_city, job_experience, job_education, company_name, loc_job,
            company_describe_detail, job_describe_detail, team_describe_detail,
            business_information
        ]

        # The Chinese column names must match the company_detail table
        # described in the README.
        sql = 'insert into company_detail(工作名称,工作城市,工作经验,学历要求,' \
              '公司名字,公司所在地,公司介绍,职位描述,团队介绍,工商信息) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        self.cur.execute(sql, lis)
        self.client.commit()

        return item
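
    # Hedged addition: Scrapy's standard close_spider hook, used here to
    # release the cursor and MySQL connection when the crawl finishes.
    def close_spider(self, spider):
        self.cur.close()
        self.client.close()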
--------------------------------------------------------------------------------
/boss_scrapy/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for boss_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'boss_scrapy'

SPIDER_MODULES = ['boss_scrapy.spiders']
NEWSPIDER_MODULE = 'boss_scrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'boss_scrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # Proxy-Authorization must be the base64 string generated from your own
    # proxy account's username and password.
    'Proxy-Authorization': 'Basic xxxxxxxxxxxxxxxxx',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # cookie must be the cookie taken from the proxy provider's page.
    'cookie': 'advanced-frontend=xxxxxxxxxxxxxxxxx',
    'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
}
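# One hypothetical way to build the Proxy-Authorization value above, assuming
# the proxy service uses plain HTTP Basic auth:
#   import base64
#   'Basic ' + base64.b64encode(b'username:password').decode()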

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'boss_scrapy.middlewares.randomIPMiddleware': 100,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'boss_scrapy.middlewares.RandomProxy': 100,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'boss_scrapy.pipelines.BossScrapyPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

USER_AGENT_LIST = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]
--------------------------------------------------------------------------------
/boss_scrapy/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/boss_scrapy/spiders/boss_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from boss_scrapy.items import BossScrapyItem


class BossSpiderSpider(CrawlSpider):
    name = 'boss_spider'
    # allowed_domains = ['zhipin.com']
    # start_urls = ['http://httpbin.org/']
    start_urls = ['https://www.zhipin.com/c101010100/?query=python%E7%88%AC%E8%99%AB&page=1&ka=page-1']

    rules = (
        # Follow the paginated listing pages. \d+ matches the page number;
        # the original pattern used %d, which is not valid regex syntax and
        # would never match.
        Rule(LinkExtractor(allow=r'.+page=\d+&ka=page-\d+'), follow=True),
        # Job detail pages are the ones actually parsed.
        Rule(LinkExtractor(allow=r'.+/job_detail/.+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        job_name = response.xpath('//div[@class="name"]/h1/text()').get()
        # The info line lists city, experience and education, in that order.
        job_detail = response.xpath('//div[@class="info-primary"]/p//text()').getall()
        job_city = job_detail[0]
        job_experience = job_detail[1]
        job_education = job_detail[2]
        company_name = response.xpath('//div[@class="info-company"]/h3/a/text()').get()
        loc_job = response.xpath('//div[@class="location-address"]/text()').get()

        # The Chinese literals below are section headings matched against the
        # page content: '公司介绍' = company introduction, '职位描述' = job
        # description, '团队介绍' = team introduction, '工商信息' = business
        # registration information; '无' stands for "none".
        company_sec_title = response.xpath('//div[@class="job-sec company-info"]/h3/text()').get()
        if company_sec_title == '公司介绍':
            company_describe_detail = response.xpath('//div[@class="job-sec company-info"]/div[@class="text"]/text()').getall()
            company_describe_detail = '\n'.join(company_describe_detail).strip()
        else:
            company_describe_detail = '无'

        job_secs = response.xpath('//div[@class="job-sec"]')
        job_describe_detail, team_describe_detail, business_information = '无', '无', '无'
        for job_sec in job_secs:
            sec_describe = job_sec.xpath('./h3/text()').get()
            if sec_describe == '职位描述':
                job_describe_detail = job_sec.xpath('./div[@class="text"]/text()').getall()
                job_describe_detail = '\n'.join(job_describe_detail).strip()

            if sec_describe == '团队介绍':
                team_describe_detail = job_sec.xpath('./div[@class="text"]/text()').getall()
                team_describe_detail = '\n'.join(team_describe_detail).strip()

            if sec_describe == '工商信息':
                business_information_1 = job_sec.xpath('./div[@class="name"]/text()').getall()
                business_information_1 = ''.join(business_information_1).strip()
                business_information_2 = job_sec.xpath('./div[@class="level-list"]/li//text()').getall()
                business_information_2 = '\n'.join(business_information_2).strip()
                business_information = business_information_1 + '\n' + business_information_2

        item = BossScrapyItem(
            job_name=job_name,
            job_city=job_city,
            job_experience=job_experience,
            job_education=job_education,
            company_name=company_name,
            loc_job=loc_job,
            company_describe_detail=company_describe_detail,
            job_describe_detail=job_describe_detail,
            team_describe_detail=team_describe_detail,
            business_information=business_information
        )

        yield item
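
# A hypothetical launcher for this spider (equivalent to running
# `scrapy crawl boss_spider` from the project root):
#   from scrapy import cmdline
#   cmdline.execute('scrapy crawl boss_spider'.split())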
--------------------------------------------------------------------------------
/boss_scrapy/try_to_getProxy.py:
--------------------------------------------------------------------------------
from datetime import datetime, timedelta


class ProxyModel(object):
    def __init__(self, data):
        self.ip = data['ip']
        self.port = data['port']
        self.expire_str = data['expire_time']
        self.proxy = 'http://' + '%s:%s' % (self.ip, self.port)
        self.expire_time = self.detail_time
        # Whether this proxy has already been blacklisted by the target site.
        self.blacked = False

    # Convert the expiry time string (expire_time) into a datetime, so the
    # expiry can be used to decide when to swap in a new proxy.
    @property
    def detail_time(self):
        date_str, time_str = self.expire_str.split(' ')
        year, month, day = date_str.split('-')
        hour, minute, second = time_str.split(':')
        expire_time = datetime(
            year=int(year),
            month=int(month),
            day=int(day),
            hour=int(hour),
            minute=int(minute),
            second=int(second),
        )
        return expire_time

    # Compare the proxy's expiry time with the current time: if less than 10
    # seconds of lifetime remain, it is time to prepare a replacement proxy IP
    # ("seconds" as the unit of the threshold is an assumption; the source
    # only says "less than 10").
    @property
    def is_expiring(self):
        now = datetime.now()
        return (self.expire_time - now) < timedelta(seconds=10)
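
# A minimal smoke test with made-up sample data; the field names mirror what
# the middleware extracts from the proxy API response.
if __name__ == '__main__':
    sample = {'ip': '127.0.0.1', 'port': 8888,
              'expire_time': '2030-01-01 00:00:00'}
    model = ProxyModel(sample)
    print(model.proxy)        # http://127.0.0.1:8888
    print(model.is_expiring)  # False until 10 seconds before expiry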