#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape Lagou job postings and rank the skill keywords they mention.

The script asks for a search keyword, pages through the Lagou position
search endpoint (city fixed to Beijing by the query string in ``url``),
downloads every job-detail page it finds, collects the English words in
each job description, and writes the most frequent ones to an Excel
workbook together with an area chart.

Third-party requirements: ``beautifulsoup4`` with the ``lxml`` parser, and
``xlsxwriter``.  They are imported inside the functions that need them so
the pure helpers (``read_id``, ``search_skill``, ``count_skill`` ...) can be
imported and tested without the scraping stack installed.
"""
import datetime
import json
import os
import re
import socket
import sys
from collections import Counter
from contextlib import closing

try:  # Python 3
    from urllib.error import URLError
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen
except ImportError:  # Python 2
    from urllib import urlencode
    from urllib2 import Request, URLError, urlopen

try:
    read_input = raw_input  # noqa: F821 - Python 2 name
except NameError:
    read_input = input  # Python 3

if sys.version_info[0] == 2:
    # Legacy hack kept from the original script so implicit str/unicode
    # conversions of Chinese text keep working on Python 2.
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf-8')

# Search endpoint; the query string pins the city.
url = r'http://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC'
JOB_URL_TEMPLATE = r'http://www.lagou.com/jobs/%s.html'

# One shared header set for every request (the original duplicated it and
# lost the space between "Gecko)" and "Chrome/" in the User-Agent).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36 '
                  '115Browser/6.0.3',
    'Host': 'www.lagou.com',
    'Connection': 'keep-alive',
    'Origin': 'http://www.lagou.com',
}

REQUEST_TIMEOUT = 30      # seconds; without it a stalled socket hangs forever
MAX_PAGES = 30            # the original caps crawling at 30 result pages
TOP_KEYWORDS = 80         # how many keywords end up in the report
OUTPUT_DIR = r'E:\positions'
SHEET_NAME = 'Sheet1'

# Upper bound must be "Z": the original "A-z" range also matched [ \ ] ^ _ `
_SKILL_RE = re.compile(r'[a-zA-Z]+')


def _fetch(req, data=None, timeout=REQUEST_TIMEOUT):
    """Open *req*, always close the response, return the body as text."""
    with closing(urlopen(req, data=data, timeout=timeout)) as resp:
        return resp.read().decode('utf-8')


def get_page(url, pn, keyword, timeout=REQUEST_TIMEOUT):
    """Fetch one page of search results as a JSON string.

    Args:
        url: search endpoint to POST to.
        pn: 1-based page number; page 1 is flagged ``first=true``.
        keyword: search keyword typed by the user.
        timeout: socket timeout in seconds.

    Returns:
        The decoded (UTF-8) response body.

    Raises:
        URLError / socket.timeout on network failure.
    """
    form = urlencode([
        ('first', 'true' if pn == 1 else 'false'),
        ('pn', pn),
        ('kd', keyword),
    ])
    return _fetch(Request(url, headers=HEADERS),
                  data=form.encode('utf-8'), timeout=timeout)


def read_id(page):
    """Extract the ``positionId`` of every result on a search page.

    Iterates over however many results the page really holds (the original
    assumed exactly 15 and crashed on a short last page).  Entries without
    a ``positionId`` are skipped.

    Raises:
        KeyError / ValueError if *page* is not the expected JSON document.
    """
    results = json.loads(page)['content']['positionResult']['result']
    ids = (item.get('positionId') for item in results)
    return [position_id for position_id in ids if position_id is not None]


def read_max_page(page):
    """Return the number of result pages to crawl, capped at ``MAX_PAGES``.

    NOTE(review): the value is read from ``content.pageSize`` exactly as the
    original did; that key name suggests "items per page" rather than a page
    count - confirm against a live response before relying on it.
    """
    return min(json.loads(page)['content']['pageSize'], MAX_PAGES)


def get_content(company_id, timeout=REQUEST_TIMEOUT):
    """Download the HTML of the job-detail page for one position id."""
    return _fetch(Request(JOB_URL_TEMPLATE % company_id, headers=HEADERS),
                  timeout=timeout)


def get_result(content):
    """Return the plain-text job description found in a job-detail page.

    The text is taken with ``get_text(' ')`` so that words separated only by
    tags (``Python<br/>Java``) stay separate and HTML entities are decoded
    instead of leaking names such as ``amp`` into the keyword list.

    Returns:
        The description text, or ``''`` when the page has no
        ``dd.job_bt`` element (removed posting, anti-crawler page, ...).
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(content, 'lxml')
    nodes = soup.select('dd[class="job_bt"]')
    if not nodes:
        return ''
    return nodes[0].get_text(' ')


def search_skill(result):
    """Return every run of ASCII letters in *result*, in order of appearance.

    Only English words are treated as skill keywords.
    """
    return _SKILL_RE.findall(result)


def count_skill(skill_list, top_n=TOP_KEYWORDS):
    """Count keywords case-insensitively and return the *top_n* most common.

    The input list is left untouched.

    Returns:
        A list of ``(keyword, count)`` tuples, most frequent first.
    """
    return Counter(word.lower() for word in skill_list).most_common(top_n)


def save_excel(count_dict, file_name, out_dir=OUTPUT_DIR):
    """Write the keyword ranking to a workbook and add an area chart.

    Args:
        count_dict: sequence of ``(keyword, count)`` pairs.
        file_name: workbook name without extension.
        out_dir: target directory; created when missing.

    Returns:
        The path of the written workbook.
    """
    import xlsxwriter

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    # NOTE(review): xlsxwriter always produces the XLSX format; the ".xls"
    # suffix is kept for backward compatibility but makes Excel warn about
    # a format/extension mismatch - consider switching to ".xlsx".
    path = os.path.join(out_dir, '%s.xls' % file_name)

    rows = [list(pair) for pair in count_dict]
    book = xlsxwriter.Workbook(path)
    try:
        sheet = book.add_worksheet(SHEET_NAME)
        sheet.write_row(0, 0, ['关键词', '频次'])
        # Every entry is written (the original dropped the last two).
        for row_idx, row in enumerate(rows, 1):
            sheet.write_row(row_idx, 0, row)

        if rows:
            last_row = len(rows)  # zero-based index of the last data row
            chart = book.add_chart({'type': 'area'})
            chart.add_series({
                'name': [SHEET_NAME, 0, 1],
                'categories': [SHEET_NAME, 1, 0, last_row, 0],
                'values': [SHEET_NAME, 1, 1, last_row, 1],
            })
            chart.set_title({'name': '关键词排名'})
            chart.set_x_axis({'name': '关键词'})
            chart.set_y_axis({'name': '频次(/次)'})
            sheet.insert_chart('C2', chart, {'x_offset': 15, 'y_offset': 10})
    finally:
        book.close()
    return path


def main():
    """Interactive entry point: crawl, count keywords, save the report."""
    keyword = read_input('请输入您所需要查找的关键词:')
    starttime = datetime.datetime.now()

    first_page = get_page(url, 1, keyword)
    max_pn = read_max_page(first_page)  # number of result pages
    fin_skill_list = []                 # every keyword from every posting
    # Inclusive upper bound: the original range() skipped the last page.
    for pn in range(1, max_pn + 1):
        print('***********************正在抓取第%s页信息***********************' % pn)
        # Page 1 was already downloaded to learn the page count.
        page = first_page if pn == 1 else get_page(url, pn, keyword)
        for company_id in read_id(page):
            try:
                content = get_content(company_id)
            except (URLError, socket.timeout) as err:
                # One unreachable posting must not abort the whole crawl.
                print('职位 %s 抓取失败,已跳过:%s' % (company_id, err))
                continue
            fin_skill_list.extend(search_skill(get_result(content)))

    print('***********************开始统计关键词出现频率***********************')
    count_dict = count_skill(fin_skill_list)
    print(count_dict)
    file_name = read_input(r'请输入要保存的文件名:')
    print('***********************正在保存到%s***********************' % OUTPUT_DIR)
    save_excel(count_dict, file_name)
    elapsed = datetime.datetime.now() - starttime
    print('总共用时:%s s' % int(elapsed.total_seconds()))


if __name__ == '__main__':
    main()
![Paste_Image.png](http://upload-images.jianshu.io/upload_images/2815894-df6eb3d1ae9bca28.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 14 | 15 | 知道了数据的源头,接下来就按照常规步骤包装Headers,提交FormData来获取反馈数据。 16 | ## 获取PositionId列表所在页面: 17 | 18 | ```python 19 | #获取职位的查询页面,(参数分别为网址,当前页面数,关键词) 20 | def get_page(url, pn, keyword): 21 | headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3', 23 | 'Host': 'www.lagou.com', 24 | 'Connection': 'keep-alive', 25 | 'Origin': 'http://www.lagou.com' 26 | } 27 | if pn == 1: 28 | boo = 'true' 29 | else: 30 | boo = 'false' 31 | page_data = urllib.urlencode([ 32 | ('first', boo), 33 | ('pn', pn), 34 | ('kd', keyword) 35 | ]) 36 | req = urllib2.Request(url, headers=headers) 37 | 38 | page = urllib2.urlopen(req, data=page_data.encode('utf-8')).read() 39 | page = page.decode('utf-8') 40 | return page 41 | ``` 42 | ## 通过Json获取PositionId: 43 | ```python 44 | 45 | #获取所需的岗位ID,每一个招聘页面详情都有一个所属的ID索引 46 | def read_id(page): 47 | tag = 'positionId' 48 | page_json = json.loads(page) 49 | page_json = page_json['content']['positionResult']['result'] 50 | company_list = [] 51 | for i in range(15): 52 | company_list.append(page_json[i].get(tag)) 53 | return company_list 54 | ``` 55 | ## 合成目标url: 56 | ```python 57 | #获取职位页面,由positionId和BaseUrl组合成目标地址 58 | def get_content(company_id): 59 | fin_url = r'http://www.lagou.com/jobs/%s.html' % company_id 60 | headers = { 61 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)' 62 | 'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3', 63 | 'Host': 'www.lagou.com', 64 | 'Connection': 'keep-alive', 65 | 'Origin': 'http://www.lagou.com' 66 | } 67 | req = urllib2.Request(fin_url, headers=headers) 68 | #page = urllib.urlopen(req).read() 69 | page = urllib2.urlopen(req).read() 70 | content = page.decode('utf-8') 71 | return content 72 | ``` 73 | # 二、对数据进行处理 74 | 
获取数据之后,需要对数据进行清洗,通过BeautifulSoup抓取的职位内容包含Html标签,需要让数据脱去这层“外衣”。 75 | 76 | ```python 77 | #获取职位需求(通过re来去除html标记),可以将职位详情单独存储 78 | def get_result(content): 79 | soup = Bs(content, 'lxml') 80 | job_description = soup.select('dd[class="job_bt"]') 81 | job_description = str(job_description[0]) 82 | rule = re.compile(r'<[^>]+>') 83 | result = rule.sub('', job_description) 84 | return result 85 | ``` 86 | 现在得到的数据就是职位描述信息,我们要从职位信息当中筛选我们所关注的任职要求关键词。 87 | 88 | ![Paste_Image.png](http://upload-images.jianshu.io/upload_images/2815894-3f07bb128d91e883.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 89 | 90 | 我们将这些关键词筛选出来,存储到List当中。经过对整个500+职位进行爬取,我们得到了职位技能关键词的总表。 91 | ```python 92 | #过滤关键词:目前筛选的方式只是选取英文关键词 93 | def search_skill(result): 94 | rule = re.compile(r'[a-zA-z]+') 95 | skil_list = rule.findall(result) 96 | return skil_list 97 | ``` 98 | 对关键词按照500+职位需求出现的频次进行排序,选取频次排序Top80的关键词,去除无效的关键词。 99 | 100 | ```python 101 | # 对出现的关键词计数,并排序,选取Top80的关键词作为数据的样本 102 | def count_skill(skill_list): 103 | for i in range(len(skill_list)): 104 | skill_list[i] = skill_list[i].lower() 105 | count_dict = Counter(skill_list).most_common(80) 106 | return count_dict 107 | ``` 108 | 109 | # 三、对数据进行存储和可视化处理 110 | 111 | ```python 112 | # 对结果进行存储并生成Area图 113 | def save_excel(count_dict, file_name): 114 | book = xlsxwriter.Workbook(r'E:\positions\%s.xls' % file_name) 115 | tmp = book.add_worksheet() 116 | row_num = len(count_dict) 117 | for i in range(1, row_num): 118 | if i == 1: 119 | tag_pos = 'A%s' % i 120 | tmp.write_row(tag_pos, ['关键词', '频次']) 121 | else: 122 | con_pos = 'A%s' % i 123 | k_v = list(count_dict[i-2]) 124 | tmp.write_row(con_pos, k_v) 125 | chart1 = book.add_chart({'type':'area'}) 126 | chart1.add_series({ 127 | 'name' : '=Sheet1!$B$1', 128 | 'categories' : '=Sheet1!$A$2:$A$80', 129 | 'values' : '=Sheet1!$B$2:$B$80' 130 | }) 131 | chart1.set_title({'name':'关键词排名'}) 132 | chart1.set_x_axis({'name': '关键词'}) 133 | chart1.set_y_axis({'name': '频次(/次)'}) 134 | tmp.insert_chart('C2', chart1, 
{'x_offset':15, 'y_offset':10}) 135 | book.close() 136 | ``` 137 | 138 | ![数据可视化展示](http://upload-images.jianshu.io/upload_images/2815894-b47a78419a9c2d6c.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 139 | 140 | -------------------------------------------------------------------------------- /images/2016-10-17_22-04-30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fuyunzhishang/python-spider/79c93b06dd2f2c278b8185a515650e7a0b8668f1/images/2016-10-17_22-04-30.png --------------------------------------------------------------------------------