├── Posistion-Spider.py
├── README.md
└── images
    └── 2016-10-17_22-04-30.png


/Posistion-Spider.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #_*_ coding: utf-8 _*_
  3 | import sys
  4 | reload(sys)
  5 | sys.setdefaultencoding('utf-8')
  6 | 
  7 | from requests import request
  8 | from urlparse import  urlparse
  9 | import urllib
 10 | import urllib2
 11 | from bs4 import BeautifulSoup as Bs
 12 | from collections import Counter
 13 | import lxml
 14 | import json
 15 | import datetime
 16 | import xlsxwriter
 17 | import re
 18 | 
# Wall-clock start of the run; the script entry point subtracts it from the
# end time to report the total duration.
starttime = datetime.datetime.now()

# Position search endpoint. The ``city`` query value is the percent-encoded
# UTF-8 form of "北京" (Beijing).
url = r'http://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC'
# Prompt for the search keyword. This runs as soon as the module is loaded.
keyword = raw_input('请输入您所需要查找的关键词：')
 23 | #获取职位的查询页面，（参数分别为网址，当前页面数，关键词）
 24 | def get_page(url, pn, keyword):
 25 |     headers = {
 26 |         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
 27 |         'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
 28 |         'Host': 'www.lagou.com',
 29 |         'Connection': 'keep-alive',
 30 |         'Origin': 'http://www.lagou.com'
 31 |     }
 32 |     if pn == 1:
 33 |         boo = 'true'
 34 |     else:
 35 |         boo = 'false'
 36 |     page_data = urllib.urlencode([
 37 |         ('first', boo),
 38 |         ('pn', pn),
 39 |         ('kd', keyword)
 40 |     ])
 41 |     req = urllib2.Request(url, headers=headers)
 42 | 
 43 |     page = urllib2.urlopen(req, data=page_data.encode('utf-8')).read()
 44 |     page = page.decode('utf-8')
 45 |     return page
 46 | 
 47 | #获取所需的岗位ID，每一个招聘页面详情都有一个所属的ID索引
 48 | def read_id(page):
 49 |     tag = 'positionId'
 50 |     page_json = json.loads(page)
 51 |     page_json = page_json['content']['positionResult']['result']
 52 |     company_list = []
 53 |     for i in range(15):
 54 |         company_list.append(page_json[i].get(tag))
 55 |     return  company_list
 56 | 
 57 | # 获取当前招聘关键词的最大页数，大于30的将会被覆盖，所以最多只能抓取30页的招聘信息
 58 | def read_max_page(page):
 59 |     page_json = json.loads(page)
 60 |     max_page_num = page_json['content']['pageSize']
 61 |     if max_page_num > 30:
 62 |         max_page_num = 30
 63 |     return max_page_num
 64 | 
 65 | #获取职位页面，由positionId和BaseUrl组合成目标地址
 66 | def get_content(company_id):
 67 |     fin_url = r'http://www.lagou.com/jobs/%s.html' % company_id
 68 |     headers = {
 69 |         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
 70 |         'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
 71 |         'Host': 'www.lagou.com',
 72 |         'Connection': 'keep-alive',
 73 |         'Origin': 'http://www.lagou.com'
 74 |     }
 75 |     req = urllib2.Request(fin_url, headers=headers)
 76 |     #page = urllib.urlopen(req).read()
 77 |     page = urllib2.urlopen(req).read()
 78 |     content = page.decode('utf-8')
 79 |     return content
 80 | 
 81 | #获取职位需求（通过re来去除html标记），可以将职位详情单独存储
 82 | def get_result(content):
 83 |     soup = Bs(content, 'lxml')
 84 |     job_description = soup.select('dd[class="job_bt"]')
 85 |     job_description = str(job_description[0])
 86 |     rule = re.compile(r'<[^>]+>')
 87 |     result = rule.sub('', job_description)
 88 |     return result
 89 | 
 90 | #过滤关键词：目前筛选的方式只是选取英文关键词
 91 | def search_skill(result):
 92 |     rule = re.compile(r'[a-zA-z]+')
 93 |     skil_list = rule.findall(result)
 94 |     return skil_list
 95 | 
 96 | # 对出现的关键词计数，并排序，选取Top80的关键词作为数据的样本
 97 | def count_skill(skill_list):
 98 |     for i in range(len(skill_list)):
 99 |         skill_list[i] = skill_list[i].lower()
100 |     count_dict = Counter(skill_list).most_common(80)
101 |     return count_dict
102 | 
def save_excel(count_dict, file_name):
    """Write the keyword ranking to a workbook and add an area chart.

    Row 1 holds the column titles and every ``(word, count)`` entry follows
    on its own row starting at row 2. The chart plots exactly the rows that
    were written.

    Args:
        count_dict: Sequence of ``(word, count)`` pairs, already ordered.
        file_name: Base name of the output file under ``E:\\positions``.
    """
    # NOTE(review): xlsxwriter produces the xlsx format, so the ".xls"
    # suffix may make Excel warn on open -- confirm before renaming, since
    # the output path is part of this function's observable behavior.
    book = xlsxwriter.Workbook(r'E:\positions\%s.xls' % file_name)
    sheet = book.add_worksheet()
    sheet.write_row('A1', ['关键词', '频次'])
    # Data starts on row 2. Enumerating the entries directly writes all of
    # them; the former index loop stopped two entries short.
    for row, entry in enumerate(count_dict, 2):
        sheet.write_row('A%s' % row, list(entry))
    # Last data row, never above the first data row so the range stays valid.
    last_row = max(len(count_dict) + 1, 2)
    chart1 = book.add_chart({'type': 'area'})
    chart1.add_series({
        'name': '=Sheet1!$B$1',
        'categories': '=Sheet1!$A$2:$A$%d' % last_row,
        'values': '=Sheet1!$B$2:$B$%d' % last_row,
    })
    chart1.set_title({'name': '关键词排名'})
    chart1.set_x_axis({'name': '关键词'})
    chart1.set_y_axis({'name': '频次(/次)'})
    sheet.insert_chart('C2', chart1, {'x_offset': 15, 'y_offset': 10})
    book.close()
127 | 
if __name__ == '__main__':
    # Crawl the result pages for the keyword, collect the English words
    # found in each job description, then rank them and export the ranking.
    max_pn = read_max_page(get_page(url, 1, keyword))  # pages to crawl
    fin_skill_list = []  # every word found across all positions
    # Page numbers start at 1; "+ 1" makes the last page part of the crawl
    # (range() excludes its upper bound).
    for pn in range(1, max_pn + 1):
        print('***********************正在抓取第%s页信息***********************' % pn)
        page = get_page(url, pn, keyword)
        for company_id in read_id(page):
            content = get_content(company_id)
            result = get_result(content)
            fin_skill_list.extend(search_skill(result))
    print('***********************开始统计关键词出现频率***********************')
    count_dict = count_skill(fin_skill_list)
    print(count_dict)
    file_name = raw_input(r'请输入要保存的文件名：')
    # Announce the save before it happens; a raw string keeps the backslash
    # in the path without relying on an invalid escape sequence.
    print(r'***********************正在保存到E:\positions***********************')
    save_excel(count_dict, file_name)
    endtime = datetime.datetime.now()
    # total_seconds() covers the whole interval; ".seconds" alone drops
    # whole days from the difference.
    elapsed = int((endtime - starttime).total_seconds())
    print('总共用时：%s s' % elapsed)
149 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # python-spider
  2 | 知己知彼，方可百战不殆。在学习技术的时候我们往往面临太多选择而不知所措，可能是各个方面都有涉猎，对某个领域没有深入研究，看似什么都会，真要让你做个什么东西的时候就显得捉襟见肘。如果我们能从招聘职位所需的技能开始学习，便可练就一身硬功夫，为实战应用打下良好的基础。
  3 | 
  4 | 我们的目的主要是通过python抓取拉勾网的招聘详情，并筛选其中的技能关键词，存储到excel中。
  5 | 
  6 | # 一、获取职位需求数据
  7 | 
  8 | 通过观察可以发现，拉勾网的职位页面详情是由 http://www.lagou.com/jobs/ + ***** (PositionId).html 组成，而PositionId可以通过分析Json的XHR获得。而红框里的职位描述内容是我们要抓取的数据。
  9 | 
 10 | ![Paste_Image.png](http://upload-images.jianshu.io/upload_images/2815894-6c3c3f2d777c2019.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
 11 | 
 12 | 
 13 | ![Paste_Image.png](http://upload-images.jianshu.io/upload_images/2815894-df6eb3d1ae9bca28.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
 14 | 
 15 | 知道了数据的源头，接下来就按照常规步骤包装Headers，提交FormData来获取反馈数据。
 16 | ## 获取PositionId列表所在页面：
 17 | 
 18 | ```python
 19 | #获取职位的查询页面，（参数分别为网址，当前页面数，关键词）
 20 | def get_page(url, pn, keyword):
 21 |     headers = {
 22 |         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
 23 |         'Host': 'www.lagou.com',
 24 |         'Connection': 'keep-alive',
 25 |         'Origin': 'http://www.lagou.com'
 26 |     }
 27 |     if pn == 1:
 28 |         boo = 'true'
 29 |     else:
 30 |         boo = 'false'
 31 |     page_data = urllib.urlencode([
 32 |         ('first', boo),
 33 |         ('pn', pn),
 34 |         ('kd', keyword)
 35 |     ])
 36 |     req = urllib2.Request(url, headers=headers)
 37 | 
 38 |     page = urllib2.urlopen(req, data=page_data.encode('utf-8')).read()
 39 |     page = page.decode('utf-8')
 40 |     return page
 41 | ```
 42 | ## 通过Json获取PositionId：
 43 | ```python
 44 | 
 45 | #获取所需的岗位ID，每一个招聘页面详情都有一个所属的ID索引
 46 | def read_id(page):
 47 |     tag = 'positionId'
 48 |     page_json = json.loads(page)
 49 |     page_json = page_json['content']['positionResult']['result']
 50 |     company_list = []
 51 |     for i in range(15):
 52 |         company_list.append(page_json[i].get(tag))
 53 |     return  company_list
 54 | ```
 55 | ## 合成目标url：
 56 | ```
 57 | #获取职位页面，由positionId和BaseUrl组合成目标地址
 58 | def get_content(company_id):
 59 |     fin_url = r'http://www.lagou.com/jobs/%s.html' % company_id
 60 |     headers = {
 61 |         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
 62 |         'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
 63 |         'Host': 'www.lagou.com',
 64 |         'Connection': 'keep-alive',
 65 |         'Origin': 'http://www.lagou.com'
 66 |     }
 67 |     req = urllib2.Request(fin_url, headers=headers)
 68 |     #page = urllib.urlopen(req).read()
 69 |     page = urllib2.urlopen(req).read()
 70 |     content = page.decode('utf-8')
 71 |     return content
 72 | ```
 73 | # 二、对数据进行处理
 74 | 获取数据之后，需要对数据进行清洗，通过BeautifulSoup抓取的职位内容包含Html标签，需要让数据脱去这层“外衣”。
 75 | 
 76 | ```
 77 | #获取职位需求（通过re来去除html标记），可以将职位详情单独存储
 78 | def get_result(content):
 79 |     soup = Bs(content, 'lxml')
 80 |     job_description = soup.select('dd[class="job_bt"]')
 81 |     job_description = str(job_description[0])
 82 |     rule = re.compile(r'<[^>]+>')
 83 |     result = rule.sub('', job_description)
 84 |     return result
 85 | ```
 86 | 现在得到的数据就是职位描述信息，我们要从职位信息当中筛选我们所关注的任职要求关键词。
 87 | 
 88 | ![Paste_Image.png](http://upload-images.jianshu.io/upload_images/2815894-3f07bb128d91e883.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
 89 | 
 90 | 我们将这些关键词筛选出来，存储到List当中。经过对整个500+职位进行爬取，我们得到了职位技能关键词的总表。
 91 | ```
 92 | #过滤关键词：目前筛选的方式只是选取英文关键词
 93 | def search_skill(result):
 94 |     rule = re.compile(r'[a-zA-z]+')
 95 |     skil_list = rule.findall(result)
 96 |     return skil_list
 97 | ```
 98 | 对关键词按照500+职位需求出现的频次进行排序，选取频次排序Top80的关键词，去除无效的关键词。
 99 | 
100 | ```
def count_skill(skill_list):
    """Count case-insensitive word frequencies and keep the 80 most common.

    Args:
        skill_list: Iterable of words. It is left unmodified.

    Returns:
        A list of ``(word, count)`` tuples with the words lower-cased,
        ordered from most to least frequent and holding at most 80 entries.
    """
    # Lower-case on the fly so the caller's list is not rewritten in place.
    lowered = (skill.lower() for skill in skill_list)
    return Counter(lowered).most_common(80)
107 | ```
108 | 
109 | # 三、对数据进行存储和可视化处理
110 | 
111 | ```
def save_excel(count_dict, file_name):
    """Write the keyword ranking to a workbook and add an area chart.

    Row 1 holds the column titles and every ``(word, count)`` entry follows
    on its own row starting at row 2. The chart plots exactly the rows that
    were written.

    Args:
        count_dict: Sequence of ``(word, count)`` pairs, already ordered.
        file_name: Base name of the output file under ``E:\\positions``.
    """
    # NOTE(review): xlsxwriter produces the xlsx format, so the ".xls"
    # suffix may make Excel warn on open -- confirm before renaming, since
    # the output path is part of this function's observable behavior.
    book = xlsxwriter.Workbook(r'E:\positions\%s.xls' % file_name)
    sheet = book.add_worksheet()
    sheet.write_row('A1', ['关键词', '频次'])
    # Data starts on row 2. Enumerating the entries directly writes all of
    # them; the former index loop stopped two entries short.
    for row, entry in enumerate(count_dict, 2):
        sheet.write_row('A%s' % row, list(entry))
    # Last data row, never above the first data row so the range stays valid.
    last_row = max(len(count_dict) + 1, 2)
    chart1 = book.add_chart({'type': 'area'})
    chart1.add_series({
        'name': '=Sheet1!$B$1',
        'categories': '=Sheet1!$A$2:$A$%d' % last_row,
        'values': '=Sheet1!$B$2:$B$%d' % last_row,
    })
    chart1.set_title({'name': '关键词排名'})
    chart1.set_x_axis({'name': '关键词'})
    chart1.set_y_axis({'name': '频次(/次)'})
    sheet.insert_chart('C2', chart1, {'x_offset': 15, 'y_offset': 10})
    book.close()
136 | ```
137 | 
138 | ![数据可视化展示](http://upload-images.jianshu.io/upload_images/2815894-b47a78419a9c2d6c.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
139 | 
140 | 


--------------------------------------------------------------------------------
/images/2016-10-17_22-04-30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fuyunzhishang/python-spider/79c93b06dd2f2c278b8185a515650e7a0b8668f1/images/2016-10-17_22-04-30.png


--------------------------------------------------------------------------------