├── .gitattributes
├── README.md
├── analysis&machine-learning
└── jobs_ml_reg.ipynb
├── analysis&visualizations
└── jobs_analysis.ipynb
├── city_data
├── city.csv
├── city.ipynb
└── city.py
├── pics
├── 1.png
├── 10.png
├── 11.png
├── 12.png
├── 13.png
├── 14.png
├── 2.png
├── 3.png
├── 4.png
├── 5.png
├── 6.png
├── 7.png
├── 8.png
├── 9.png
├── ML部分
│ ├── 为标签编码.png
│ ├── 决策树训练集误差.png
│ ├── 划分训练集与测试集.png
│ ├── 变量重要性.png
│ ├── 属性合并.png
│ ├── 筛选岗位数量前150.png
│ ├── 编码分类结果.png
│ ├── 过滤后的分布.png
│ ├── 过滤薪资.png
│ ├── 重要性分析.png
│ └── 随机森林训练集误差.png
└── url.png
└── spiders
├── citylist_spider.py
└── jobs_spider.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-language=python
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BOSS直聘的数据分析
2 | ---
3 | ## 库依赖
4 | - 基于Python3.6
5 | - Jupyter Notebook
6 | - pyecharts
7 | - pymongo
8 | - pandas
9 | - numpy
10 | ---
11 | ## 爬虫实现过程
12 | ### 分析URL
13 | 
14 | - c后的编号对应不同城市
15 | - page后的数字则对应页码
16 |
17 | ---
18 |
19 | ### 爬取所有省、市对应的code
20 | city.py实现地区与对应code的爬取
21 |
22 | ---
23 | ### 根据codelist爬取所有地区的职位
24 | - 爬取内容包含:signal、省、市、职位名称、薪资、公司名称、工作经验、学历要求、公司规模。
25 |
26 | - signal字段作用在于重复爬取时跳过已爬取的页面。
27 | - 存入MongoDB中
28 |
29 |
30 | ---
31 | ## 进行数据分析
32 | ### 读取数据
33 | 利用pandas读取数据库中数据
34 |
35 |
36 | ---
37 | ### 添加新列:salary
38 | 利用正则从[职位薪资]中提取数字(薪资上下限)并取均值,作为新列salary
39 |
40 |
41 | ---
42 | ### 数据清洗
43 | - 移除重复数据
44 |
45 |
46 |
47 | ---
48 |
49 | - 数据筛选过滤,去除过高和过低的薪资
50 |
51 |
52 |
53 | - 去除与“数据分析”无关的岗位信息
54 |
55 | ---
56 | ## 数据可视化
57 |
58 |
59 | ---
60 |
61 |
62 |
63 | ---
64 |
65 |
66 |
67 | ---
68 |
69 |
70 |
71 | ---
72 |
73 |
74 |
75 | ---
76 |
77 | ## 机器学习部分分析
78 |
79 |
80 | ---
81 |
82 |
83 |
84 | ---
85 |
86 |
87 |
88 | ---
89 |
90 |
91 |
92 | ---
93 |
94 |
95 |
96 | ---
97 |
98 |
99 |
100 | ---
101 | ### **变量重要性**
102 |
103 |
104 |
105 | ---
106 |
107 |
108 |
--------------------------------------------------------------------------------
/analysis&machine-learning/jobs_ml_reg.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# **机器学习**"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 262,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import pymongo\n",
19 | "import pandas as pd\n",
20 | "from pandas import Series\n",
21 | "client = pymongo.MongoClient('localhost',27017)\n",
22 | "db = client['Graduation_project']\n",
23 | "table = db['jobs_info']\n",
24 | "data = pd.DataFrame(list(table.find()))\n",
25 | "del data['_id']\n",
26 | "del data['signal']"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 263,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/plain": [
37 | "(65059, 8)"
38 | ]
39 | },
40 | "execution_count": 263,
41 | "metadata": {},
42 | "output_type": "execute_result"
43 | }
44 | ],
45 | "source": [
46 | "data.shape # 65059 rows × 8 columns"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 264,
52 | "metadata": {
53 | "collapsed": true
54 | },
55 | "outputs": [],
56 | "source": [
57 | "import re\n",
58 | "# 均值函数\n",
59 | "def average(job_salary):\n",
60 | " # 取薪资均值----------------\n",
61 | " pattern = re.compile('\\d+')\n",
62 | " salary = job_salary\n",
63 | " try:\n",
64 | " res = re.findall(pattern, salary)\n",
65 | " avg_salary = 0\n",
66 | " sum = 0\n",
67 | " for i in res:\n",
68 | " a = int(i)\n",
69 | " sum = sum + a\n",
70 | " avg_salary = sum / 2\n",
71 | " except Exception:\n",
72 | " avg_salary = 0\n",
73 | " # 函数返回值\n",
74 | " return avg_salary"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 265,
80 | "metadata": {
81 | "collapsed": true
82 | },
83 | "outputs": [],
84 | "source": [
85 | "salary_list = []\n",
86 | "for i in range(0,65059):\n",
87 | " avg_sal = average(data['职位薪资'][i])\n",
88 | " salary_list.append(avg_sal)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 266,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "sal = Series(salary_list)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 267,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "data.insert(8,'salary',sal)"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 268,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "#data"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 269,
121 | "metadata": {
122 | "scrolled": false
123 | },
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/html": [
128 | "
\n",
129 | "\n",
142 | "
\n",
143 | " \n",
144 | " \n",
145 | " | \n",
146 | " 城市 | \n",
147 | " 职位名称 | \n",
148 | " 工作经验 | \n",
149 | " 公司规模 | \n",
150 | " 学历要求 | \n",
151 | " salary | \n",
152 | "
\n",
153 | " \n",
154 | " \n",
155 | " \n",
156 | " 0 | \n",
157 | " 北京 | \n",
158 | " 数据分析 | \n",
159 | " 经验不限 | \n",
160 | " 10000人以上 | \n",
161 | " 本科 | \n",
162 | " 8.0 | \n",
163 | "
\n",
164 | " \n",
165 | " 1 | \n",
166 | " 北京 | \n",
167 | " 数据分析师 | \n",
168 | " 1-3年 | \n",
169 | " 100-499人 | \n",
170 | " 本科 | \n",
171 | " 27.5 | \n",
172 | "
\n",
173 | " \n",
174 | " 2 | \n",
175 | " 北京 | \n",
176 | " 数据分析师 | \n",
177 | " 3-5年 | \n",
178 | " 1000-9999人 | \n",
179 | " 本科 | \n",
180 | " 20.0 | \n",
181 | "
\n",
182 | " \n",
183 | " 3 | \n",
184 | " 北京 | \n",
185 | " 数据分析 | \n",
186 | " 经验不限 | \n",
187 | " 10000人以上 | \n",
188 | " 本科 | \n",
189 | " 9.0 | \n",
190 | "
\n",
191 | " \n",
192 | " 4 | \n",
193 | " 北京 | \n",
194 | " 数据分析师 | \n",
195 | " 3-5年 | \n",
196 | " 10000人以上 | \n",
197 | " 本科 | \n",
198 | " 12.5 | \n",
199 | "
\n",
200 | " \n",
201 | "
\n",
202 | "
"
203 | ],
204 | "text/plain": [
205 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n",
206 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n",
207 | "1 北京 数据分析师 1-3年 100-499人 本科 27.5\n",
208 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n",
209 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n",
210 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5"
211 | ]
212 | },
213 | "execution_count": 269,
214 | "metadata": {},
215 | "output_type": "execute_result"
216 | }
217 | ],
218 | "source": [
219 | "job_data = data[['城市','职位名称','工作经验','公司规模','学历要求','salary']]\n",
220 | "job_data.head()"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 270,
226 | "metadata": {},
227 | "outputs": [
228 | {
229 | "data": {
230 | "text/plain": [
231 | "(65059, 6)"
232 | ]
233 | },
234 | "execution_count": 270,
235 | "metadata": {},
236 | "output_type": "execute_result"
237 | }
238 | ],
239 | "source": [
240 | "job_data.shape # 65059条数据"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "### 薪资分布"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 271,
253 | "metadata": {
254 | "scrolled": false
255 | },
256 | "outputs": [
257 | {
258 | "data": {
259 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfYAAAE/CAYAAAC0DOHAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAHWBJREFUeJzt3X+QXfV53/H3J+JHHMvmh7F3VKFW\npFZSEzPBsAU6jtMFPCBwEpE27kCJEQ4ZJQ6kces0yMkkONi00DbxlIY4lYtqkdgW1LEHDcglDGHj\n8QxgwMYIjAkyVoxAhmBhjGwHV87TP+5XmWuxv3XF3j37fs3c2Xuf8z1nv4/O3v3cc+7R3VQVkiSp\nG35ovicgSZIGx2CXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SVNKUkleP9/zkDQzBrskSR1i\nsEs6KJIcMt9zkBYjg11aRJJcnuTJJC8keTTJmUlOSXJXkm8m2ZXkj5IcNsn6b0vyhSTfSvJEkvf1\nLVvZTttfkuRrwF8muTXJr++3jQeTnHdwO5UWL4NdWiSS/DhwGfDPq+pVwNnADuD7wL8HjgH+BXAm\n8GuTbObbwEXAkcDbgHdNENL/EnhD2/4m4Bf75vCTwHJg60CakvQSBru0eHwfOBw4PsmhVbWjqr5S\nVfdX1d1VtbeqdgD/k144v0RVjVfVtqr6+6p6EPj4BGPfV1XfrqrvAjcDq5KsasveAdxYVd87GA1K\nMtilRaOqtgPvBt4HPJNkc5J/lOTHktyS5OtJvgX8J3pH7y+R5NQkdyb52yTPA786wdgn+r7ni8BN\nwC8m+SHgAuBPB96cpH9gsEuLSFV9rKp+CvgnQAHXAB8CvgysqqpXA78NZJJNfAzYAqyoqiOAP5lg\n7P5/MnITcCG9U/zfqaq7BtGLpIkZ7NIikeTHk5yR5HDg74Dv0js9/yrgW8CeJP8MeNcUm3kVsLuq\n/i7JKcC/ne77tiD/e+AP8GhdOugMdmnxOBy4GngW+DrwOnpH579JL6BfAD4M3DjFNn4NuDLJC8Dv\n0TvNPhM3ACcAfzanmUuasVTtf9ZMkgYryUXAuvY2gKSDyCN2SQdVkh+hd6S/Yb7nIi0GBrukgybJ\n2cDfAk/Tu/BO0kHmqXhJkjrEI3ZJkjrEYJckqUMW7F9fOuaYY2rlypUD2963v/1tXvnKVw5se/PN\nfoab/Qw3+xlui7Wf+++//9mqeu104xZssK9cuZL77rtvYNsbHx9nbGxsYNubb/Yz3OxnuNnPcFus\n/ST5m5lsz1PxkiR1iMEuSVKHGOySJHWIwS5JUocY7JIkdYjBLklShxjskiR1iMEuSVKHGOySJHWI\nwS5JUodMG+xJfjjJ55J8McnDSX6/1T+S5KtJHmi3E1s9Sa5Nsj3Jg0lO6tvW2iSPtdvavvrJSba1\nda5NkoPRrCRJXTeTz4p/ETijqvYkORT4bJJPt2X/sao+sd/4c4BV7XYq8CHg1CRHA1cAo0AB9yfZ\nUlXPtTHrgLuBrcBq4NMMkZXrb51y+Y6r3/YyzUSSpMlNe8RePXvaw0PbraZYZQ1wQ1vvbuDIJMuA\ns4Hbq2p3C/PbgdVt2aur6q6qKuAG4LwD6EmSpEVrRu+xJ1mS5AHgGXrhfE9bdFU73f7BJIe32nLg\nib7Vd7baVPWdE9QlSdIspXeQPMPByZHAp4BfB74BfB04DNgAfKWqrkxyK/Cfq+qzbZ07gN8CzgAO\nr6oPtPrvAt8BPtPGv7XV3wL8VlX97ATffx29U/aMjIycvHnz5jk1PZE9e/awdOnSSZdve/L5Kdc/\nYfkRA5vLIEzXz0JjP8PNfoab/Qy3mfZz+umn319Vo9ONm9XfY6+qbyYZB1ZX1X9r5ReT/G/gN9vj\nncCKvtWOBZ5q9bH96uOtfuwE4yf6/h
vovYhgdHS0Bvn3eKf7e7gXT/ce+4WDm8sgLNa/V7xQ2M9w\ns5/hZj9Tm8lV8a9tR+okeQXwVuDL7b1x2hXs5wEPtVW2ABe1q+NPA56vql3AbcBZSY5KchRwFnBb\nW/ZCktPati4Cbh5Yh5IkLSIzOWJfBmxKsoTeC4GbquqWJH+Z5LVAgAeAX23jtwLnAtvpnWp/J0BV\n7U7yfuDeNu7Kqtrd7r8L+AjwCnpXww/VFfGSJC0U0wZ7VT0IvGmC+hmTjC/g0kmWbQQ2TlC/D3jj\ndHORJElT85PnJEnqEINdkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQ\ng12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINd\nkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeqQaYM9yQ8n+VySLyZ5OMnv\nt/pxSe5J8liSG5Mc1uqHt8fb2/KVfdt6b6s/muTsvvrqVtueZP3g25QkaXGYyRH7i8AZVfWTwInA\n6iSnAdcAH6yqVcBzwCVt/CXAc1X1euCDbRxJjgfOB34CWA38cZIlSZYA1wHnAMcDF7SxkiRplqYN\n9urZ0x4e2m4FnAF8otU3Aee1+2vaY9ryM5Ok1TdX1YtV9VVgO3BKu22vqser6nvA5jZWkiTN0oze\nY29H1g8AzwC3A18BvllVe9uQncDydn858ARAW/488Jr++n7rTFaXJEmzdMhMBlXV94ETkxwJfAp4\nw0TD2tdMsmyy+kQvLmqCGknWAesARkZGGB8fn3ris7Bnz54pt/eeE/ZOugwY6FwGYbp+Fhr7GW72\nM9zsZ7gNup8ZBfs+VfXNJOPAacCRSQ5pR+XHAk+1YTuBFcDOJIcARwC7++r79K8zWX3/778B2AAw\nOjpaY2Njs5n+lMbHx5lqexevv3XK9XdcOLi5DMJ0/Sw09jPc7Ge42c9wG3Q/M7kq/rXtSJ0krwDe\nCjwC3An8Qhu2Fri53d/SHtOW/2VVVauf366aPw5YBXwOuBdY1a6yP4zeBXZbBtGcJEmLzUyO2JcB\nm9rV6z8E3FRVtyT5ErA5yQeALwDXt/HXA3+aZDu9I/XzAarq4SQ3AV8C9gKXtlP8JLkMuA1YAmys\nqocH1qEkSYvItMFeVQ8Cb5qg/ji9K9r3r/8d8PZJtnUVcNUE9a3A1hnMV5IkTcFPnpMkqUMMdkmS\nOsRglySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJkjrE\nYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCX\nJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6ZNpgT7IiyZ1JHknycJLfaPX3JXkyyQPtdm7f\nOu9Nsj3Jo0nO7quvbrXtSdb31Y9Lck+Sx5LcmOSwQTcqSdJiMJMj9r3Ae6rqDcBpwKVJjm/LPlhV\nJ7bbVoC27HzgJ4DVwB8nWZJkCXAdcA5wPHBB33auadtaBTwHXDKg/iRJWlSmDfaq2lVVn2/3XwAe\nAZZPscoaYHNVvVhVXwW2A6e02/aqeryqvgdsBtYkCXAG8Im2/ibgvLk2JEnSYjar99iTrATeBNzT\nSpcleTDJxiRHtdpy4Im+1Xa22mT11wDfrKq9+9UlSdIspapmNjBZCvwVcFVVfTLJCPAsUMD7gWVV\n9UtJrgPuqqo/a+tdD2yl9yLi7Kr65VZ/B72j+Cvb+Ne3+gpga1WdMMEc1gHrAEZGRk7evHnz3Dvf\nz549e1i6dOmky7c9+fyU65+w/IiBzWUQputnobGf4WY/w81+httM+zn99NPvr6rR6cYdMpNvmuRQ\n4M
+Bj1bVJwGq6um+5R8GbmkPdwIr+lY/Fniq3Z+o/ixwZJJD2lF7//gfUFUbgA0Ao6OjNTY2NpPp\nz8j4+DhTbe/i9bdOuf6OCwc3l0GYrp+Fxn6Gm/0MN/sZboPuZyZXxQe4Hnikqv6wr76sb9jPAw+1\n+1uA85McnuQ4YBXwOeBeYFW7Av4wehfYbaneKYM7gV9o668Fbj6wtiRJWpxmcsT+ZuAdwLYkD7Ta\nb9O7qv1EeqfidwC/AlBVDye5CfgSvSvqL62q7wMkuQy4DVgCbKyqh9v2Lgc2J/kA8AV6LyQkSdIs\nTRvsVfVZIBMs2jrFOlcBV01Q3zrRelX1OL332yVJ0gHwk+ckSeoQg12SpA4x2CVJ6hCDXZKkDjHY\nJUnqEINdkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQg12SpA4x2CVJ\n6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQ\ng12SpA4x2CVJ6hCDXZKkDpk22JOsSHJnkkeSPJzkN1r96CS3J3msfT2q1ZPk2iTbkzyY5KS+ba1t\n4x9LsravfnKSbW2da5PkYDQrSVLXzeSIfS/wnqp6A3AacGmS44H1wB1VtQq4oz0GOAdY1W7rgA9B\n74UAcAVwKnAKcMW+FwNtzLq+9VYfeGuSJC0+0wZ7Ve2qqs+3+y8AjwDLgTXApjZsE3Beu78GuKF6\n7gaOTLIMOBu4vap2V9VzwO3A6rbs1VV1V1UVcEPftiRJ0iykl6UzHJysBD4DvBH4WlUd2bfsuao6\nKsktwNVV9dlWvwO4HBgDfriqPtDqvwt8Fxhv49/a6m8BLq+qn5ng+6+jd2TPyMjIyZs3b55lu5Pb\ns2cPS5cunXT5tiefn3L9E5YfMbC5DMJ0/Sw09jPc7Ge42c9wm2k/p59++v1VNTrduENm+o2TLAX+\nHHh3VX1rirfBJ1pQc6i/tFi1AdgAMDo6WmNjY9PMeubGx8eZansXr791yvV3XDi4uQzCdP0sNPYz\n3OxnuNnPcBt0PzO6Kj7JofRC/aNV9clWfrqdRqd9fabVdwIr+lY/FnhqmvqxE9QlSdIszeSq+ADX\nA49U1R/2LdoC7LuyfS1wc1/9onZ1/GnA81W1C7gNOCvJUe2iubOA29qyF5Kc1r7XRX3bkiRJszCT\nU/FvBt4BbEvyQKv9NnA1cFOSS4CvAW9vy7YC5wLbge8A7wSoqt1J3g/c28ZdWVW72/13AR8BXgF8\nut0kSdIsTRvs7SK4yd5QP3OC8QVcOsm2NgIbJ6jfR++CPEmSdAD85DlJkjrEYJckqUMMdkmSOsRg\nlySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJkjrEYJck\nqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlD\nDHZJkjrEYJckqUMMdkmSOmTaYE+yMckzSR7qq70vyZNJHmi3c/uWvTfJ9iSPJjm7r7661bYnWd9X\nPy7JPUkeS3JjksMG2aAkSYvJTI7YPwKsnqD+wao6sd22AiQ5Hjgf+Im2zh8nWZJkCXAdcA5wPHBB\nGwtwTdvWKuA54JIDaUiSpMVs2mCvqs8Au2e4vTXA5qp6saq+CmwHTmm37VX1eFV9D9gMrEkS4Azg\nE239TcB5s+xBkiQ1B/Ie+2VJHmyn6o9qteXAE31jdrbaZPXXAN+sqr371SVJ0hykqqYflKwEbqmq\nN7bHI8CzQAHvB5ZV1S8luQ64q6r+rI27HthK7wXE2VX1y63+DnpH8Ve28a9v9RXA1qo6YZJ5rAPW\nAYyMjJy8efPmObb9Unv27GHp0qWTLt/25PNTrn/C8iMGNpdBmK6f
hcZ+hpv9DDf7GW4z7ef000+/\nv6pGpxt3yFwmUVVP77uf5MPALe3hTmBF39Bjgafa/YnqzwJHJjmkHbX3j5/o+24ANgCMjo7W2NjY\nXKY/ofHxcaba3sXrb51y/R0XDm4ugzBdPwuN/Qw3+xlu9jPcBt3PnE7FJ1nW9/DngX1XzG8Bzk9y\neJLjgFXA54B7gVXtCvjD6F1gt6V6pwvuBH6hrb8WuHkuc5IkSTM4Yk/ycWAMOCbJTuAKYCzJifRO\nxe8AfgWgqh5OchPwJWAvcGlVfb9t5zLgNmAJsLGqHm7f4nJgc5IPAF8Arh9Yd5IkLTLTBntVXTBB\nedLwraqrgKsmqG+l9377/vXH6b3fLkmSDpCfPCdJUocY7JIkdYjBLklShxjskiR1iMEuSVKHGOyS\nJHWIwS5JUocY7JIkdYjBLklShxjskiR1iMEuSVKHGOySJHWIwS5JUocY7JIkdYjBLklShxjskiR1\niMEuSVKHGOySJHWIwS5JUocY7JIkdYjBLklShxjskiR1iMEuSVKHGOySJHWIwS5JUocY7JIkdYjB\nLklShxjskiR1yLTBnmRjkmeSPNRXOzrJ7Ukea1+PavUkuTbJ9iQPJjmpb521bfxjSdb21U9Osq2t\nc22SDLpJSZIWi0NmMOYjwB8BN/TV1gN3VNXVSda3x5cD5wCr2u1U4EPAqUmOBq4ARoEC7k+ypaqe\na2PWAXcDW4HVwKcPvLWX18r1t065fMfVb3uZZiJJWsymPWKvqs8Au/crrwE2tfubgPP66jdUz93A\nkUmWAWcDt1fV7hbmtwOr27JXV9VdVVX0XjychyRJmpO5vsc+UlW7ANrX17X6cuCJvnE7W22q+s4J\n6pIkaQ5mcip+NiZ6f7zmUJ9448k6eqftGRkZYXx8fA5TnNiePXum3N57Tth7QNsf5FxnYrp+Fhr7\nGW72M9zsZ7gNup+5BvvTSZZV1a52Ov2ZVt8JrOgbdyzwVKuP7Vcfb/VjJxg/oaraAGwAGB0drbGx\nscmGztr4+DhTbe/iad5Dn86OCyff9sEwXT8Ljf0MN/sZbvYz3Abdz1xPxW8B9l3Zvha4ua9+Ubs6\n/jTg+Xaq/jbgrCRHtSvozwJua8teSHJauxr+or5tSZKkWZr2iD3Jx+kdbR+TZCe9q9uvBm5Kcgnw\nNeDtbfhW4FxgO/Ad4J0AVbU7yfuBe9u4K6tq3wV576J35f0r6F0Nv+CuiJckaVhMG+xVdcEki86c\nYGwBl06ynY3Axgnq9wFvnG4ekiRpen7ynCRJHWKwS5LUIQa7JEkdYrBLktQhBrskSR1isEuS1CEG\nuyRJHWKwS5LUIQa7JEkdYrBLktQhBrskSR1isEuS1CEGuyRJHWKwS5LUIdP+2dbFYtuTz3Px+lvn\nexqSJB0Qj9glSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQg12SpA7xA2peJiun\n+fCbHVe/7WWaiSSpyzxilySpQwx2SZI6xGCXJKlDDijYk+xIsi3JA0nua7Wjk9ye5LH29ahWT5Jr\nk2xP8mCSk/q2s7aNfyzJ2gNrSZKkxWsQR+ynV9WJVTXaHq8H7qiqVcAd7THAOcCqdlsHfAh6LwSA\nK4BTgVOAK/a9GJAkSbNzME7FrwE2tfubgPP66jdUz93AkUmWAWcDt1fV7qp6DrgdWH0Q5iVJUucd\naLAX8BdJ7k+yrtVGqmoXQPv6ulZfDjzRt+7OVpusLkmSZulA/x/7m6vqqSSvA25P8uUpxmaCWk1R\nf+kGei8e1gGMjIwwPj4+y+lObuQV8J4T9g5se7M1yF4A9uzZM/Btzif7GW72M9zsZ7gNup8DCvaq\neqp9fSbJp+i9R/50kmVVtaudan+mDd8JrOhb/VjgqVYf268+Psn32wBsABgdHa2xsbGJhs3J//jo\nzfzBtvn7vJ4dF44NdHvj4+MM
8t9nvtnPcLOf4WY/w23Q/cz5VHySVyZ51b77wFnAQ8AWYN+V7WuB\nm9v9LcBF7er404Dn26n624CzkhzVLpo7q9UkSdIsHcgh6gjwqST7tvOxqvq/Se4FbkpyCfA14O1t\n/FbgXGA78B3gnQBVtTvJ+4F727grq2r3AcxLkqRFa87BXlWPAz85Qf0bwJkT1Au4dJJtbQQ2znUu\nkiSpx0+ekySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJ\nkjrEYJckqUMMdkmSOsRglySpQwx2SZI65ED+HrteZivX3zrl8h1Xv+1lmokkaVh5xC5JUocY7JIk\ndYjBLklShxjskiR1iMEuSVKHGOySJHWIwS5JUof4/9iHxHT/R12SpJnwiF2SpA4x2CVJ6hCDXZKk\nDvE99kXEz5qXpO7ziF2SpA4ZmiP2JKuB/w4sAf5XVV09z1NacPqPyN9zwl4u9kp7SVp0huKIPckS\n4DrgHOB44IIkx8/vrCRJWniG5Yj9FGB7VT0OkGQzsAb40rzOSj/A9+glafgNS7AvB57oe7wTOHWe\n5rJoHeiH5BzMD9nZ99aCLx4kaWrDEuyZoFYvGZSsA9a1h3uSPDrAORwDPDvA7c2rf9fRfnLNfM9k\nYDq1f7CfYWc/w22m/fyTmWxsWIJ9J7Ci7/GxwFP7D6qqDcCGgzGBJPdV1ejB2PZ8sJ/hZj/DzX6G\nm/1MbSgungPuBVYlOS7JYcD5wJZ5npMkSQvOUByxV9XeJJcBt9H7724bq+rheZ6WJEkLzlAEO0BV\nbQW2zuMUDsop/nlkP8PNfoab/Qw3+5lCql5yjZokSVqghuU9dkmSNACLPtiTrE7yaJLtSdbP93xm\nK8mKJHcmeSTJw0l+o9Xfl+TJJA+027nzPdfZSLIjybY29/ta7egktyd5rH09ar7nORNJfrxvPzyQ\n5FtJ3r2Q9lGSjUmeSfJQX23C/ZGea9tz6sEkJ83fzCc2ST//NcmX25w/leTIVl+Z5Lt9++lP5m/m\nE5ukn0l/vpK8t+2fR5OcPT+zntwk/dzY18uOJA+0+kLYP5P9nj44z6GqWrQ3ehfqfQX4UeAw4IvA\n8fM9r1n2sAw4qd1/FfDX9D6W933Ab873/A6grx3AMfvV/guwvt1fD1wz3/OcQ19LgK/T+/+oC2Yf\nAT8NnAQ8NN3+AM4FPk3v8ylOA+6Z7/nPsJ+zgEPa/Wv6+lnZP24Yb5P0M+HPV/v98EXgcOC49jtw\nyXz3MF0/+y3/A+D3FtD+mez39EF5Di32I/Z/+CjbqvoesO+jbBeMqtpVVZ9v918AHqH3SX5dtAbY\n1O5vAs6bx7nM1ZnAV6rqb+Z7IrNRVZ8Bdu9Xnmx/rAFuqJ67gSOTLHt5ZjozE/VTVX9RVXvbw7vp\nfZ7GgjDJ/pnMGmBzVb1YVV8FttP7XTg0puonSYB/A3z8ZZ3UAZji9/RBeQ4t9mCf6KNsF2woJlkJ\nvAm4p5Uua6dxNi6U09Z9CviLJPen94mDACNVtQt6TxTgdfM2u7k7nx/8hbSQ99Fk+6MLz6tfonfE\ntM9xSb6Q5K+SvGW+JjUHE/18LfT98xbg6ap6rK+2YPbPfr+nD8pzaLEH+4w+ynYhSLIU+HPg3VX1\nLeBDwD8FTgR20Tt1tZC8uapOovcX/y5N8tPzPaEDld6HL/0c8H9aaaHvo8ks6OdVkt8B9gIfbaVd\nwD+uqjcB/wH4WJJXz9f8ZmGyn68FvX+AC/jBF8cLZv9M8Ht60qET1Ga8jxZ7sM/oo2yHXZJD6f2w\nfLSqPglQVU9X1fer6u+BDzNkp9qmU1VPta/PAJ+iN/+n952Oal+fmb8Zzsk5wOer6mlY+PuIyffH\ngn1eJVkL/AxwYbU3O9sp62+0+/fTe0/6x+ZvljMzxc/XQt4/hwD/CrhxX22h7J+Jfk9zkJ5Diz
3Y\nF/xH2bb3m64HHqmqP+yr978f8/PAQ/uvO6ySvDLJq/bdp3dR00P09s3aNmwtcPP8zHDOfuBIYyHv\no2ay/bEFuKhd2Xsa8Py+043DLMlq4HLg56rqO3311yZZ0u7/KLAKeHx+ZjlzU/x8bQHOT3J4kuPo\n9fO5l3t+c/RW4MtVtXNfYSHsn8l+T3OwnkPzfbXgfN/oXX341/Re5f3OfM9nDvP/KXqnaB4EHmi3\nc4E/Bba1+hZg2XzPdRY9/Si9q3a/CDy8b78ArwHuAB5rX4+e77nOoqcfAb4BHNFXWzD7iN4Lkl3A\n/6N3NHHJZPuD3mnE69pzahswOt/zn2E/2+m9r7nvefQnbey/bj+HXwQ+D/zsfM9/hv1M+vMF/E7b\nP48C58z3/GfST6t/BPjV/cYuhP0z2e/pg/Ic8pPnJEnqkMV+Kl6SpE4x2CVJ6hCDXZKkDjHYJUnq\nEINdkqQOMdglSeoQg12SpA4x2CVJ6pD/D5Xm/PvE0XwEAAAAAElFTkSuQmCC\n",
260 | "text/plain": [
261 | ""
262 | ]
263 | },
264 | "metadata": {},
265 | "output_type": "display_data"
266 | }
267 | ],
268 | "source": [
269 | "%matplotlib inline\n",
270 | "import matplotlib.pyplot as plt\n",
271 | "job_data.hist(bins=50, figsize=(8,5))\n",
272 | "# save_fig(\"attribute_histogram_plots\")\n",
273 | "plt.show()"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {},
279 | "source": [
280 | "# 数据清洗"
281 | ]
282 | },
283 | {
284 | "cell_type": "markdown",
285 | "metadata": {},
286 | "source": [
287 | "## 去重"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 272,
293 | "metadata": {},
294 | "outputs": [
295 | {
296 | "data": {
297 | "text/html": [
298 | "\n",
299 | "\n",
312 | "
\n",
313 | " \n",
314 | " \n",
315 | " | \n",
316 | " 城市 | \n",
317 | " 职位名称 | \n",
318 | " 工作经验 | \n",
319 | " 公司规模 | \n",
320 | " 学历要求 | \n",
321 | " salary | \n",
322 | "
\n",
323 | " \n",
324 | " \n",
325 | " \n",
326 | " 0 | \n",
327 | " 北京 | \n",
328 | " 数据分析 | \n",
329 | " 经验不限 | \n",
330 | " 10000人以上 | \n",
331 | " 本科 | \n",
332 | " 8.0 | \n",
333 | "
\n",
334 | " \n",
335 | " 1 | \n",
336 | " 北京 | \n",
337 | " 数据分析师 | \n",
338 | " 1-3年 | \n",
339 | " 100-499人 | \n",
340 | " 本科 | \n",
341 | " 27.5 | \n",
342 | "
\n",
343 | " \n",
344 | " 2 | \n",
345 | " 北京 | \n",
346 | " 数据分析师 | \n",
347 | " 3-5年 | \n",
348 | " 1000-9999人 | \n",
349 | " 本科 | \n",
350 | " 20.0 | \n",
351 | "
\n",
352 | " \n",
353 | " 3 | \n",
354 | " 北京 | \n",
355 | " 数据分析 | \n",
356 | " 经验不限 | \n",
357 | " 10000人以上 | \n",
358 | " 本科 | \n",
359 | " 9.0 | \n",
360 | "
\n",
361 | " \n",
362 | " 4 | \n",
363 | " 北京 | \n",
364 | " 数据分析师 | \n",
365 | " 3-5年 | \n",
366 | " 10000人以上 | \n",
367 | " 本科 | \n",
368 | " 12.5 | \n",
369 | "
\n",
370 | " \n",
371 | "
\n",
372 | "
"
373 | ],
374 | "text/plain": [
375 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n",
376 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n",
377 | "1 北京 数据分析师 1-3年 100-499人 本科 27.5\n",
378 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n",
379 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n",
380 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5"
381 | ]
382 | },
383 | "execution_count": 272,
384 | "metadata": {},
385 | "output_type": "execute_result"
386 | }
387 | ],
388 | "source": [
389 | "jobs = job_data.drop_duplicates()\n",
390 | "jobs.head()"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": 273,
396 | "metadata": {},
397 | "outputs": [
398 | {
399 | "data": {
400 | "text/plain": [
401 | "(59142, 6)"
402 | ]
403 | },
404 | "execution_count": 273,
405 | "metadata": {},
406 | "output_type": "execute_result"
407 | }
408 | ],
409 | "source": [
410 | "jobs.shape # 59142条数据"
411 | ]
412 | },
413 | {
414 | "cell_type": "markdown",
415 | "metadata": {},
416 | "source": [
417 | "## 过滤"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 274,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": [
426 | "jobs_copy = jobs.copy()"
427 | ]
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 | "### 过滤出薪资在6-27k之间"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 275,
439 | "metadata": {},
440 | "outputs": [],
441 | "source": [
442 | "jobs_copy = jobs_copy[jobs_copy['salary']<27]"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 276,
448 | "metadata": {},
449 | "outputs": [
450 | {
451 | "data": {
452 | "text/html": [
453 | "\n",
454 | "\n",
467 | "
\n",
468 | " \n",
469 | " \n",
470 | " | \n",
471 | " 城市 | \n",
472 | " 职位名称 | \n",
473 | " 工作经验 | \n",
474 | " 公司规模 | \n",
475 | " 学历要求 | \n",
476 | " salary | \n",
477 | "
\n",
478 | " \n",
479 | " \n",
480 | " \n",
481 | " 0 | \n",
482 | " 北京 | \n",
483 | " 数据分析 | \n",
484 | " 经验不限 | \n",
485 | " 10000人以上 | \n",
486 | " 本科 | \n",
487 | " 8.0 | \n",
488 | "
\n",
489 | " \n",
490 | " 2 | \n",
491 | " 北京 | \n",
492 | " 数据分析师 | \n",
493 | " 3-5年 | \n",
494 | " 1000-9999人 | \n",
495 | " 本科 | \n",
496 | " 20.0 | \n",
497 | "
\n",
498 | " \n",
499 | " 3 | \n",
500 | " 北京 | \n",
501 | " 数据分析 | \n",
502 | " 经验不限 | \n",
503 | " 10000人以上 | \n",
504 | " 本科 | \n",
505 | " 9.0 | \n",
506 | "
\n",
507 | " \n",
508 | " 4 | \n",
509 | " 北京 | \n",
510 | " 数据分析师 | \n",
511 | " 3-5年 | \n",
512 | " 10000人以上 | \n",
513 | " 本科 | \n",
514 | " 12.5 | \n",
515 | "
\n",
516 | " \n",
517 | " 8 | \n",
518 | " 北京 | \n",
519 | " 数据分析师 | \n",
520 | " 1-3年 | \n",
521 | " 10000人以上 | \n",
522 | " 本科 | \n",
523 | " 25.5 | \n",
524 | "
\n",
525 | " \n",
526 | "
\n",
527 | "
"
528 | ],
529 | "text/plain": [
530 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n",
531 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n",
532 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n",
533 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n",
534 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5\n",
535 | "8 北京 数据分析师 1-3年 10000人以上 本科 25.5"
536 | ]
537 | },
538 | "execution_count": 276,
539 | "metadata": {},
540 | "output_type": "execute_result"
541 | }
542 | ],
543 | "source": [
544 | "jobs_copy = jobs_copy[jobs_copy['salary']>6]\n",
545 | "jobs_copy.head()"
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 277,
551 | "metadata": {},
552 | "outputs": [
553 | {
554 | "data": {
555 | "text/plain": [
556 | "(27293, 6)"
557 | ]
558 | },
559 | "execution_count": 277,
560 | "metadata": {},
561 | "output_type": "execute_result"
562 | }
563 | ],
564 | "source": [
565 | "jobs_copy.shape"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 278,
571 | "metadata": {
572 | "scrolled": true
573 | },
574 | "outputs": [
575 | {
576 | "data": {
577 | "text/html": [
578 | "\n",
579 | "\n",
592 | "
\n",
593 | " \n",
594 | " \n",
595 | " | \n",
596 | " salary | \n",
597 | "
\n",
598 | " \n",
599 | " \n",
600 | " \n",
601 | " count | \n",
602 | " 27293.000000 | \n",
603 | "
\n",
604 | " \n",
605 | " mean | \n",
606 | " 10.915528 | \n",
607 | "
\n",
608 | " \n",
609 | " std | \n",
610 | " 4.546630 | \n",
611 | "
\n",
612 | " \n",
613 | " min | \n",
614 | " 6.500000 | \n",
615 | "
\n",
616 | " \n",
617 | " 25% | \n",
618 | " 7.500000 | \n",
619 | "
\n",
620 | " \n",
621 | " 50% | \n",
622 | " 9.000000 | \n",
623 | "
\n",
624 | " \n",
625 | " 75% | \n",
626 | " 12.500000 | \n",
627 | "
\n",
628 | " \n",
629 | " max | \n",
630 | " 26.500000 | \n",
631 | "
\n",
632 | " \n",
633 | "
\n",
634 | "
"
635 | ],
636 | "text/plain": [
637 | " salary\n",
638 | "count 27293.000000\n",
639 | "mean 10.915528\n",
640 | "std 4.546630\n",
641 | "min 6.500000\n",
642 | "25% 7.500000\n",
643 | "50% 9.000000\n",
644 | "75% 12.500000\n",
645 | "max 26.500000"
646 | ]
647 | },
648 | "execution_count": 278,
649 | "metadata": {},
650 | "output_type": "execute_result"
651 | }
652 | ],
653 | "source": [
654 | "jobs_copy.describe()"
655 | ]
656 | },
657 | {
658 | "cell_type": "markdown",
659 | "metadata": {},
660 | "source": [
661 | "### 过滤后的薪资分布图"
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": 279,
667 | "metadata": {},
668 | "outputs": [
669 | {
670 | "data": {
671 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfgAAAE/CAYAAACqxdFzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAH6JJREFUeJzt3X+U3XV95/HnW35JDSVBdJqGnIau\nqVs0R9TZQNdudwIWAvQU3FN7cKkmiie1hZ66m26J7bZSlTbuNnX7w9LGkjX+qAPb6pIDcWkWnfV4\nTlGIIgGRZcRUAjGpTQiOIN3ge/+4n5Gb4d65d2buzNz55Pk4556538/38/3ez/t+73de8/3Od74T\nmYkkSarLC+Z7AJIkqfcMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCQAIiIj4mXzPQ5JvWHA\nS5JUIQNe0oxExInzPQZJz2fASxWKiOsi4rGI+E5EPBQRF0bE6oj4+4h4IiL2R8SfRcTJbZa/LCK+\nHBFPRsSjEXF907wV5XT+1RHxTeAzEXF7RPzahHXcFxFXzG6lktox4KXKRMTLgWuBf5WZpwEXA3uB\nZ4H/AJwJ/BRwIfCrbVbzXeAtwGLgMuBXWoT1vwV+sqx/O/BLTWN4FbAM2NmToiRNmQEv1edZ4BTg\nnIg4KTP3ZubXM3N3Zt6VmUczcy/wlzRC+nkycyQz92Tm9zPzPuATLfpen5nfzcyngVuBlRGxssx7\nM3BzZv7zbBQoqTMDXqpMZo4C7wSuBw5GxHBE/GhE/ERE3BYR34qIJ4Hfp3E0/zwRcV5EfDYi/jEi\njgDvaNH30abXfAa4BfiliHgB8Cbgoz0vTlLXDHipQpn515n508CPAQm8H7gR+BqwMjN/GPgtINqs\n4q+BHcDyzDwd+IsWfSf+K8rtwFU0Tv0/lZl/34taJE2PAS9VJiJeHhEXRMQpwPeAp2mctj8NeBIY\ni4h/CfzKJKs5DTiUmd+LiNXAv+/0uiXQvw9swaN3ad4Z8FJ9TgE2A98GvgW8lMbR+m/QCOrvAB8C\nbp5kHb8KvCcivgP8Lo3T7934CLAK+Ni0Ri6pZyJz4lk2SZqeiHgLsKH8ekDSPPIIXlJPRMQP0Tjy\n3zrfY5FkwEvqgYi4GPhH4ACNC/QkzTNP0UuSVCGP4CVJqpABL0lShfr6v0CdeeaZuWLFivkexqS+\n+93v8qIXvWi+h9ETtdRSSx1gLf2qllpqqQPqqmX37t3fzsyXzHQ9fR3wK1as4J577pnvYUxqZGSE\noaGh+R5GT9RSSy11gLX0q1pqqaUOqKuWiPiHXqzHU/SSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKF\nDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRXq63vR96MVm24/ZnrjqqOsb2rbu/myuR6S\nJEnP4xG8JEkVMuAlSapQ1wEfESdExJcj4rYyfXZEfCEiHo6ImyPi5NJ+SpkeLfNXNK3jXaX9oYi4\nuNfFSJKkhqkcwf868GDT9PuBD2TmSuAwcHVpvxo4nJkvAz5Q+hER5wBXAq8A1gJ/HhEnzGz4kiSp\nla4CPiLOAi4D/qpMB3AB8Dely3bgivL88jJNmX9h6X85MJyZz2TmN4BRYHUvipAkScfq9gj+vwG/\nCXy/TL8YeCIzj5bpfcCy8nwZ8ChAmX+k9P9Be4tlJElSD3X8M7mI+DngYGbujoih8eYWXbPDvMmW\naX69DcAGgIGBAUZGRjoNcU5tXHX0mOmBU49t67fxTsXY2NiCHv+4WuoAa+lXtdRSSx1QVy290s3f\nwb8O+PmIuBR4IfDDNI7oF0fEieUo/Szg8dJ/H7Ac2BcRJwKnA4ea2sc1L/MDmbkV2AowODiYQ0ND\n0yhr9qxv8XfwW/Y89zbuvWpojkfUOy
MjI/Tb+z0dtdQB1tKvaqmlljqgrlp6peMp+sx8V2aelZkr\naFwk95nMvAr4LPALpds64NbyfEeZpsz/TGZmab+yXGV/NrAS+GLPKpEkST8wkzvZXQcMR8T7gC8D\nN5X2m4CPRsQojSP3KwEy84GIuAX4KnAUuCYzn53B60uSpDamFPCZOQKMlOeP0OIq+Mz8HvDGNsvf\nANww1UFKkqSp8U52kiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4SZIqZMBLklQhA16SpAoZ8JIk\nVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLg\nJUmqkAEvSVKFDHhJkipkwEuSVKGOAR8RL4yIL0bEVyLigYj4vdL+4Yj4RkTcWx7nlvaIiD+JiNGI\nuC8iXtO0rnUR8XB5rJu9siRJOr6d2EWfZ4ALMnMsIk4CPh8Rny7z/lNm/s2E/pcAK8vjPOBG4LyI\nOAN4NzAIJLA7InZk5uFeFCJJkp7T8Qg+G8bK5EnlkZMscjnwkbLcXcDiiFgKXAzsysxDJdR3AWtn\nNnxJktRKZE6W1aVTxAnAbuBlwAcz87qI+DDwUzSO8O8ENmXmMxFxG7A5Mz9flr0TuA4YAl6Yme8r\n7b8DPJ2ZfzjhtTYAGwAGBgZeOzw83Is6e2bPY0eOmR44FQ48/dz0qmWnz/GIemdsbIxFixbN9zBm\nrJY6wFr6VS211FIH1FXLmjVrdmfm4EzX080pejLzWeDciFgMfCoiXgm8C/gWcDKwlUaIvweIVquY\npH3ia20t62NwcDCHhoa6GeKcWb/p9mOmN646ypY9z72Ne68amuMR9c7IyAj99n5PRy11gLX0q1pq\nqaUOqKuWXpnSVfSZ+QQwAqzNzP3lNPwzwH8HVpdu+4DlTYudBTw+SbskSeqxbq6if0k5ciciTgVe\nD3yt/F6diAjgCuD+ssgO4C3lavrzgSOZuR+4A7goIpZExBLgotImSZJ6rJtT9EuB7eX38C8AbsnM\n2yLiMxHxEhqn3u8F3lH67wQuBUaBp4C3AmTmoYh4L3B36feezDzUu1IkSdK4jgGfmfcBr27RfkGb\n/glc02beNmDbFMcoSZKmyDvZSZJUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJ\nkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4SZIqZMBLklQhA16SpAoZ\n8JIkVciAlySpQga8JEkVMuAlSapQx4CPiBdGxBcj4isR8UBE/F5pPzsivhARD0fEzRFxcmk/pUyP\nlvkrmtb1rtL+UERcPFtFSZJ0vOvmCP4Z4ILMfBVwLrA2Is4H3g98IDNXAoeBq0v/q4HDmfky4AOl\nHxFxDnAl8ApgLfDnEXFCL4uRJEkNHQM+G8bK5EnlkcAFwN+U9u3AFeX55WWaMv/CiIjSPpyZz2Tm\nN4BRYHVPqpAkScfo6nfwEXFCRNwLHAR2AV8HnsjMo6XLPmBZeb4MeBSgzD8CvLi5vcUykiSph07s\nplNmPgucGxGLgU8BP9mqW/kabea1az9GRGwANgAMDAwwMjLSzRDnzMZVR4+ZHjj12LZ+G+9UjI2N\nLejxj6ulDrCWflVLLbXUAXXV0itdBfy4zHwiIkaA84HFEXFiOUo/C3i8dNsHLAf2RcSJwOnAoab2\ncc3LNL/GVmArwODgYA4NDU1liLNu/abbj5neuOooW/Y89zbuvWpojkfUOyMjI/Tb+z0dtdQB1tKv\naqmlljqgrlp6pZur6F9SjtyJiFOB1wMPAp8FfqF0WwfcWp7vKNOU+Z/JzCztV5ar7M8GVgJf7FUh\nkiTpOd0cwS8Ftpcr3l8A3JKZt0XEV4HhiHgf8GXgptL/JuCjETFK48j9SoDMfCAibgG+ChwFrimn\n/i
VJUo91DPjMvA94dYv2R2hxFXxmfg94Y5t13QDcMPVhSpKkqfBOdpIkVciAlySpQga8JEkVMuAl\nSarQlP4OvgYrJvwd+0R7N182RyORJGn2eAQvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlC\nBrwkSRUy4CVJqpABL0lShQx4SZIqZMBLklQhA16SpAoZ8JIkVciAlySpQga8JEkVMuAlSaqQAS9J\nUoUMeEmSKtQx4CNieUR8NiIejIgHIuLXS/v1EfFYRNxbHpc2LfOuiBiNiIci4uKm9rWlbTQiNs1O\nSZIk6cQu+hwFNmbmlyLiNGB3ROwq8z6QmX/Y3DkizgGuBF4B/CjwvyPiJ8rsDwI/C+wD7o6IHZn5\n1V4UcrxYsen2jn32br5sDkYiSepnHQM+M/cD+8vz70TEg8CySRa5HBjOzGeAb0TEKLC6zBvNzEcA\nImK49DXgJUnqscjM7jtHrAA+B7wS+I/AeuBJ4B4aR/mHI+LPgLsy82NlmZuAT5dVrM3Mt5f2NwPn\nZea1E15jA7ABYGBg4LXDw8PTra2lPY8dmXT+qmWnT2n5gVPhwNPdLz9TncY/kzGMjY2xaNGiaS3b\nT2qpA6ylX9VSSy11QF21rFmzZndmDs50Pd2cogcgIhYBfwu8MzOfjIgbgfcCWb5uAd4GRIvFk9a/\n73/eTxeZuRXYCjA4OJhDQ0PdDrEr6zuc4t571eSvN3H5jauOsmXPc29jp+VnqtP4ZzKGkZERev1+\nz4da6gBr6Ve11FJLHVBXLb3SVcBHxEk0wv3jmflJgMw80DT/Q8BtZXIfsLxp8bOAx8vzdu2SJKmH\nurmKPoCbgAcz84+a2pc2dXsDcH95vgO4MiJOiYizgZXAF4G7gZURcXZEnEzjQrwdvSlDkiQ16+YI\n/nXAm4E9EXFvafst4E0RcS6N0+x7gV8GyMwHIuIWGhfPHQWuycxnASLiWuAO4ARgW2Y+0MNaJElS\n0c1V9J+n9e/Vd06yzA3ADS3ad062nCRJ6g3vZCdJUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJck\nqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRXq+P/gJc29\nFZtuB2DjqqOsL8+b7d182VwPSdIC4xG8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKdfwz\nuYhYDnwE+BHg+8DWzPzjiDgDuBlYAewFfjEzD0dEAH8MXAo8BazPzC+Vda0D/nNZ9fsyc3tvy5m5\nFS3+JEmSpIWmmyP4o8DGzPxJ4Hzgmog4B9gE3JmZK4E7yzTAJcDK8tgA3AhQfiB4N3AesBp4d0Qs\n6WEtkiSp6Bjwmbl//Ag8M78DPAgsAy4Hxo/AtwNXlOeXAx/JhruAxRGxFLgY2JWZhzLzMLALWNvT\naiRJEjDF38FHxArg1cAXgIHM3A+NHwKAl5Zuy4BHmxbbV9ratUuSpB6LzOyuY8Qi4P8AN2TmJyPi\nicxc3DT/cGYuiYjbgT/IzM+X9juB3wQuAE7JzPeV9t8BnsrMLRNeZwONU/sMDAy8dnh4eMZFNtvz\n2JGerm/gVDjw9HPTq5ad3tP1T9TN+Kc7hrGxMRYtWjStZftJDXWMb+eJn69xs/05mw01bJdxtdRS\nSx1QVy1r1qzZnZmDM11PV/eij4iTgL8FPp6ZnyzNByJiaWbuL6fgD5b2fcDypsXPAh4v7UMT2kcm\nvlZmbgW2AgwODubQ0NDELjPS6r7eM7Fx1VG27Hnubdx71VBP1z9RN+Of7hhGRkbo9fs9H2qoY33T\nveibP1/jZvtzNhtq2C7jaqmlljqgrlp6peMp+nJV/E3Ag5n5R02zdgDryvN1wK1N7W+JhvOBI+UU\n/h3ARRGxpFxcd1FpkyRJPdbNEfzrgDcDeyLi3tL2W8Bm4JaIuBr4
JvDGMm8njT+RG6XxZ3JvBcjM\nQxHxXuDu0u89mXmoJ1VIkqRjdAz48rv0aDP7whb9E7imzbq2AdumMkBJkjR13slOkqQKGfCSJFXI\ngJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVqKv/Jqfurejw3972br5s\njkYiSTqeeQQvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4\nSZIqZMBLklQhA16SpAp1DPiI2BYRByPi/qa26yPisYi4tzwubZr3rogYjYiHIuLipva1pW00Ijb1\nvhRJkjSumyP4DwNrW7R/IDPPLY+dABFxDnAl8IqyzJ9HxAkRcQLwQeAS4BzgTaWvJEmaBR3/XWxm\nfi4iVnS5vsuB4cx8BvhGRIwCq8u80cx8BCAihkvfr055xJIkqaPIzM6dGgF/W2a+skxfD6wHngTu\nATZm5uGI+DPgrsz8WOl3E/Dpspq1mfn20v5m4LzMvLbFa20ANgAMDAy8dnh4eAblPd+ex470dH0D\np8KBp7vvv2rZ6TN6vW7GP93XGBsbY9GiRdNatp/UUMf4dm73+Zrp52g+1LBdxtVSSy11QF21rFmz\nZndmDs50PR2P4Nu4EXgvkOXrFuBtQLTom7T+VUDLnywycyuwFWBwcDCHhoamOcTW1m+6vafr27jq\nKFv2dP827r1qaEav1834p/saIyMj9Pr9ng811DG+ndt9vmb6OZoPNWyXcbXUUksdUFctvTKtgM/M\nA+PPI+JDwG1lch+wvKnrWcDj5Xm7dkmS1GPT+jO5iFjaNPkGYPwK+x3AlRFxSkScDawEvgjcDayM\niLMj4mQaF+LtmP6wJUnSZDoewUfEJ4Ah4MyI2Ae8GxiKiHNpnGbfC/wyQGY+EBG30Lh47ihwTWY+\nW9ZzLXAHcAKwLTMf6Hk1kiQJ6O4q+je1aL5pkv43ADe0aN8J7JzS6CRJ0rR4JztJkipkwEuSVCED\nXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqtB070Wv49iKDvfD37v5sjkaiSSpHY/gJUmqkAEvSVKF\nDHhJkirk7+A15/wdviTNPgNe1dnz2BHW+0OEpOOcp+glSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQK\nGfCSJFXIgJckqUIGvCRJFfJGN8ehdneS27jqKOs33e5NYCSpAh2P4CNiW0QcjIj7m9rOiIhdEfFw\n+bqktEdE/ElEjEbEfRHxmqZl1pX+D0fEutkpR5IkQXen6D8MrJ3Qtgm4MzNXAneWaYBLgJXlsQG4\nERo/EADvBs4DVgPvHv+hQJIk9V7HU/SZ+bmIWDGh+XJgqDzfDowA15X2j2RmAndFxOKIWFr67srM\nQwARsYvGDw2fmHEFkqRp8f821G26F9kNZOZ+gPL1paV9GfBoU799pa1duyRJmgXRONju0KlxBH9b\nZr6yTD+RmYub5h/OzCURcTvwB5n5+dJ+J/CbwAXAKZn5vtL+O8BTmbmlxWttoHF6n4GBgdcODw/P\nrMIJ9jx2pKfrGzgVDjzdff9Vy06f0et1M/5Or9FuHeO1THf5mb5+t8t3cvDQkY7bZKavMdvG36N2\nn69+H38rY2NjLFq0aL6H0RO11FLDvjKulm0CsGbNmt2ZOTjT9Uz3KvoDEbE0M/eXU/AHS/s+YHlT\nv7OAx0v70IT2kVYrzsytwFaAwcHBHBoaatVt2jqdjpqqjauOsmVP92/j3quGZvR63Yy/02u0W8d4\nLdNdfqav/wN7vjv5+jucMvzTj9/acZvMdDvMtvH3qN3nq9/H38rIyAi93p/nSy211LCvjKtlm/TS\ndE/R7wDGr4RfB9za1P6WcjX9+cCRcgr/DuCiiFhSLq67qLRJkqRZ0PHQMyI+QePo+8yI2EfjavjN\nwC0RcTXwTeCNpftO4FJgFHgK
eCtAZh6KiPcCd5d+7xm/4E6SJPVeN1fRv6nNrAtb9E3gmjbr2QZs\nm9LoJEnStHirWkmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipk\nwEuSVKHp/jc5zZIVPf5vd5Kk45NH8JIkVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JU\nIQNekqQKeaMbSWqj3Y2nNq46yvpNt7N382VzPCKpex7BS5JUIQNekqQKGfCSJFVoRgEfEXsjYk9E\n3BsR95S2MyJiV0Q8XL4uKe0REX8SEaMRcV9EvKYXBUiSpOfrxUV2azLz203Tm4A7M3NzRGwq09cB\nlwAry+M84MbyVT3mf6STJM3GKfrLge3l+Xbgiqb2j2TDXcDiiFg6C68vSdJxLzJz+gtHfAM4DCTw\nl5m5NSKeyMzFTX0OZ+aSiLgN2JyZny/tdwLXZeY9E9a5AdgAMDAw8Nrh4eFpj6+VPY8d6en6Bk6F\nA09333/VstMnnd/r8U3FeC0zHeNs19hp/QcPHem4TTqtY76Nv0ftPl/9Pv5WxsbGWLRo0XwPY0ra\nfVa73Vf6XQ37yriF+PlqZ82aNbszc3Cm65npKfrXZebjEfFSYFdEfG2SvtGi7Xk/XWTmVmArwODg\nYA4NDc1wiMda3+PT1xtXHWXLnu7fxr1XDU06v9fjm4rxWmY6xtmusdP6//Tjt3bcJp3WMd/G36N2\nn69+H38rIyMj9Hp/nm3tPqvd7iv9roZ9ZdxC/HzNthmdos/Mx8vXg8CngNXAgfFT7+XrwdJ9H7C8\nafGzgMdn8vqSJKm1aQd8RLwoIk4bfw5cBNwP7ADWlW7rgFvL8x3AW8rV9OcDRzJz/7RHLkmS2prJ\nKfoB4FMRMb6ev87M/xURdwO3RMTVwDeBN5b+O4FLgVHgKeCtM3htSZI0iWkHfGY+AryqRfs/ARe2\naE/gmum+niRJ6p53spMkqUIGvCRJFTLgJUmqkP8PXtKsmHjL5PH/od4t/9e6NDMewUuSVCEDXpKk\nCnmKXj3nf7OTpPnnEbwkSRXyCF6ahk5nKeb7ArFuzqLM9xglzS4DXpK0YK1o+s+Lrf5K43j+QdZT\n9JIkVcgjeKkFLxSUtNB5BC9JUoUMeEmSKuQpeh2XPAUvqXYewUuSVCEDXpKkChnwkiRVyICXJKlC\nBrwkSRXyKnppHiyEq/j7/X77kiZnwEvSPPGHKM2mOQ/4iFgL/DFwAvBXmbl5rscgqX4L4SzJ8cAf\nYubPnAZ8RJwAfBD4WWAfcHdE7MjMr87lOKSFzvDyPZgrCz2gF/r4Z2Kuj+BXA6OZ+QhARAwDlwMG\nvLTAGLCz73gOp7lS83s81wG/DHi0aXofcN4cj0GS5oQ/BM3cfL+H3bx+v/4QEJk5dy8W8Ubg4sx8\ne5l+M7A6M3+tqc8GYEOZfDnw0JwNcHrOBL4934PokVpqqaUOsJZ+VUsttdQBddXy8sw8baYrmesj\n+H3A8qbps4DHmztk5lZg61wOaiYi4p7MHJzvcfRCLbXUUgdYS7+qpZZa6oD6aunFeub6Rjd3Aysj\n4uyIOBm4Etgxx2OQJKl6c3oEn5lHI+Ja4A4afya3LTMfmMsxSJJ0PJjzv4PPzJ3Azrl+3Vm0YH6d\n0IVaaqmlDrCWflVLLbXUAdbyPHN6kZ0kSZob/rMZSZIqZMB3ISJeHhH3Nj2ejIh3TugzFBFHmvr8\n7nyNd6KI2BYRByPi/qa2MyJiV0Q8XL4uabPsutLn4YhYN3ejbjmWVnX814j4WkTcFxGfiojFbZbd\nGxF7yrbpyRWqM9Gmlusj4rGmz9ClbZZdGxEPRcRoRGyau1G31qaWm5vq2BsR97ZZtm+2S0Qsj4jP\nRsSDEfFARPx6aV+I+0q7Whbc/jJJLQtqf5mkjtnbVzLTxxQeNC4O/BbwYxPah4Db5nt8bcb8M8
Br\ngPub2v4LsKk83wS8v8VyZwCPlK9LyvMlfVbHRcCJ5fn7W9VR5u0FzpzvbdGhluuB3+iw3AnA14Ef\nB04GvgKc02+1TJi/Bfjdft8uwFLgNeX5acD/Bc5ZoPtKu1oW3P4ySS0Lan9pV8eEPj3dVzyCn7oL\nga9n5j/M90C6lZmfAw5NaL4c2F6ebweuaLHoxcCuzDyUmYeBXcDaWRtoB63qyMy/y8yjZfIuGvdW\n6Htttkk3fnC758z8Z2D8ds/zZrJaIiKAXwQ+MaeDmobM3J+ZXyrPvwM8SOPumwtxX2lZy0LcXybZ\nLt3om/2lUx2zsa8Y8FN3Je03wE9FxFci4tMR8Yq5HNQ0DGTmfmh88ICXtujT6tbC3e5Y8+FtwKfb\nzEvg7yJidzTultivri2nT7e1ORW80LbJvwEOZObDbeb35XaJiBXAq4EvsMD3lQm1NFtw+0uLWhbk\n/tJmm/R8XzHgpyAaN+f5eeB/tJj9JRqn7V8F/CnwP+dybLMkWrT15Z9dRMRvA0eBj7fp8rrMfA1w\nCXBNRPzMnA2uezcC/wI4F9hP43TdRAtmmxRvYvIjkr7bLhGxCPhb4J2Z+WS3i7Vom/ft0q6Whbi/\ntKhlQe4vk3y+er6vGPBTcwnwpcw8MHFGZj6ZmWPl+U7gpIg4c64HOAUHImIpQPl6sEWfjrcW7gfl\ngqafA67K8suqiTLz8fL1IPApGqfu+kpmHsjMZzPz+8CHaD3GBbFNACLiRODfATe369Nv2yUiTqLx\nzffjmfnJ0rwg95U2tSzI/aVVLQtxf5lkm8zKvmLAT03bn7Ai4kfK71CIiNU03tt/msOxTdUOYPxK\n33XArS363AFcFBFLyumvi0pb34iItcB1wM9n5lNt+rwoIk4bf06jjvtb9Z1P4yFSvIHWY1xIt3t+\nPfC1zNzXama/bZey/94EPJiZf9Q0a8HtK+1qWYj7yyS1LKj9ZZLPF8zWvjIfVxMuxAfwQzQC+/Sm\ntncA7yjPrwUeoHGV5l3Av57vMTeN8xM0TmH9Pxo/0V4NvBi4E3i4fD2j9B0E/qpp2bcBo+Xx1j6s\nY5TG79juLY+/KH1/FNhZnv942S5fKdvot/t0m3wU2APcR+Ob0NKJtZTpS2lcgfv1fq2ltH94fP9o\n6tu32wX4aRqnb+9r+jxdukD3lXa1LLj9ZZJaFtT+0q6OMm9W9hXvZCdJUoU8RS9JUoUMeEmSKmTA\nS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmq0P8HRHFgURsDZQUAAAAASUVORK5CYII=\n",
672 | "text/plain": [
673 | ""
674 | ]
675 | },
676 | "metadata": {},
677 | "output_type": "display_data"
678 | }
679 | ],
680 | "source": [
681 | "%matplotlib inline\n",
682 | "import matplotlib.pyplot as plt\n",
683 | "jobs_copy.hist(bins=50, figsize=(8,5))\n",
684 | "plt.show()"
685 | ]
686 | },
687 | {
688 | "cell_type": "markdown",
689 | "metadata": {},
690 | "source": [
691 | "## 不相关职位筛选"
692 | ]
693 | },
694 | {
695 | "cell_type": "markdown",
696 | "metadata": {},
697 | "source": [
698 | "### 筛选出职位名称包含:商业|数据分析|挖掘|分析|BI|BA|数据"
699 | ]
700 | },
701 | {
702 | "cell_type": "code",
703 | "execution_count": 280,
704 | "metadata": {},
705 | "outputs": [
706 | {
707 | "data": {
708 | "text/plain": [
709 | "数据分析师 5114\n",
710 | "电商运营 1144\n",
711 | "数据分析 1049\n",
712 | "产品经理 881\n",
713 | "网络推广 609\n",
714 | "Name: 职位名称, dtype: int64"
715 | ]
716 | },
717 | "execution_count": 280,
718 | "metadata": {},
719 | "output_type": "execute_result"
720 | }
721 | ],
722 | "source": [
723 | "jobs_copy['职位名称'].value_counts().head()"
724 | ]
725 | },
726 | {
727 | "cell_type": "code",
728 | "execution_count": 281,
729 | "metadata": {
730 | "collapsed": true
731 | },
732 | "outputs": [],
733 | "source": [
734 | "# Work on an all-string copy of the data so .str.contains can be applied to the titles.\n",
735 | "da_test = jobs_copy.astype(str)\n",
736 | "\n",
737 | "# Keep only postings whose title matches one of the data-analysis keywords.\n",
738 | "title_mask = da_test['职位名称'].str.contains('商业|数据分析|挖掘|分析|BI|BA|数据')\n",
739 | "temp_0 = da_test[title_mask]\n",
740 | "\n",
741 | "temp = list(temp_0['职位名称'])  # titles of the matching postings\n",
742 | "jobs_data = temp_0.copy()  # rows whose title matched the keywords"
743 | ]
744 | },
745 | {
746 | "cell_type": "code",
747 | "execution_count": 282,
748 | "metadata": {},
749 | "outputs": [],
750 | "source": [
751 | "#jobs_data['职位名称'].value_counts()"
752 | ]
753 | },
754 | {
755 | "cell_type": "code",
756 | "execution_count": 283,
757 | "metadata": {},
758 | "outputs": [
759 | {
760 | "data": {
761 | "text/plain": [
762 | "(13716, 6)"
763 | ]
764 | },
765 | "execution_count": 283,
766 | "metadata": {},
767 | "output_type": "execute_result"
768 | }
769 | ],
770 | "source": [
771 | "jobs_data.shape"
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": 284,
777 | "metadata": {},
778 | "outputs": [
779 | {
780 | "data": {
781 | "text/html": [
782 | "\n",
783 | "\n",
796 | "
\n",
797 | " \n",
798 | " \n",
799 | " | \n",
800 | " 城市 | \n",
801 | " 职位名称 | \n",
802 | " 工作经验 | \n",
803 | " 公司规模 | \n",
804 | " 学历要求 | \n",
805 | " salary | \n",
806 | "
\n",
807 | " \n",
808 | " \n",
809 | " \n",
810 | " 0 | \n",
811 | " 北京 | \n",
812 | " 数据分析 | \n",
813 | " 经验不限 | \n",
814 | " 10000人以上 | \n",
815 | " 本科 | \n",
816 | " 8.0 | \n",
817 | "
\n",
818 | " \n",
819 | " 2 | \n",
820 | " 北京 | \n",
821 | " 数据分析师 | \n",
822 | " 3-5年 | \n",
823 | " 1000-9999人 | \n",
824 | " 本科 | \n",
825 | " 20.0 | \n",
826 | "
\n",
827 | " \n",
828 | " 3 | \n",
829 | " 北京 | \n",
830 | " 数据分析 | \n",
831 | " 经验不限 | \n",
832 | " 10000人以上 | \n",
833 | " 本科 | \n",
834 | " 9.0 | \n",
835 | "
\n",
836 | " \n",
837 | " 4 | \n",
838 | " 北京 | \n",
839 | " 数据分析师 | \n",
840 | " 3-5年 | \n",
841 | " 10000人以上 | \n",
842 | " 本科 | \n",
843 | " 12.5 | \n",
844 | "
\n",
845 | " \n",
846 | " 8 | \n",
847 | " 北京 | \n",
848 | " 数据分析师 | \n",
849 | " 1-3年 | \n",
850 | " 10000人以上 | \n",
851 | " 本科 | \n",
852 | " 25.5 | \n",
853 | "
\n",
854 | " \n",
855 | "
\n",
856 | "
"
857 | ],
858 | "text/plain": [
859 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n",
860 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n",
861 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n",
862 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n",
863 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5\n",
864 | "8 北京 数据分析师 1-3年 10000人以上 本科 25.5"
865 | ]
866 | },
867 | "execution_count": 284,
868 | "metadata": {},
869 | "output_type": "execute_result"
870 | }
871 | ],
872 | "source": [
873 | "dt_test = jobs_data.astype(str)\n",
874 | "# Postings whose title contains '转行' are removed below.\n",
875 | "y = dt_test[dt_test['职位名称'].str.contains('转行')]\n",
876 | "\n",
877 | "test1 = list(y['职位名称'])        # titles that contain the keyword\n",
878 | "test2 = list(dt_test['职位名称'])  # all titles\n",
879 | "# Set difference (not symmetric difference): titles that do NOT contain the keyword.\n",
880 | "ret = list(set(test2) - set(test1))\n",
881 | "\n",
882 | "jobs_data = dt_test[dt_test['职位名称'].isin(ret)]\n",
883 | "jobs_data.head()"
884 | ]
885 | },
886 | {
887 | "cell_type": "code",
888 | "execution_count": 285,
889 | "metadata": {},
890 | "outputs": [
891 | {
892 | "name": "stdout",
893 | "output_type": "stream",
894 | "text": [
895 | "\n",
896 | "Int64Index: 13665 entries, 0 to 65053\n",
897 | "Data columns (total 6 columns):\n",
898 | "城市 13665 non-null object\n",
899 | "职位名称 13665 non-null object\n",
900 | "工作经验 13665 non-null object\n",
901 | "公司规模 13665 non-null object\n",
902 | "学历要求 13665 non-null object\n",
903 | "salary 13665 non-null float64\n",
904 | "dtypes: float64(1), object(5)\n",
905 | "memory usage: 747.3+ KB\n"
906 | ]
907 | }
908 | ],
909 | "source": [
910 | "jobs_data_copy = jobs_data.copy()\n",
911 | "jobs_data_copy['salary'] = pd.to_numeric(jobs_data_copy['salary'])\n",
912 | "jobs_data_copy.info()"
913 | ]
914 | },
915 | {
916 | "cell_type": "code",
917 | "execution_count": 286,
918 | "metadata": {},
919 | "outputs": [
920 | {
921 | "data": {
922 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfgAAAE/CAYAAACqxdFzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAHGNJREFUeJzt3X+Q3Hd93/HnOxZ2DDKWjMvVkTSR\nE1QnTlQSczUmpPSEEv9ksNuJW7sOlsAdDcFQCGKwCDOYIUMrN3UokJRUYA1y6lgm/Kg1WBQ0hivD\nDHZBBCwbQyQ7in2ykADZMsImVPDuH/u5sD7t3u7d7u3tffx8zNzc7uf72e/3897vfu913+9+97uR\nmUiSpLr83HwPQJIk9Z8BL0lShQx4SZIqZMBLklQhA16SpAoZ8JIkVciAlwRARGREvGi+xyGpPwx4\nSZIqZMBL6klELJrvMUg6kQEvVSgiboiIAxHxg4j4dkSsjYjzI+LLEfFERByMiD+LiJPbPP6yiPib\niHgyIh6NiHc3TVtZDudfFxGPAJ+PiLsi4k1T5nFfRFwxt5VKaseAlyoTEecAbwT+RWaeBlwE7Ad+\nAvwhcCbwMmAt8IY2s/khcC2wBLgM+IMWYf2vgF8t898G/H7TGF4MLAN29qUoSTNmwEv1+QlwCnBu\nRDwnM/dn5kOZuTsz78nM45m5H/gfNEL6BJk5npl7MvOnmXkfcHuLvu/OzB9m5tPAncCqiFhVpr0G\nuCMzfzwXBUrqzICXKpOZ+4C3AO8GDkfE9oj4hYj4ZxHx6Yj4TkQ8CfwnGnvzJ4iIl0bEFyLiuxFx\nFHh9i76PNi3zH4CPAb8fET8HXA38Zd+Lk9Q1A16qUGb+VWb+NvCLQAI3AR8CvgWsysznA38ERJtZ\n/BWwA1iRmacDf9Gi79SvotwGXEPj0P9TmfnlftQiaXYMeKkyEXFORLwyIk4BfgQ8TeOw/WnAk8Cx\niPgV4A+mmc1pwJHM/FFEnA/8+07LLYH+U+Bm3HuX5p0BL9XnFGAz8D3gO8ALaeytv41GUP8A+DBw\nxzTzeAPwnoj4AfAuGoffu3ErsBr4n7MauaS+icypR9kkaXYi4lpgQ3l7QNI8cg9eUl9ExHNp7Plv\nme+xSDLgJfVBRFwEfBc4ROMEPUnzzEP0kiRVyD14SZIqZMBLklShof4WqDPPPDNXrlw538OY1g9/\n+EOe97znzfcw+qKWWmqpA6xlWNVSSy11QF217N69+3uZ+U96nc9QB/zKlSv56le/Ot/DmNb4+Dhj\nY2PzPYy+qKWWWuoAaxlWtdRSSx1QVy0R8ff9mI+H6CVJqlDHgI+IrRFxOCLubzHtbeV7oc8s9yMi\nPhAR+8p3QZ/X1HddROwtP+v6W4YkSWrWzR78R4GLpzZGxArgd4FHmpovAVaVnw00vtyCiDgDuBF4\nKXA+cGNELO1l4JIkqb2OAZ+ZXwSOtJj0PuDtPPMbpS4Hbs2Ge4AlEXEWcBGwKzOPZObjwC5a/NMg\nSZL6Y1bvwUfEq4EDmfmNKZOW0fQd0cBEaWvXLkmS5sCMz6Iv15t+J3Bhq8kt2nKa9lbz30Dj8D4j\nIyOMj4/PdIgDdezYsaEfY7dqqaWWOsBahlUttdRSB9RVS7/M5mNyvwycDXwjIgCWA18r3xk9Aaxo\n6rsceKy0j01pH28188zcQvmyitHR0Rz2jz3U9NGMWmqppQ6wlmFVSy211AF11dIvMz5En5l7MvOF\nmbkyM1fSCO/zMvM7wA7g2nI2/QXA0cw8CHwWuDAilpaT6y4sbZIkaQ508zG524EvA+dExEREXDdN\n953Aw8A+4MM0vjqSzDwC/DHwlfLzntImSZLmQMdD9Jl5dYfpK5tuJ3B9m35bga0zHJ8kSZoFr2Qn\nSVKFhvpa9NKz1cpNdwGwcfVx1pfbzf
ZvvmzQQ5K0wLgHL0lShQx4SZIqZMBLklQhA16SpAoZ8JIk\nVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLg\nJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4SZIq\nZMBLklShjgEfEVsj4nBE3N/U9icR8a2IuC8iPhURS5qmvSMi9kXEtyPioqb2i0vbvojY1P9SJEnS\npG724D8KXDylbRfw65n5z4G/Bd4BEBHnAlcBv1Ye898j4qSIOAn4c+AS4Fzg6tJXkiTNgY4Bn5lf\nBI5MaftcZh4vd+8BlpfblwPbM/MfMvPvgH3A+eVnX2Y+nJk/BraXvpIkaQ704z341wGfKbeXAY82\nTZsobe3aJUnSHFjUy4Mj4p3AceC2yaYW3ZLW/0hkm3luADYAjIyMMD4+3ssQ59yxY8eGfozdqqWW\nGurYuLpxgGzk1J/dbrYQ66thvUyqpZZa6oC6aumXWQd8RKwDXgWszczJsJ4AVjR1Ww48Vm63a3+G\nzNwCbAEYHR3NsbGx2Q5xIMbHxxn2MXarllpqqGP9pruARrjfvOfEzXT/NWMDHlHvalgvk2qppZY6\noK5a+mVWh+gj4mLgBuDVmflU06QdwFURcUpEnA2sAv4v8BVgVUScHREn0zgRb0dvQ5ckSe103IOP\niNuBMeDMiJgAbqRx1vwpwK6IALgnM1+fmQ9ExMeAb9I4dH99Zv6kzOeNwGeBk4CtmfnAHNQjSZLo\nIuAz8+oWzbdM0/+9wHtbtO8Eds5odJIkaVa8kp0kSRUy4CVJqpABL0lShQx4SZIqZMBLklQhA16S\npAoZ8JIkVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIG\nvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lS\nhQx4SZIqZMBLklShjgEfEVsj4nBE3N/UdkZE7IqIveX30tIeEfGBiNgXEfdFxHlNj1lX+u+NiHVz\nU44kSQJY1EWfjwJ/Btza1LYJuDszN0fEpnL/BuASYFX5eSnwIeClEXEGcCMwCiSwOyJ2ZObj/Spk\nUFZuuusZ9zeuPs76prb9my8b9JAkSTpBxz34zPwicGRK8+XAtnJ7G3BFU/ut2XAPsCQizgIuAnZl\n5pES6ruAi/tRgCRJOtFs34MfycyDAOX3C0v7MuDRpn4Tpa1duyRJmgPdHKKfiWjRltO0nziDiA3A\nBoCRkRHGx8f7Nrh+2Lj6+DPuj5z6zLZhG+9MHDt2bEGPf1INdUy+pqa+viYtxPpqWC+Taqmlljqg\nrlr6ZbYBfygizsrMg+UQ/OHSPgGsaOq3HHistI9NaR9vNePM3AJsARgdHc2xsbFW3ebN+hbvwd+8\n52dP4/5rxgY8ov4ZHx9n2J7v2aihjsnX2dTX16SF+DqrYb1MqqWWWuqAumrpl9keot8BTJ4Jvw64\ns6n92nI2/QXA0XII/7PAhRGxtJxxf2FpkyRJc6DjHnxE3E5j7/vMiJigcTb8ZuBjEXEd8AhwZem+\nE7gU2Ac8BbwWIDOPRMQfA18p/d6TmVNP3JMkSX3SMeAz8+o2k9a26JvA9W3msxXYOqPRSZKkWfFK\ndpIkVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUoX5fi37oTf2616n8uldJUg3cg5ck\nqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVKFn3cfkOun0MTpJqsWeA0dZ70eHq+UevCRJFXIP\nvs+8kI4kaRi4By9JUoXcg19gujlHwKMEkiT34CVJqpABL0lShQx4SZIqZMBLklQhA16SpAoZ8JIk\nVc
iAlySpQga8JEkVMuAlSaqQV7LTwHm9fkmaez3twUfEH0bEAxFxf0TcHhE/HxFnR8S9EbE3Iu6I\niJNL31PK/X1l+sp+FCBJkk4064CPiGXAfwRGM/PXgZOAq4CbgPdl5irgceC68pDrgMcz80XA+0o/\nSZI0B3p9D34RcGpELAKeCxwEXgl8vEzfBlxRbl9e7lOmr42I6HH5kiSphVkHfGYeAP4r8AiNYD8K\n7AaeyMzjpdsEsKzcXgY8Wh57vPR/wWyXL0mS2ovMnN0DI5YCnwD+HfAE8Nfl/o3lMDwRsQLYmZmr\nI+IB4KLMnCjTHgLOz8zvT5nvBmADwMjIyEu2b98+q/G1s+fA0b7Ob+RUOPR09/1XLzu9p+V1M/7Z\nLuPYsWMsXrx4Vo+diU419PocDaqOuTT5HLV7ffX6HM2HGtbLpFpqOXzkaMe/XwvltVbLOgFYs2bN\n7swc7XU+vZxF/zvA32XmdwEi4pPAbwFLImJR2UtfDjxW+k8AK4CJckj/dODI1Jlm5hZgC8Do6GiO\njY31MMQTre/i+9RnYuPq49y8p/uncf81Yz0tr5vxz3YZ4+Pj9Pv5bqVTDb0+R4OqYy5NPkftXl+9\nPkfzoYb1MqmWWj54250d/34tlNdaLeukn3p5D/4R4IKIeG55L30t8E3gC8DvlT7rgDvL7R3lPmX6\n53O2hw8kSdK0enkP/l4aJ8t9DdhT5rUFuAF4a0Tso/Ee+y3lIbcALyjtbwU29TBuSZI0jZ4udJOZ\nNwI3Tml+GDi/Rd8fAVf2sjxJktQdL1UrSVKFDHhJkipkwEuSVCG/bEbV2XPgaOeP4vmFNpIq5x68\nJEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKF\nDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRVaNN8DkKZauemuaafv33zZgEYiSQuXe/CS\nJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVKGeAj4ilkTExyPiWxHxYES8LCLOiIhd\nEbG3/F5a+kZEfCAi9kXEfRFxXn9KkCRJU/W6B/9+4H9n5q8ALwYeBDYBd2fmKuDuch/gEmBV+dkA\nfKjHZUuSpDZmHfAR8XzgFcAtAJn548x8Argc2Fa6bQOuKLcvB27NhnuAJRFx1qxHLkmS2orMnN0D\nI34D2AJ8k8be+27gzcCBzFzS1O/xzFwaEZ8GNmfml0r73cANmfnVKfPdQGMPn5GRkZds3759VuNr\nZ8+Bo32d38ipcOjp7vuvXnZ6T8vrZvydltFuHpO19DrGTnpdB53Gd/jI0Y7rZK5r7NXkc9Tu9TXs\n42/l2LFjLF68eL6H0Re11FLDtjKplnUCsGbNmt2ZOdrrfHq5Fv0i4DzgTZl5b0S8n58djm8lWrSd\n8N9FZm6h8Y8Do6OjOTY21sMQT7S+w3XOZ2rj6uPcvKf7p3H/NWM9La+b8XdaRrt5TNbS6xg76XUd\ndBrfB2+7s+M6mesaezX5HLV7fQ37+FsZHx+n39vzfKmllhq2lUm1rJN+6uU9+AlgIjPvLfc/TiPw\nD00eei+/Dzf1X9H0+OXAYz0sX5IktTHrgM/M7wCPRsQ5pWktjcP1O4B1pW0dcGe5vQO4tpxNfwFw\nNDMPznb5kiSpvV6/LvZNwG0RcTLwMPBaGv80fCwirgMeAa4sfXcClwL7gKdKX0mSNAd6CvjM/DrQ\n6kSAtS36JnB9L8uTJEnd8Up2kiRVyICXJKlCBrwkSRUy4CVJqlCvZ9FLUrVWTnNRqPWb7mL/5ssG\nPCKpe+7BS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKk\nChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4SZIqZMBLklQh
A16SpAoZ8JIkVciAlySpQga8\nJEkVMuAlSaqQAS9JUoV6DviIOCki/iYiPl3unx0R90bE3oi4IyJOLu2nlPv7yvSVvS5bkiS11o89\n+DcDDzbdvwl4X2auAh4Hrivt1wGPZ+aLgPeVfpIkaQ70FPARsRy4DPhIuR/AK4GPly7bgCvK7cvL\nfcr0taW/JEnqs1734P8b8Hbgp+X+C4AnMvN4uT8BLCu3lwGPApTpR0t/SZLUZ5GZs3tgxKuASzPz\nDRExBrwNeC3w5XIYnohYAezMzNUR8QBwUWZOlGkPAedn5venzHcDsAFgZGTkJdu3b59dZW3sOXC0\nr/MbORUOPd19/9XLTu9ped2Mv9My2s1jspZex9hJr+ug0/gOHznacZ3MdY29mnyO2r2+hn38rRw7\ndozFixfP9zBmZL63lblWw7YyaSG+vtpZs2bN7swc7XU+i3p47MuBV0fEpcDPA8+nsUe/JCIWlb30\n5cBjpf8EsAKYiIhFwOnAkakzzcwtwBaA0dHRHBsb62GIJ1q/6a6+zm/j6uPcvKf7p3H/NWM9La+b\n8XdaRrt5TNbS6fErO4xh/+bLZrX8bnUa3wdvu7PjOul1Pcy1yeeo3etr2Mffyvj4OP3enudar9vK\nsKthW5m0EF9fc23Wh+gz8x2ZuTwzVwJXAZ/PzGuALwC/V7qtA+4st3eU+5Tpn8/ZHj6QJEnTmovP\nwd8AvDUi9tF4j/2W0n4L8ILS/lZg0xwsW5Ik0dsh+n+UmePAeLn9MHB+iz4/Aq7sx/IkSdL0vJKd\nJEkVMuAlSaqQAS9JUoUMeEmSKtSXk+zUP50+Yy5JUjcMeEnzotcLJkmanofoJUmqkAEvSVKFDHhJ\nkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpBXspM0J6ZeqW7j6uOs91LM0sC4By9J\nUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUJ+TE6q0NSPqLWyf/NlAxiJpPliwFeomz/ukqS6\neYhekqQKGfCSJFXIQ/Tqu2fDWwSdavT9bUnzzT14SZIqZMBLklShWQd8RKyIiC9ExIMR8UBEvLm0\nnxERuyJib/m9tLRHRHwgIvZFxH0RcV6/ipAkSc/Uyx78cWBjZv4qcAFwfUScC2wC7s7MVcDd5T7A\nJcCq8rMB+FAPy5YkSdOY9Ul2mXkQOFhu/yAiHgSWAZcDY6XbNmAcuKG035qZCdwTEUsi4qwyH0mS\nZmzyhNeNq4+zvsXJr8/mE1778h58RKwEfhO4FxiZDO3y+4Wl2zLg0aaHTZQ2SZLUZ9HYoe5hBhGL\ngf8DvDczPxkRT2Tmkqbpj2fm0oi4C/jPmfml0n438PbM3D1lfhtoHMJnZGTkJdu3b+9pfFPtOXC0\nr/MbORUOPd19/9XLTp92er/HNxOTtQzzGKHz+A4fOdpxnXSaRyednoN+zb/d66sf66jXMXYydQz9\n3lYGod3z2O22MuwGsa3MtV63lWG0Zs2a3Zk52ut8evocfEQ8B/gEcFtmfrI0H5o89B4RZwGHS/sE\nsKLp4cuBx6bOMzO3AFsARkdHc2xsrJchnqDVIZxebFx9nJv3dP807r9mbNrp/R7fTEzWMsxjhM7P\n4Qdvu7PjOuk0j046PQf9mn+711c/1lGvY+xk6hj6va0MQrvnsdttZdgNYluZa71uKzXr5Sz6AG4B\nHszMP22atANYV26vA+5sar+2nE1/AXDU998lSZobvezBvxx4DbAnIr5e2v4I2Ax8LCKuAx4BrizT\ndgKXAvuAp4DX9rBsSZI0jV7Oov8SEG0mr23RP4HrZ7s8SZLUPa9kJ0lShQx4SZIqZMBLklQhA16S\npAoZ8JIkVciAlySpQj1dyU5aqFZ2uhLds/gLKiTVwYCXWuj0D4AkDTsP0UuSVCEDXpKkChnwkiRV\nyICXJKlCBrwkSRUy4CVJqpAB
L0lShQx4SZIqZMBLklQhA16SpAoZ8JIkVciAlySpQga8JEkV8tvk\npHngt9XNPZ/j4eBXM88fA15SS/5hlhY2D9FLklQh9+AlLUg1HIL3KInmknvwkiRVyD14SVJbHmVY\nuNyDlySpQu7BS5qVGt4Dl2o+QjHwgI+Ii4H3AycBH8nMzYMegyTp2eHZ/I/oQAM+Ik4C/hz4XWAC\n+EpE7MjMbw5yHJK0ENS8d6m5N+g9+POBfZn5MEBEbAcuBwx4aQaezXsl0iB1s60N6z9agw74ZcCj\nTfcngJcOeAySNBD+I6b5FJk5uIVFXAlclJn/odx/DXB+Zr6pqc8GYEO5ew7w7YENcHbOBL4334Po\nk1pqqaUOsJZhVUsttdQBddVyTmae1utMBr0HPwGsaLq/HHisuUNmbgG2DHJQvYiIr2bm6HyPox9q\nqaWWOsBahlUttdRSB9RXSz/mM+jPwX8FWBURZ0fEycBVwI4Bj0GSpOoNdA8+M49HxBuBz9L4mNzW\nzHxgkGOQJOnZYOCfg8/MncDOQS93Di2YtxO6UEsttdQB1jKsaqmlljrAWk4w0JPsJEnSYHgtekmS\nKmTAdyEizomIrzf9PBkRb5nSZywijjb1edd8jXeqiNgaEYcj4v6mtjMiYldE7C2/l7Z57LrSZ29E\nrBvcqFuOpVUdfxIR34qI+yLiUxGxpM1j90fEnrJu+nKGai/a1PLuiDjQ9Bq6tM1jL46Ib0fEvojY\nNLhRt9amljua6tgfEV9v89ihWS8RsSIivhARD0bEAxHx5tK+ELeVdrUsuO1lmloW1PYyTR1zt61k\npj8z+KFxcuB3gF+c0j4GfHq+x9dmzK8AzgPub2r7L8CmcnsTcFOLx50BPFx+Ly23lw5ZHRcCi8rt\nm1rVUabtB86c73XRoZZ3A2/r8LiTgIeAXwJOBr4BnDtstUyZfjPwrmFfL8BZwHnl9mnA3wLnLtBt\npV0tC257maaWBbW9tKtjSp++bivuwc/cWuChzPz7+R5ItzLzi8CRKc2XA9vK7W3AFS0eehGwKzOP\nZObjwC7g4jkbaAet6sjMz2Xm8XL3HhrXVhh6bdZJN/7xcs+Z+WNg8nLP82a6WiIigH8L3D7QQc1C\nZh7MzK+V2z8AHqRx9c2FuK20rGUhbi/TrJduDM320qmOudhWDPiZu4r2K+BlEfGNiPhMRPzaIAc1\nCyOZeRAaLzzghS36tLq0cLcb1nx4HfCZNtMS+FxE7I7G1RKH1RvL4dOtbQ4FL7R18i+BQ5m5t830\noVwvEbES+E3gXhb4tjKllmYLbntpUcuC3F7arJO+bysG/AxE4+I8rwb+usXkr9E4bP9i4IPA/xrk\n2OZItGgbyo9dRMQ7gePAbW26vDwzzwMuAa6PiFcMbHDd+xDwy8BvAAdpHK6basGsk+Jqpt8jGbr1\nEhGLgU8Ab8nMJ7t9WIu2eV8v7WpZiNtLi1oW5PYyzeur79uKAT8zlwBfy8xDUydk5pOZeazc3gk8\nJyLOHPQAZ+BQRJwFUH4fbtGn46WFh0E5oelVwDVZ3qyaKjMfK78PA5+icehuqGTmocz8SWb+FPgw\nrce4INYJQEQsAv4NcEe7PsO2XiLiOTT++N6WmZ8szQtyW2lTy4LcXlrVshC3l2nWyZxsKwb8zLT9\nDysi/ml5D4WIOJ/Gc/v9AY5tpnYAk2f6rgPubNHns8CFEbG0HP66sLQNjYi4GLgBeHVmPtWmz/Mi\n4rTJ2zTquL9V3/k0GSLFv6b1GBfS5Z5/B/hWZk60mjhs66Vsv7cAD2bmnzZNWnDbSrtaFuL2Mk0t\nC2p7meb1BXO1rczH2YQL8Qd4Lo3APr2p7fXA68vtNwIP0DhL8x7gt+Z7zE3jvJ3GIaz/R+M/2uuA\nFwB3A3vL7zNK31HgI02PfR2wr/y8dgjr2EfjPbavl5+/KH1/AdhZbv9SWS/fKOvonUO6Tv4S2A
Pc\nR+OP0FlTayn3L6VxBu5Dw1pLaf/o5PbR1Hdo1wvw2zQO397X9Hq6dIFuK+1qWXDbyzS1LKjtpV0d\nZdqcbCteyU6SpAp5iF6SpAoZ8JIkVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUof8P\nLIwC2ZVVELMAAAAASUVORK5CYII=\n",
923 | "text/plain": [
924 | ""
925 | ]
926 | },
927 | "metadata": {},
928 | "output_type": "display_data"
929 | }
930 | ],
931 | "source": [
932 | "%matplotlib inline\n",
933 | "import matplotlib.pyplot as plt\n",
934 | "jobs_data_copy.hist(bins=50, figsize=(8,5))\n",
935 | "# save_fig(\"attribute_histogram_plots\")\n",
936 | "plt.show()"
937 | ]
938 | },
939 | {
940 | "cell_type": "markdown",
941 | "metadata": {},
942 | "source": [
943 | "## 筛选岗位数量前150的城市"
944 | ]
945 | },
946 | {
947 | "cell_type": "code",
948 | "execution_count": 287,
949 | "metadata": {},
950 | "outputs": [],
951 | "source": [
952 | "cities = jobs_data_copy['城市'].value_counts()[:150].index"
953 | ]
954 | },
955 | {
956 | "cell_type": "code",
957 | "execution_count": 288,
958 | "metadata": {},
959 | "outputs": [],
960 | "source": [
961 | "# Boolean mask: True where the row's city is one of the selected `cities`.\n",
962 | "# Series.isin performs the membership test in a single vectorized call.\n",
963 | "temp_list = jobs_data_copy['城市'].isin(cities).tolist()"
964 | ]
965 | },
966 | {
967 | "cell_type": "code",
968 | "execution_count": 289,
969 | "metadata": {},
970 | "outputs": [],
971 | "source": [
972 | "#temp_list"
973 | ]
974 | },
975 | {
976 | "cell_type": "code",
977 | "execution_count": 290,
978 | "metadata": {},
979 | "outputs": [],
980 | "source": [
981 | "#temp_list\n",
982 | "jobs_data_copy = jobs_data_copy[temp_list]"
983 | ]
984 | },
985 | {
986 | "cell_type": "code",
987 | "execution_count": 291,
988 | "metadata": {},
989 | "outputs": [
990 | {
991 | "data": {
992 | "text/html": [
993 | "\n",
994 | "\n",
1007 | "
\n",
1008 | " \n",
1009 | " \n",
1010 | " | \n",
1011 | " 城市 | \n",
1012 | " 职位名称 | \n",
1013 | " 工作经验 | \n",
1014 | " 公司规模 | \n",
1015 | " 学历要求 | \n",
1016 | " salary | \n",
1017 | "
\n",
1018 | " \n",
1019 | " \n",
1020 | " \n",
1021 | " 0 | \n",
1022 | " 北京 | \n",
1023 | " 数据分析 | \n",
1024 | " 经验不限 | \n",
1025 | " 10000人以上 | \n",
1026 | " 本科 | \n",
1027 | " 8.0 | \n",
1028 | "
\n",
1029 | " \n",
1030 | " 2 | \n",
1031 | " 北京 | \n",
1032 | " 数据分析师 | \n",
1033 | " 3-5年 | \n",
1034 | " 1000-9999人 | \n",
1035 | " 本科 | \n",
1036 | " 20.0 | \n",
1037 | "
\n",
1038 | " \n",
1039 | " 3 | \n",
1040 | " 北京 | \n",
1041 | " 数据分析 | \n",
1042 | " 经验不限 | \n",
1043 | " 10000人以上 | \n",
1044 | " 本科 | \n",
1045 | " 9.0 | \n",
1046 | "
\n",
1047 | " \n",
1048 | " 4 | \n",
1049 | " 北京 | \n",
1050 | " 数据分析师 | \n",
1051 | " 3-5年 | \n",
1052 | " 10000人以上 | \n",
1053 | " 本科 | \n",
1054 | " 12.5 | \n",
1055 | "
\n",
1056 | " \n",
1057 | " 8 | \n",
1058 | " 北京 | \n",
1059 | " 数据分析师 | \n",
1060 | " 1-3年 | \n",
1061 | " 10000人以上 | \n",
1062 | " 本科 | \n",
1063 | " 25.5 | \n",
1064 | "
\n",
1065 | " \n",
1066 | "
\n",
1067 | "
"
1068 | ],
1069 | "text/plain": [
1070 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n",
1071 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n",
1072 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n",
1073 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n",
1074 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5\n",
1075 | "8 北京 数据分析师 1-3年 10000人以上 本科 25.5"
1076 | ]
1077 | },
1078 | "execution_count": 291,
1079 | "metadata": {},
1080 | "output_type": "execute_result"
1081 | }
1082 | ],
1083 | "source": [
1084 | "jobs_data_copy.head()"
1085 | ]
1086 | },
1087 | {
1088 | "cell_type": "code",
1089 | "execution_count": 292,
1090 | "metadata": {},
1091 | "outputs": [],
1092 | "source": [
1093 | "#jobs_data_copy['城市'].value_counts()"
1094 | ]
1095 | },
1096 | {
1097 | "cell_type": "markdown",
1098 | "metadata": {},
1099 | "source": [
1100 | "## 相同属性合并"
1101 | ]
1102 | },
1103 | {
1104 | "cell_type": "code",
1105 | "execution_count": 293,
1106 | "metadata": {},
1107 | "outputs": [],
1108 | "source": [
1109 | "experience_shift = {\n",
1110 | " '1-3年': '1-3年',\n",
1111 | " '3-5年': '3-5年',\n",
1112 | " '5-10年': '5-10年',\n",
1113 | " '应届生':'1年以内',\n",
1114 | " '1年以下':'1年以内',\n",
1115 | " '1年以内': '1年以内',\n",
1116 | " '无经验': '1年以内',\n",
1117 | " '经验不限': '经验不限',\n",
1118 | " '不限': '经验不限'\n",
1119 | "}\n",
1120 | "\n",
1121 | "scale_shift = {\n",
1122 | "    '0-20人':'0-20人',\n",
1123 | "    '20人以下':'0-20人',\n",
1124 | "    '20-99人':'20-99人',\n",
1125 | "    '100-499人':'100-499人',\n",
1126 | "    '500-999人':'500-999人',\n",
1127 | "    '1000-9999人':'1000-9999人',\n",
1128 | "    '10000人以上':'10000人以上',\n",
1129 | "    # NOTE(review): an empty company size is mapped to '100-499人' - presumably the most common bucket; confirm.\n",
1130 | "    '':'100-499人'\n",
1131 | "}\n",
1132 | "\n",
1133 | "degree_shift = {\n",
1134 | " '中专/中技': '中专',\n",
1135 | " '中技': '中专',\n",
1136 | " '中专': '中专',\n",
1137 | " '高中': '高中',\n",
1138 | " '大专': '大专',\n",
1139 | " '本科': '本科',\n",
1140 | " '硕士': '硕士',\n",
1141 | " '博士': '博士',\n",
1142 | " '不限': '学历不限',\n",
1143 | " '学历不限': '学历不限'\n",
1144 | "}"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "code",
1149 | "execution_count": 294,
1150 | "metadata": {},
1151 | "outputs": [],
1152 | "source": [
1153 | "jobs_data_copy['工作经验'] = jobs_data_copy['工作经验'].map(experience_shift)\n",
1154 | "jobs_data_copy['公司规模'] = jobs_data_copy['公司规模'].map(scale_shift)\n",
1155 | "jobs_data_copy['学历要求'] = jobs_data_copy['学历要求'].map(degree_shift)"
1156 | ]
1157 | },
1158 | {
1159 | "cell_type": "code",
1160 | "execution_count": 295,
1161 | "metadata": {},
1162 | "outputs": [
1163 | {
1164 | "data": {
1165 | "text/plain": [
1166 | "1-3年 5219\n",
1167 | "3-5年 4271\n",
1168 | "经验不限 1309\n",
1169 | "5-10年 825\n",
1170 | "1年以内 825\n",
1171 | "Name: 工作经验, dtype: int64"
1172 | ]
1173 | },
1174 | "execution_count": 295,
1175 | "metadata": {},
1176 | "output_type": "execute_result"
1177 | }
1178 | ],
1179 | "source": [
1180 | "jobs_data_copy['工作经验'].value_counts()"
1181 | ]
1182 | },
1183 | {
1184 | "cell_type": "code",
1185 | "execution_count": 296,
1186 | "metadata": {},
1187 | "outputs": [
1188 | {
1189 | "data": {
1190 | "text/plain": [
1191 | "100-499人 3843\n",
1192 | "20-99人 2848\n",
1193 | "1000-9999人 2503\n",
1194 | "10000人以上 1648\n",
1195 | "500-999人 1292\n",
1196 | "0-20人 315\n",
1197 | "Name: 公司规模, dtype: int64"
1198 | ]
1199 | },
1200 | "execution_count": 296,
1201 | "metadata": {},
1202 | "output_type": "execute_result"
1203 | }
1204 | ],
1205 | "source": [
1206 | "jobs_data_copy['公司规模'].value_counts()"
1207 | ]
1208 | },
1209 | {
1210 | "cell_type": "code",
1211 | "execution_count": 297,
1212 | "metadata": {},
1213 | "outputs": [
1214 | {
1215 | "data": {
1216 | "text/plain": [
1217 | "本科 8941\n",
1218 | "大专 2479\n",
1219 | "硕士 564\n",
1220 | "学历不限 420\n",
1221 | "中专 21\n",
1222 | "博士 13\n",
1223 | "高中 11\n",
1224 | "Name: 学历要求, dtype: int64"
1225 | ]
1226 | },
1227 | "execution_count": 297,
1228 | "metadata": {},
1229 | "output_type": "execute_result"
1230 | }
1231 | ],
1232 | "source": [
1233 | "jobs_data_copy['学历要求'].value_counts()"
1234 | ]
1235 | },
1236 | {
1237 | "cell_type": "markdown",
1238 | "metadata": {},
1239 | "source": [
1240 | "# 划分训练集与测试集"
1241 | ]
1242 | },
1243 | {
1244 | "cell_type": "code",
1245 | "execution_count": 298,
1246 | "metadata": {
1247 | "collapsed": true
1248 | },
1249 | "outputs": [],
1250 | "source": [
1251 | "from sklearn.model_selection import train_test_split\n",
1252 | "\n",
1253 | "train_set, test_set = train_test_split(jobs_data_copy, test_size=0.2, random_state=42)"
1254 | ]
1255 | },
1256 | {
1257 | "cell_type": "code",
1258 | "execution_count": 299,
1259 | "metadata": {
1260 | "collapsed": true
1261 | },
1262 | "outputs": [],
1263 | "source": [
1264 | "datas_train = train_set.copy()\n",
1265 | "datas_test = test_set.copy()"
1266 | ]
1267 | },
1268 | {
1269 | "cell_type": "code",
1270 | "execution_count": 300,
1271 | "metadata": {},
1272 | "outputs": [
1273 | {
1274 | "data": {
1275 | "text/html": [
1276 | "\n",
1277 | "\n",
1290 | "
\n",
1291 | " \n",
1292 | " \n",
1293 | " | \n",
1294 | " 城市 | \n",
1295 | " 职位名称 | \n",
1296 | " 工作经验 | \n",
1297 | " 公司规模 | \n",
1298 | " 学历要求 | \n",
1299 | " salary | \n",
1300 | "
\n",
1301 | " \n",
1302 | " \n",
1303 | " \n",
1304 | " 21110 | \n",
1305 | " 南阳 | \n",
1306 | " 数据分析 | \n",
1307 | " 1-3年 | \n",
1308 | " 20-99人 | \n",
1309 | " 本科 | \n",
1310 | " 6.5 | \n",
1311 | "
\n",
1312 | " \n",
1313 | " 56033 | \n",
1314 | " 云浮 | \n",
1315 | " 化妆品数据分析员 | \n",
1316 | " 1-3年 | \n",
1317 | " 100-499人 | \n",
1318 | " 本科 | \n",
1319 | " 8.5 | \n",
1320 | "
\n",
1321 | " \n",
1322 | " 37705 | \n",
1323 | " 泉州 | \n",
1324 | " 数据分析专员 | \n",
1325 | " 1-3年 | \n",
1326 | " 100-499人 | \n",
1327 | " 本科 | \n",
1328 | " 6.5 | \n",
1329 | "
\n",
1330 | " \n",
1331 | " 24622 | \n",
1332 | " 淮安 | \n",
1333 | " 数据分析 | \n",
1334 | " 1-3年 | \n",
1335 | " 1000-9999人 | \n",
1336 | " 大专 | \n",
1337 | " 6.5 | \n",
1338 | "
\n",
1339 | " \n",
1340 | " 37106 | \n",
1341 | " 宁德 | \n",
1342 | " 数据分析师(福州) | \n",
1343 | " 1-3年 | \n",
1344 | " 100-499人 | \n",
1345 | " 本科 | \n",
1346 | " 11.5 | \n",
1347 | "
\n",
1348 | " \n",
1349 | "
\n",
1350 | "
"
1351 | ],
1352 | "text/plain": [
1353 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n",
1354 | "21110 南阳 数据分析 1-3年 20-99人 本科 6.5\n",
1355 | "56033 云浮 化妆品数据分析员 1-3年 100-499人 本科 8.5\n",
1356 | "37705 泉州 数据分析专员 1-3年 100-499人 本科 6.5\n",
1357 | "24622 淮安 数据分析 1-3年 1000-9999人 大专 6.5\n",
1358 | "37106 宁德 数据分析师(福州) 1-3年 100-499人 本科 11.5"
1359 | ]
1360 | },
1361 | "execution_count": 300,
1362 | "metadata": {},
1363 | "output_type": "execute_result"
1364 | }
1365 | ],
1366 | "source": [
1367 | "datas_train.head()"
1368 | ]
1369 | },
1370 | {
1371 | "cell_type": "markdown",
1372 | "metadata": {},
1373 | "source": [
1374 | "## 为 标签/类别 属性编码"
1375 | ]
1376 | },
1377 | {
1378 | "cell_type": "markdown",
1379 | "metadata": {},
1380 | "source": [
1381 | "### 标签/类别 训练集/测试集划分"
1382 | ]
1383 | },
1384 | {
1385 | "cell_type": "code",
1386 | "execution_count": 301,
1387 | "metadata": {},
1388 | "outputs": [
1389 | {
1390 | "data": {
1391 | "text/html": [
1392 | "\n",
1393 | "\n",
1406 | "
\n",
1407 | " \n",
1408 | " \n",
1409 | " | \n",
1410 | " 城市 | \n",
1411 | " 公司规模 | \n",
1412 | " 学历要求 | \n",
1413 | " 工作经验 | \n",
1414 | "
\n",
1415 | " \n",
1416 | " \n",
1417 | " \n",
1418 | " 21110 | \n",
1419 | " 南阳 | \n",
1420 | " 20-99人 | \n",
1421 | " 本科 | \n",
1422 | " 1-3年 | \n",
1423 | "
\n",
1424 | " \n",
1425 | " 56033 | \n",
1426 | " 云浮 | \n",
1427 | " 100-499人 | \n",
1428 | " 本科 | \n",
1429 | " 1-3年 | \n",
1430 | "
\n",
1431 | " \n",
1432 | " 37705 | \n",
1433 | " 泉州 | \n",
1434 | " 100-499人 | \n",
1435 | " 本科 | \n",
1436 | " 1-3年 | \n",
1437 | "
\n",
1438 | " \n",
1439 | " 24622 | \n",
1440 | " 淮安 | \n",
1441 | " 1000-9999人 | \n",
1442 | " 大专 | \n",
1443 | " 1-3年 | \n",
1444 | "
\n",
1445 | " \n",
1446 | " 37106 | \n",
1447 | " 宁德 | \n",
1448 | " 100-499人 | \n",
1449 | " 本科 | \n",
1450 | " 1-3年 | \n",
1451 | "
\n",
1452 | " \n",
1453 | "
\n",
1454 | "
"
1455 | ],
1456 | "text/plain": [
1457 | " 城市 公司规模 学历要求 工作经验\n",
1458 | "21110 南阳 20-99人 本科 1-3年\n",
1459 | "56033 云浮 100-499人 本科 1-3年\n",
1460 | "37705 泉州 100-499人 本科 1-3年\n",
1461 | "24622 淮安 1000-9999人 大专 1-3年\n",
1462 | "37106 宁德 100-499人 本科 1-3年"
1463 | ]
1464 | },
1465 | "execution_count": 301,
1466 | "metadata": {},
1467 | "output_type": "execute_result"
1468 | }
1469 | ],
1470 | "source": [
1471 | "cata_train = datas_train[['城市','公司规模','学历要求','工作经验']] # 训练集\n",
1472 | "cata_test = datas_test[['城市','公司规模','学历要求','工作经验']] # 测试集\n",
1473 | "cata_train.head()"
1474 | ]
1475 | },
1476 | {
1477 | "cell_type": "code",
1478 | "execution_count": 302,
1479 | "metadata": {
1480 | "collapsed": true
1481 | },
1482 | "outputs": [],
1483 | "source": [
1484 | "from sklearn.preprocessing import OneHotEncoder\n",
1485 | "\n",
1486 | "cat_encoder = OneHotEncoder(sparse=False)"
1487 | ]
1488 | },
1489 | {
1490 | "cell_type": "code",
1491 | "execution_count": 303,
1492 | "metadata": {
1493 | "collapsed": true
1494 | },
1495 | "outputs": [],
1496 | "source": [
1497 | "job_cata_train = cat_encoder.set_params(handle_unknown='ignore').fit_transform(cata_train)  # fit on training data only\n",
1498 | "job_cata_test = cat_encoder.transform(cata_test)  # same columns as train; categories unseen in training encode as all zeros"
1499 | ]
1500 | },
1501 | {
1502 | "cell_type": "code",
1503 | "execution_count": 304,
1504 | "metadata": {},
1505 | "outputs": [
1506 | {
1507 | "data": {
1508 | "text/plain": [
1509 | "[array(['三明', '上海', '东沙群岛', '东莞', '东营', '中山', '临沂', '丽水', '乐山', '云浮', '亳州',\n",
1510 | " '仙桃', '佛山', '六安', '内江', '凉山', '北京', '十堰', '南京', '南充', '南平', '南通',\n",
1511 | " '南阳', '厦门', '台州', '合肥', '周口', '咸宁', '咸阳', '商丘', '商洛', '嘉兴', '大连',\n",
1512 | " '天津', '天门', '威海', '娄底', '孝感', '宁德', '安庆', '安康', '安阳', '宜宾', '宜昌',\n",
1513 | " '宣城', '宿州', '宿迁', '岳阳', '巴中', '常州', '常德', '平顶山', '广元', '广安', '广州',\n",
1514 | " '开封', '张家界', '徐州', '德阳', '怀化', '恩施', '惠州', '成都', '扬州', '揭阳', '攀枝花',\n",
1515 | " '新乡', '日照', '杭州', '枣庄', '株洲', '梅州', '武汉', '永州', '汕头', '汕尾', '江门',\n",
1516 | " '池州', '河源', '泉州', '泰安', '泰州', '泸州', '洛阳', '济南', '济宁', '淄博', '淮北',\n",
1517 | " '淮安', '深圳', '清远', '温州', '湖州', '湘潭', '湘西', '湛江', '滨州', '漯河', '漳州',\n",
1518 | " '潍坊', '潜江', '潮州', '烟台', '珠海', '甘孜', '益阳', '盐城', '眉山', '神农架', '福州',\n",
1519 | " '绍兴', '绵阳', '聊城', '肇庆', '自贡', '舟山', '芜湖', '苏州', '茂名', '荆州', '荆门',\n",
1520 | " '莆田', '莱芜', '菏泽', '衡阳', '衢州', '襄阳', '西安', '资阳', '达州', '连云港', '遂宁',\n",
1521 | " '邵阳', '郑州', '郴州', '鄂州', '金华', '铜川', '镇江', '长沙', '阜阳', '阳江', '阿坝',\n",
1522 | " '随州', '雅安', '韶关', '黄冈', '黄山', '黄石', '龙岩'], dtype=object),\n",
1523 | " array(['0-20人', '100-499人', '1000-9999人', '10000人以上', '20-99人',\n",
1524 | " '500-999人'], dtype=object),\n",
1525 | " array(['中专', '博士', '大专', '学历不限', '本科', '硕士', '高中'], dtype=object),\n",
1526 | " array(['1-3年', '1年以内', '3-5年', '5-10年', '经验不限'], dtype=object)]"
1527 | ]
1528 | },
1529 | "execution_count": 304,
1530 | "metadata": {},
1531 | "output_type": "execute_result"
1532 | }
1533 | ],
1534 | "source": [
1535 | "cat_encoder.categories_"
1536 | ]
1537 | },
1538 | {
1539 | "cell_type": "markdown",
1540 | "metadata": {},
1541 | "source": [
1542 | "### 参数构造函数"
1543 | ]
1544 | },
1545 | {
1546 | "cell_type": "code",
1547 | "execution_count": 378,
1548 | "metadata": {},
1549 | "outputs": [
1550 | {
1551 | "data": {
1552 | "text/plain": [
1553 | "array(['三明', '上海', '东沙群岛', '东莞', '东营', '中山', '临沂', '丽水', '乐山', '云浮', '亳州',\n",
1554 | " '仙桃', '佛山', '六安', '内江', '凉山', '北京', '十堰', '南京', '南充', '南平', '南通',\n",
1555 | " '南阳', '厦门', '台州', '合肥', '周口', '咸宁', '咸阳', '商丘', '商洛', '嘉兴', '大连',\n",
1556 | " '天津', '天门', '威海', '娄底', '孝感', '宁德', '安庆', '安康', '安阳', '宜宾', '宜昌',\n",
1557 | " '宣城', '宿州', '宿迁', '岳阳', '巴中', '常州', '常德', '平顶山', '广元', '广安', '广州',\n",
1558 | " '开封', '张家界', '徐州', '德阳', '怀化', '恩施', '惠州', '成都', '扬州', '揭阳', '攀枝花',\n",
1559 | " '新乡', '日照', '杭州', '枣庄', '株洲', '梅州', '武汉', '永州', '汕头', '汕尾', '江门',\n",
1560 | " '池州', '河源', '泉州', '泰安', '泰州', '泸州', '洛阳', '济南', '济宁', '淄博', '淮北',\n",
1561 | " '淮安', '深圳', '清远', '温州', '湖州', '湘潭', '湘西', '湛江', '滨州', '漯河', '漳州',\n",
1562 | " '潍坊', '潜江', '潮州', '烟台', '珠海', '甘孜', '益阳', '盐城', '眉山', '神农架', '福州',\n",
1563 | " '绍兴', '绵阳', '聊城', '肇庆', '自贡', '舟山', '芜湖', '苏州', '茂名', '荆州', '荆门',\n",
1564 | " '莆田', '莱芜', '菏泽', '衡阳', '衢州', '襄阳', '西安', '资阳', '达州', '连云港', '遂宁',\n",
1565 | " '邵阳', '郑州', '郴州', '鄂州', '金华', '铜川', '镇江', '长沙', '阜阳', '阳江', '阿坝',\n",
1566 | " '随州', '雅安', '韶关', '黄冈', '黄山', '黄石', '龙岩'], dtype=object)"
1567 | ]
1568 | },
1569 | "execution_count": 378,
1570 | "metadata": {},
1571 | "output_type": "execute_result"
1572 | }
1573 | ],
1574 | "source": [
1575 | "cat_encoder.categories_[0] "
1576 | ]
1577 | },
1578 | {
1579 | "cell_type": "code",
1580 | "execution_count": 444,
1581 | "metadata": {},
1582 | "outputs": [],
1583 | "source": [
1584 | "import numpy as np\n",
1585 | "\n",
1586 | "def func_params(templist):\n",
1587 | "    \"\"\"Build a one-hot feature row for one (city, scale, degree, exp) sample.\n",
1588 | "\n",
1589 | "    Each value is compared against the matching entry of\n",
1590 | "    ``cat_encoder.categories_`` (index 0-3) and the resulting 0/1 masks are\n",
1591 | "    concatenated in that order. A value that is not present in its category\n",
1592 | "    list produces an all-zero segment.\n",
1593 | "\n",
1594 | "    Returns a float array of shape (1, n), where n is the total number of\n",
1595 | "    categories across the four features.\n",
1596 | "    \"\"\"\n",
1597 | "    city, scale, degree, exp = templist\n",
1598 | "    values = [str(value) for value in (city, scale, degree, exp)]\n",
1599 | "    # The earlier version appended the company-scale mask three times and\n",
1600 | "    # never used the degree / experience masks.\n",
1601 | "    segments = [\n",
1602 | "        np.asarray(cat_encoder.categories_[i] == value, dtype=float)\n",
1603 | "        for i, value in enumerate(values)\n",
1604 | "    ]\n",
1605 | "    return np.concatenate(segments).reshape(1, -1)"
1625 | ]
1626 | },
1627 | {
1628 | "cell_type": "code",
1629 | "execution_count": 445,
1630 | "metadata": {},
1631 | "outputs": [],
1632 | "source": [
1633 | "paramlist = func_params(['上海','1000-9999人','硕士','1-3年'])\n",
1634 | "#paramlist"
1635 | ]
1636 | },
1637 | {
1638 | "cell_type": "code",
1639 | "execution_count": 372,
1640 | "metadata": {},
1641 | "outputs": [
1642 | {
1643 | "data": {
1644 | "text/plain": [
1645 | "array(['0-20人', '100-499人', '1000-9999人', '10000人以上', '20-99人',\n",
1646 | " '500-999人'], dtype=object)"
1647 | ]
1648 | },
1649 | "execution_count": 372,
1650 | "metadata": {},
1651 | "output_type": "execute_result"
1652 | }
1653 | ],
1654 | "source": [
1655 | "cat_encoder.categories_[1]"
1656 | ]
1657 | },
1658 | {
1659 | "cell_type": "code",
1660 | "execution_count": 432,
1661 | "metadata": {},
1662 | "outputs": [],
1663 | "source": [
1664 | "#cat_encoder.categories_[3]"
1665 | ]
1666 | },
1667 | {
1668 | "cell_type": "markdown",
1669 | "metadata": {},
1670 | "source": [
1671 | "# 用于机器学习的数据"
1672 | ]
1673 | },
1674 | {
1675 | "cell_type": "markdown",
1676 | "metadata": {},
1677 | "source": [
1678 | "## x_train,y_train"
1679 | ]
1680 | },
1681 | {
1682 | "cell_type": "code",
1683 | "execution_count": 305,
1684 | "metadata": {},
1685 | "outputs": [],
1686 | "source": [
1687 | "# 训练集\n",
1688 | "x_train = job_cata_train\n",
1689 | "y_train = datas_train['salary'].values.reshape(-1, 1)\n",
1690 | "#y_train"
1691 | ]
1692 | },
1693 | {
1694 | "cell_type": "code",
1695 | "execution_count": 439,
1696 | "metadata": {},
1697 | "outputs": [],
1698 | "source": [
1699 | "#x_train[0]"
1700 | ]
1701 | },
1702 | {
1703 | "cell_type": "code",
1704 | "execution_count": 306,
1705 | "metadata": {},
1706 | "outputs": [
1707 | {
1708 | "data": {
1709 | "text/plain": [
1710 | "(9959, 168)"
1711 | ]
1712 | },
1713 | "execution_count": 306,
1714 | "metadata": {},
1715 | "output_type": "execute_result"
1716 | }
1717 | ],
1718 | "source": [
1719 | "x_train.shape"
1720 | ]
1721 | },
1722 | {
1723 | "cell_type": "code",
1724 | "execution_count": 307,
1725 | "metadata": {},
1726 | "outputs": [
1727 | {
1728 | "data": {
1729 | "text/plain": [
1730 | "(9959, 1)"
1731 | ]
1732 | },
1733 | "execution_count": 307,
1734 | "metadata": {},
1735 | "output_type": "execute_result"
1736 | }
1737 | ],
1738 | "source": [
1739 | "y_train.shape"
1740 | ]
1741 | },
1742 | {
1743 | "cell_type": "markdown",
1744 | "metadata": {},
1745 | "source": [
1746 | "## x_test,y_test"
1747 | ]
1748 | },
1749 | {
1750 | "cell_type": "code",
1751 | "execution_count": 308,
1752 | "metadata": {
1753 | "collapsed": true
1754 | },
1755 | "outputs": [],
1756 | "source": [
1757 | "# 测试集\n",
1758 | "x_test = job_cata_test\n",
1759 | "y_test = datas_test['salary'].values.reshape(-1, 1)"
1760 | ]
1761 | },
1762 | {
1763 | "cell_type": "code",
1764 | "execution_count": 309,
1765 | "metadata": {},
1766 | "outputs": [
1767 | {
1768 | "data": {
1769 | "text/plain": [
1770 | "(2490, 168)"
1771 | ]
1772 | },
1773 | "execution_count": 309,
1774 | "metadata": {},
1775 | "output_type": "execute_result"
1776 | }
1777 | ],
1778 | "source": [
1779 | "x_test.shape"
1780 | ]
1781 | },
1782 | {
1783 | "cell_type": "code",
1784 | "execution_count": 310,
1785 | "metadata": {},
1786 | "outputs": [
1787 | {
1788 | "data": {
1789 | "text/plain": [
1790 | "(2490, 1)"
1791 | ]
1792 | },
1793 | "execution_count": 310,
1794 | "metadata": {},
1795 | "output_type": "execute_result"
1796 | }
1797 | ],
1798 | "source": [
1799 | "y_test.shape"
1800 | ]
1801 | },
1802 | {
1803 | "cell_type": "markdown",
1804 | "metadata": {},
1805 | "source": [
1806 | "# 机器学习建模"
1807 | ]
1808 | },
1809 | {
1810 | "cell_type": "markdown",
1811 | "metadata": {},
1812 | "source": [
1813 | "## 决策树"
1814 | ]
1815 | },
1816 | {
1817 | "cell_type": "code",
1818 | "execution_count": 311,
1819 | "metadata": {
1820 | "scrolled": true
1821 | },
1822 | "outputs": [
1823 | {
1824 | "data": {
1825 | "text/plain": [
1826 | "DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n",
1827 | " max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
1828 | " min_impurity_split=None, min_samples_leaf=1,\n",
1829 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
1830 | " presort=False, random_state=42, splitter='best')"
1831 | ]
1832 | },
1833 | "execution_count": 311,
1834 | "metadata": {},
1835 | "output_type": "execute_result"
1836 | }
1837 | ],
1838 | "source": [
1839 | "from sklearn.tree import DecisionTreeRegressor\n",
1840 | "\n",
1841 | "tree_reg = DecisionTreeRegressor(random_state=42)\n",
1842 | "tree_reg.fit(x_train, y_train)"
1843 | ]
1844 | },
1845 | {
1846 | "cell_type": "markdown",
1847 | "metadata": {},
1848 | "source": [
1849 | "### 检验"
1850 | ]
1851 | },
1852 | {
1853 | "cell_type": "markdown",
1854 | "metadata": {},
1855 | "source": [
1856 | "#### 训练集检验"
1857 | ]
1858 | },
1859 | {
1860 | "cell_type": "code",
1861 | "execution_count": 312,
1862 | "metadata": {
1863 | "scrolled": true
1864 | },
1865 | "outputs": [
1866 | {
1867 | "data": {
1868 | "text/plain": [
1869 | "array([[ 7.66666667],\n",
1870 | " [11.33333333],\n",
1871 | " [10. ],\n",
1872 | " [ 6.5 ],\n",
1873 | " [ 9.875 ],\n",
1874 | " [20. ],\n",
1875 | " [15.8125 ],\n",
1876 | " [ 8.38461538],\n",
1877 | " [ 7.8 ],\n",
1878 | " [10.25 ]])"
1879 | ]
1880 | },
1881 | "execution_count": 312,
1882 | "metadata": {},
1883 | "output_type": "execute_result"
1884 | }
1885 | ],
1886 | "source": [
1887 | "y_pred_tree = tree_reg.predict(x_train)\n",
1888 | "y_pred_tree[:10].reshape(10, 1)"
1889 | ]
1890 | },
1891 | {
1892 | "cell_type": "code",
1893 | "execution_count": 354,
1894 | "metadata": {},
1895 | "outputs": [],
1896 | "source": [
1897 | "#datas_train.head(10)"
1898 | ]
1899 | },
1900 | {
1901 | "cell_type": "code",
1902 | "execution_count": 314,
1903 | "metadata": {
1904 | "scrolled": true
1905 | },
1906 | "outputs": [
1907 | {
1908 | "data": {
1909 | "text/plain": [
1910 | "array([[ 6.5],\n",
1911 | " [ 8.5],\n",
1912 | " [ 6.5],\n",
1913 | " [ 6.5],\n",
1914 | " [11.5],\n",
1915 | " [20. ],\n",
1916 | " [16.5],\n",
1917 | " [ 7. ],\n",
1918 | " [ 6.5],\n",
1919 | " [ 8. ]])"
1920 | ]
1921 | },
1922 | "execution_count": 314,
1923 | "metadata": {},
1924 | "output_type": "execute_result"
1925 | }
1926 | ],
1927 | "source": [
1928 | "y_train[:10]"
1929 | ]
1930 | },
1931 | {
1932 | "cell_type": "code",
1933 | "execution_count": 315,
1934 | "metadata": {},
1935 | "outputs": [
1936 | {
1937 | "data": {
1938 | "text/plain": [
1939 | "3.0382473722545784"
1940 | ]
1941 | },
1942 | "execution_count": 315,
1943 | "metadata": {},
1944 | "output_type": "execute_result"
1945 | }
1946 | ],
1947 | "source": [
1948 | "from sklearn.metrics import mean_squared_error\n",
1949 | "import numpy as np\n",
1950 | "\n",
1951 | "tree_mse = mean_squared_error(y_train, y_pred_tree)\n",
1952 | "tree_rmse = np.sqrt(tree_mse)\n",
1953 | "tree_rmse"
1954 | ]
1955 | },
1956 | {
1957 | "cell_type": "markdown",
1958 | "metadata": {},
1959 | "source": [
1960 | "训练集预测误差:3.0382473722545784"
1961 | ]
1962 | },
1963 | {
1964 | "cell_type": "markdown",
1965 | "metadata": {},
1966 | "source": [
1967 | "#### 测试集检验"
1968 | ]
1969 | },
1970 | {
1971 | "cell_type": "code",
1972 | "execution_count": 316,
1973 | "metadata": {},
1974 | "outputs": [
1975 | {
1976 | "data": {
1977 | "text/plain": [
1978 | "4.782476424480568"
1979 | ]
1980 | },
1981 | "execution_count": 316,
1982 | "metadata": {},
1983 | "output_type": "execute_result"
1984 | }
1985 | ],
1986 | "source": [
1987 | "from sklearn.metrics import mean_squared_error\n",
1988 | "import numpy as np\n",
1989 | "\n",
1990 | "y_pred_tree_test = tree_reg.predict(x_test)\n",
1991 | "\n",
1992 | "tree_mse = mean_squared_error(y_test, y_pred_tree_test)\n",
1993 | "tree_rmse = np.sqrt(tree_mse)\n",
1994 | "tree_rmse"
1995 | ]
1996 | },
1997 | {
1998 | "cell_type": "markdown",
1999 | "metadata": {},
2000 | "source": [
2001 | "测试集误差:4.782476424480568"
2002 | ]
2003 | },
2004 | {
2005 | "cell_type": "markdown",
2006 | "metadata": {},
2007 | "source": [
2008 | "### 交叉验证"
2009 | ]
2010 | },
2011 | {
2012 | "cell_type": "code",
2013 | "execution_count": 317,
2014 | "metadata": {
2015 | "collapsed": true
2016 | },
2017 | "outputs": [],
2018 | "source": [
2019 | "from sklearn.model_selection import cross_val_score\n",
2020 | "\n",
2021 | "scores = cross_val_score(tree_reg, x_train, y_train,\n",
2022 | " scoring=\"neg_mean_squared_error\", cv=10)\n",
2023 | "tree_rmse_scores = np.sqrt(-scores)"
2024 | ]
2025 | },
2026 | {
2027 | "cell_type": "code",
2028 | "execution_count": 318,
2029 | "metadata": {},
2030 | "outputs": [
2031 | {
2032 | "name": "stdout",
2033 | "output_type": "stream",
2034 | "text": [
2035 | "Scores: [4.62433783 4.74066502 4.8478161 4.60423202 4.78729759 4.55902835\n",
2036 | " 4.47123285 4.46471566 4.62152351 4.81591793]\n",
2037 | "Mean: 4.653676687092153\n",
2038 | "Standard deviation: 0.1310433320106498\n"
2039 | ]
2040 | }
2041 | ],
2042 | "source": [
2043 | "def display_scores(scores):\n",
2044 | "    \"\"\"Print the raw scores, then their mean and standard deviation.\"\"\"\n",
2045 | "    print(\"Scores:\", scores)\n",
2046 | "    for label, statistic in ((\"Mean:\", scores.mean), (\"Standard deviation:\", scores.std)):\n",
2047 | "        print(label, statistic())\n",
2047 | "\n",
2048 | "display_scores(tree_rmse_scores)"
2049 | ]
2050 | },
2051 | {
2052 | "cell_type": "markdown",
2053 | "metadata": {},
2054 | "source": [
2055 | "## 随机森林"
2056 | ]
2057 | },
2058 | {
2059 | "cell_type": "code",
2060 | "execution_count": 319,
2061 | "metadata": {},
2062 | "outputs": [
2063 | {
2064 | "name": "stderr",
2065 | "output_type": "stream",
2066 | "text": [
2067 | "C:\\Users\\13626\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
2068 | " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n",
2069 | "C:\\Users\\13626\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
2070 | " after removing the cwd from sys.path.\n"
2071 | ]
2072 | },
2073 | {
2074 | "data": {
2075 | "text/plain": [
2076 | "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
2077 | " max_features='auto', max_leaf_nodes=None,\n",
2078 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
2079 | " min_samples_leaf=1, min_samples_split=2,\n",
2080 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n",
2081 | " oob_score=False, random_state=42, verbose=0, warm_start=False)"
2082 | ]
2083 | },
2084 | "execution_count": 319,
2085 | "metadata": {},
2086 | "output_type": "execute_result"
2087 | }
2088 | ],
2089 | "source": [
2090 | "from sklearn.ensemble import RandomForestRegressor\n",
2091 | "\n",
2092 | "forest_reg = RandomForestRegressor(random_state=42)\n",
2093 | "forest_reg.fit(x_train, y_train.ravel())  # pass a 1-D target; a column vector triggers DataConversionWarning"
2094 | ]
2095 | },
2096 | {
2097 | "cell_type": "markdown",
2098 | "metadata": {},
2099 | "source": [
2100 | "### 训练集检验"
2101 | ]
2102 | },
2103 | {
2104 | "cell_type": "code",
2105 | "execution_count": 320,
2106 | "metadata": {},
2107 | "outputs": [
2108 | {
2109 | "data": {
2110 | "text/plain": [
2111 | "3.1980674188547304"
2112 | ]
2113 | },
2114 | "execution_count": 320,
2115 | "metadata": {},
2116 | "output_type": "execute_result"
2117 | }
2118 | ],
2119 | "source": [
2120 | "y_pred_rf = forest_reg.predict(x_train)\n",
2121 | "forest_mse = mean_squared_error(y_train, y_pred_rf)\n",
2122 | "forest_rmse = np.sqrt(forest_mse)\n",
2123 | "forest_rmse"
2124 | ]
2125 | },
2126 | {
2127 | "cell_type": "markdown",
2128 | "metadata": {},
2129 | "source": [
2130 | "训练集均方根误差(RMSE):3.1980674188547304"
2131 | ]
2132 | },
2133 | {
2134 | "cell_type": "markdown",
2135 | "metadata": {},
2136 | "source": [
2137 | "### 测试集检验"
2138 | ]
2139 | },
2140 | {
2141 | "cell_type": "code",
2142 | "execution_count": 322,
2143 | "metadata": {},
2144 | "outputs": [
2145 | {
2146 | "data": {
2147 | "text/plain": [
2148 | "4.5536604518702575"
2149 | ]
2150 | },
2151 | "execution_count": 322,
2152 | "metadata": {},
2153 | "output_type": "execute_result"
2154 | }
2155 | ],
2156 | "source": [
2157 | "y_pred_rf_test = forest_reg.predict(x_test)\n",
2158 | "forest_mse = mean_squared_error(y_test, y_pred_rf_test)\n",
2159 | "forest_rmse = np.sqrt(forest_mse)\n",
2160 | "forest_rmse"
2161 | ]
2162 | },
2163 | {
2164 | "cell_type": "markdown",
2165 | "metadata": {},
2166 | "source": [
2167 | "测试集均方根误差(RMSE):4.5536604518702575"
2168 | ]
2169 | },
2170 | {
2171 | "cell_type": "code",
2172 | "execution_count": 355,
2173 | "metadata": {},
2174 | "outputs": [],
2175 | "source": [
2176 | "#datas_test.head(10)"
2177 | ]
2178 | },
2179 | {
2180 | "cell_type": "code",
2181 | "execution_count": 324,
2182 | "metadata": {},
2183 | "outputs": [
2184 | {
2185 | "data": {
2186 | "text/plain": [
2187 | "array([[ 7.73 ],\n",
2188 | " [11.4861727 ],\n",
2189 | " [ 9.33285714],\n",
2190 | " [ 7.4 ],\n",
2191 | " [ 9.275 ],\n",
2192 | " [16.1 ],\n",
2193 | " [15.60247655],\n",
2194 | " [ 8.45447109],\n",
2195 | " [ 8.05528666],\n",
2196 | " [11.12662698]])"
2197 | ]
2198 | },
2199 | "execution_count": 324,
2200 | "metadata": {},
2201 | "output_type": "execute_result"
2202 | }
2203 | ],
2204 | "source": [
2205 | "y_pred_rf[:10].reshape(10, 1)"
2206 | ]
2207 | },
2208 | {
2209 | "cell_type": "markdown",
2210 | "metadata": {},
2211 | "source": [
2212 | "# 变量重要性"
2213 | ]
2214 | },
2215 | {
2216 | "cell_type": "code",
2217 | "execution_count": 353,
2218 | "metadata": {},
2219 | "outputs": [
2220 | {
2221 | "data": {
2222 | "text/plain": [
2223 | "[(0.058026406821841056, '大专'),\n",
2224 | " (0.031263812251122784, '10000人以上'),\n",
2225 | " (0.020290832343490904, '1000-9999人'),\n",
2226 | " (0.018095521323951716, '20-99人'),\n",
2227 | " (0.015872193036716557, '100-499人'),\n",
2228 | " (0.015170706225377684, '硕士'),\n",
2229 | " (0.014925513158681906, '北京'),\n",
2230 | " (0.014193573528348177, '500-999人'),\n",
2231 | " (0.013911459035214633, '0-20人'),\n",
2232 | " (0.011434958956342767, '深圳'),\n",
2233 | " (0.011310119828051196, '本科'),\n",
2234 | " (0.01002918014973044, '丽水'),\n",
2235 | " (0.0096919398776096, '杭州'),\n",
2236 | " (0.009393127333069419, '温州'),\n",
2237 | " (0.00912590555299951, '学历不限'),\n",
2238 | " (0.009077835450701817, '台州'),\n",
2239 | " (0.008986219951521799, '舟山'),\n",
2240 | " (0.008834491150945197, '绍兴'),\n",
2241 | " (0.008656335871500801, '金华'),\n",
2242 | " (0.008413046832938394, '湖州')]"
2243 | ]
2244 | },
2245 | "execution_count": 353,
2246 | "metadata": {},
2247 | "output_type": "execute_result"
2248 | }
2249 | ],
2250 | "source": [
2251 | "# Label every one-hot column by flattening all category arrays of the encoder,\n",
2252 | "# so each importance value is paired with the category it belongs to.\n",
2253 | "# The earlier version stopped at categories_[2]; zip() then silently dropped\n",
2254 | "# the work-experience columns from the ranking.\n",
2255 | "labellist = [\n",
2256 | "    label\n",
2257 | "    for categories in cat_encoder.categories_\n",
2258 | "    for label in categories\n",
2259 | "]\n",
2260 | "# Top 20 (importance, label) pairs, most important first.\n",
2261 | "sorted(zip(forest_reg.feature_importances_, labellist), reverse=True)[:20]"
2264 | ]
2265 | },
2266 | {
2267 | "cell_type": "code",
2268 | "execution_count": 369,
2269 | "metadata": {},
2270 | "outputs": [
2271 | {
2272 | "data": {
2273 | "text/html": [
2274 | "\n",
2275 | "\n",
2288 | "
\n",
2289 | " \n",
2290 | " \n",
2291 | " | \n",
2292 | " 重要性 | \n",
2293 | "
\n",
2294 | " \n",
2295 | " \n",
2296 | " \n",
2297 | " 大专 | \n",
2298 | " 0.058026 | \n",
2299 | "
\n",
2300 | " \n",
2301 | " 10000人以上 | \n",
2302 | " 0.031264 | \n",
2303 | "
\n",
2304 | " \n",
2305 | " 1000-9999人 | \n",
2306 | " 0.020291 | \n",
2307 | "
\n",
2308 | " \n",
2309 | " 20-99人 | \n",
2310 | " 0.018096 | \n",
2311 | "
\n",
2312 | " \n",
2313 | " 100-499人 | \n",
2314 | " 0.015872 | \n",
2315 | "
\n",
2316 | " \n",
2317 | " 硕士 | \n",
2318 | " 0.015171 | \n",
2319 | "
\n",
2320 | " \n",
2321 | " 北京 | \n",
2322 | " 0.014926 | \n",
2323 | "
\n",
2324 | " \n",
2325 | " 500-999人 | \n",
2326 | " 0.014194 | \n",
2327 | "
\n",
2328 | " \n",
2329 | " 0-20人 | \n",
2330 | " 0.013911 | \n",
2331 | "
\n",
2332 | " \n",
2333 | " 深圳 | \n",
2334 | " 0.011435 | \n",
2335 | "
\n",
2336 | " \n",
2337 | " 本科 | \n",
2338 | " 0.011310 | \n",
2339 | "
\n",
2340 | " \n",
2341 | " 丽水 | \n",
2342 | " 0.010029 | \n",
2343 | "
\n",
2344 | " \n",
2345 | " 杭州 | \n",
2346 | " 0.009692 | \n",
2347 | "
\n",
2348 | " \n",
2349 | " 温州 | \n",
2350 | " 0.009393 | \n",
2351 | "
\n",
2352 | " \n",
2353 | " 学历不限 | \n",
2354 | " 0.009126 | \n",
2355 | "
\n",
2356 | " \n",
2357 | " 台州 | \n",
2358 | " 0.009078 | \n",
2359 | "
\n",
2360 | " \n",
2361 | " 舟山 | \n",
2362 | " 0.008986 | \n",
2363 | "
\n",
2364 | " \n",
2365 | " 绍兴 | \n",
2366 | " 0.008834 | \n",
2367 | "
\n",
2368 | " \n",
2369 | " 金华 | \n",
2370 | " 0.008656 | \n",
2371 | "
\n",
2372 | " \n",
2373 | " 湖州 | \n",
2374 | " 0.008413 | \n",
2375 | "
\n",
2376 | " \n",
2377 | "
\n",
2378 | "
"
2379 | ],
2380 | "text/plain": [
2381 | " 重要性\n",
2382 | "大专 0.058026\n",
2383 | "10000人以上 0.031264\n",
2384 | "1000-9999人 0.020291\n",
2385 | "20-99人 0.018096\n",
2386 | "100-499人 0.015872\n",
2387 | "硕士 0.015171\n",
2388 | "北京 0.014926\n",
2389 | "500-999人 0.014194\n",
2390 | "0-20人 0.013911\n",
2391 | "深圳 0.011435\n",
2392 | "本科 0.011310\n",
2393 | "丽水 0.010029\n",
2394 | "杭州 0.009692\n",
2395 | "温州 0.009393\n",
2396 | "学历不限 0.009126\n",
2397 | "台州 0.009078\n",
2398 | "舟山 0.008986\n",
2399 | "绍兴 0.008834\n",
2400 | "金华 0.008656\n",
2401 | "湖州 0.008413"
2402 | ]
2403 | },
2404 | "execution_count": 369,
2405 | "metadata": {},
2406 | "output_type": "execute_result"
2407 | }
2408 | ],
2409 | "source": [
2410 | "from pandas import DataFrame\n",
2411 | "\n",
2412 | "# Turn the 20 most important features into a DataFrame indexed by feature label.\n",
2413 | "# The ranking is sorted once and reused for both columns.\n",
2414 | "top_features = sorted(zip(forest_reg.feature_importances_, labellist), reverse=True)[:20]\n",
2415 | "list1 = [label for _, label in top_features]  # feature labels, used as the index\n",
2416 | "list2 = [importance for importance, _ in top_features]  # matching importance scores\n",
2417 | "df_importance = DataFrame(list2, index=list1, columns=['重要性'])\n",
2418 | "df_importance"
2420 | ]
2421 | },
2422 | {
2423 | "cell_type": "markdown",
2424 | "metadata": {},
2425 | "source": [
2426 | "# 变量重要性分析"
2427 | ]
2428 | },
2429 | {
2430 | "cell_type": "markdown",
2431 | "metadata": {},
2432 | "source": [
2433 | "## 选取前20个重要变量"
2434 | ]
2435 | },
2436 | {
2437 | "cell_type": "markdown",
2438 | "metadata": {},
2439 | "source": [
2440 | "1.很多公司招聘时往往将学历要求设置为'大专'(即不设门槛,只挑选有能力者)"
2441 | ]
2442 | },
2443 | {
2444 | "cell_type": "markdown",
2445 | "metadata": {},
2446 | "source": [
2447 | "2.公司规模是影响数据分析师薪资方面的重要变量,其中'10000人以上'的大公司在薪资高低方面的影响力最大。"
2448 | ]
2449 | },
2450 | {
2451 | "cell_type": "markdown",
2452 | "metadata": {},
2453 | "source": [
2454 | "3.也有许多公司将'硕士学历'设为数据分析师的门槛,因此'硕士学历'在数据分析师薪资方面影响较大。"
2455 | ]
2456 | },
2457 | {
2458 | "cell_type": "markdown",
2459 | "metadata": {},
2460 | "source": [
2461 | "4.城市/地域方面:可以看到北京、深圳的职位对薪资的影响最大;其次,丽水、杭州、温州、台州、舟山、绍兴、金华等均为浙江省的市,可以明显看出浙江省数据分析师的薪资待遇较高。"
2462 | ]
2463 | },
2464 | {
2465 | "cell_type": "code",
2466 | "execution_count": 370,
2467 | "metadata": {},
2468 | "outputs": [],
2469 | "source": [
2470 | "#datas_train[datas_train['学历要求'] == '大专'].head()"
2471 | ]
2472 | },
2473 | {
2474 | "cell_type": "markdown",
2475 | "metadata": {},
2476 | "source": [
2477 | "# 自定义变量预测"
2478 | ]
2479 | },
2480 | {
2481 | "cell_type": "code",
2482 | "execution_count": 451,
2483 | "metadata": {},
2484 | "outputs": [
2485 | {
2486 | "data": {
2487 | "text/plain": [
2488 | "[array(['三明', '上海', '东沙群岛', '东莞', '东营', '中山', '临沂', '丽水', '乐山', '云浮', '亳州',\n",
2489 | " '仙桃', '佛山', '六安', '内江', '凉山', '北京', '十堰', '南京', '南充', '南平', '南通',\n",
2490 | " '南阳', '厦门', '台州', '合肥', '周口', '咸宁', '咸阳', '商丘', '商洛', '嘉兴', '大连',\n",
2491 | " '天津', '天门', '威海', '娄底', '孝感', '宁德', '安庆', '安康', '安阳', '宜宾', '宜昌',\n",
2492 | " '宣城', '宿州', '宿迁', '岳阳', '巴中', '常州', '常德', '平顶山', '广元', '广安', '广州',\n",
2493 | " '开封', '张家界', '徐州', '德阳', '怀化', '恩施', '惠州', '成都', '扬州', '揭阳', '攀枝花',\n",
2494 | " '新乡', '日照', '杭州', '枣庄', '株洲', '梅州', '武汉', '永州', '汕头', '汕尾', '江门',\n",
2495 | " '池州', '河源', '泉州', '泰安', '泰州', '泸州', '洛阳', '济南', '济宁', '淄博', '淮北',\n",
2496 | " '淮安', '深圳', '清远', '温州', '湖州', '湘潭', '湘西', '湛江', '滨州', '漯河', '漳州',\n",
2497 | " '潍坊', '潜江', '潮州', '烟台', '珠海', '甘孜', '益阳', '盐城', '眉山', '神农架', '福州',\n",
2498 | " '绍兴', '绵阳', '聊城', '肇庆', '自贡', '舟山', '芜湖', '苏州', '茂名', '荆州', '荆门',\n",
2499 | " '莆田', '莱芜', '菏泽', '衡阳', '衢州', '襄阳', '西安', '资阳', '达州', '连云港', '遂宁',\n",
2500 | " '邵阳', '郑州', '郴州', '鄂州', '金华', '铜川', '镇江', '长沙', '阜阳', '阳江', '阿坝',\n",
2501 | " '随州', '雅安', '韶关', '黄冈', '黄山', '黄石', '龙岩'], dtype=object),\n",
2502 | " array(['0-20人', '100-499人', '1000-9999人', '10000人以上', '20-99人',\n",
2503 | " '500-999人'], dtype=object),\n",
2504 | " array(['中专', '博士', '大专', '学历不限', '本科', '硕士', '高中'], dtype=object),\n",
2505 | " array(['1-3年', '1年以内', '3-5年', '5-10年', '经验不限'], dtype=object)]"
2506 | ]
2507 | },
2508 | "execution_count": 451,
2509 | "metadata": {},
2510 | "output_type": "execute_result"
2511 | }
2512 | ],
2513 | "source": [
2514 | "cat_encoder.categories_ #可选预测变量"
2515 | ]
2516 | },
2517 | {
2518 | "cell_type": "markdown",
2519 | "metadata": {},
2520 | "source": [
2521 | "## 决策树预测 "
2522 | ]
2523 | },
2524 | {
2525 | "cell_type": "code",
2526 | "execution_count": 452,
2527 | "metadata": {},
2528 | "outputs": [
2529 | {
2530 | "data": {
2531 | "text/plain": [
2532 | "array([15.])"
2533 | ]
2534 | },
2535 | "execution_count": 452,
2536 | "metadata": {},
2537 | "output_type": "execute_result"
2538 | }
2539 | ],
2540 | "source": [
2541 | "#paramlist = func_params(['上海','1000-9999人','硕士','1-3年'])\n",
2542 | "Y_pred_dt = tree_reg.predict(paramlist)\n",
2543 | "Y_pred_dt"
2544 | ]
2545 | },
2546 | {
2547 | "cell_type": "code",
2548 | "execution_count": 458,
2549 | "metadata": {},
2550 | "outputs": [
2551 | {
2552 | "data": {
2553 | "text/plain": [
2554 | "array([13.5])"
2555 | ]
2556 | },
2557 | "execution_count": 458,
2558 | "metadata": {},
2559 | "output_type": "execute_result"
2560 | }
2561 | ],
2562 | "source": [
2563 | "paramlist2 = func_params(['北京','500-999人','本科','1-3年'])\n",
2564 | "tree_reg.predict(paramlist2)"
2565 | ]
2566 | },
2567 | {
2568 | "cell_type": "markdown",
2569 | "metadata": {},
2570 | "source": [
2571 | "## 随机森林预测"
2572 | ]
2573 | },
2574 | {
2575 | "cell_type": "code",
2576 | "execution_count": 469,
2577 | "metadata": {},
2578 | "outputs": [
2579 | {
2580 | "data": {
2581 | "text/plain": [
2582 | "array([12.63888889])"
2583 | ]
2584 | },
2585 | "execution_count": 469,
2586 | "metadata": {},
2587 | "output_type": "execute_result"
2588 | }
2589 | ],
2590 | "source": [
2591 | "paramlist3 = func_params(['深圳','100-499人','硕士','1年以内'])\n",
2592 | "forest_reg.predict(paramlist3)  # random-forest section: query forest_reg rather than the decision tree"
2593 | ]
2594 | },
2595 | {
2596 | "cell_type": "code",
2597 | "execution_count": 476,
2598 | "metadata": {},
2599 | "outputs": [
2600 | {
2601 | "data": {
2602 | "text/plain": [
2603 | "array([15.])"
2604 | ]
2605 | },
2606 | "execution_count": 476,
2607 | "metadata": {},
2608 | "output_type": "execute_result"
2609 | }
2610 | ],
2611 | "source": [
2612 | "paramlist4 = func_params(['上海','1000-9999人','硕士','1年以内'])\n",
2613 | "forest_reg.predict(paramlist4)  # random-forest section: query forest_reg rather than the decision tree"
2614 | ]
2615 | },
2616 | {
2617 | "cell_type": "code",
2618 | "execution_count": null,
2619 | "metadata": {
2620 | "collapsed": true
2621 | },
2622 | "outputs": [],
2623 | "source": []
2624 | }
2625 | ],
2626 | "metadata": {
2627 | "kernelspec": {
2628 | "display_name": "Python 3",
2629 | "language": "python",
2630 | "name": "python3"
2631 | },
2632 | "language_info": {
2633 | "codemirror_mode": {
2634 | "name": "ipython",
2635 | "version": 3
2636 | },
2637 | "file_extension": ".py",
2638 | "mimetype": "text/x-python",
2639 | "name": "python",
2640 | "nbconvert_exporter": "python",
2641 | "pygments_lexer": "ipython3",
2642 | "version": "3.6.3"
2643 | }
2644 | },
2645 | "nbformat": 4,
2646 | "nbformat_minor": 2
2647 | }
2648 |
--------------------------------------------------------------------------------
/city_data/city.csv:
--------------------------------------------------------------------------------
1 | ,code,市,省
2 | 0,101010100,北京,北京
3 | 1,101020100,上海,上海
4 | 2,101030100,天津,天津
5 | 3,101040100,重庆,重庆
6 | 4,101050100,哈尔滨,黑龙江
7 | 5,101050200,齐齐哈尔,黑龙江
8 | 6,101050300,牡丹江,黑龙江
9 | 7,101050400,佳木斯,黑龙江
10 | 8,101050500,绥化,黑龙江
11 | 9,101050600,黑河,黑龙江
12 | 10,101050700,伊春,黑龙江
13 | 11,101050800,大庆,黑龙江
14 | 12,101050900,七台河,黑龙江
15 | 13,101051000,鸡西,黑龙江
16 | 14,101051100,鹤岗,黑龙江
17 | 15,101051200,双鸭山,黑龙江
18 | 16,101051300,大兴安岭,黑龙江
19 | 17,101060100,长春,吉林
20 | 18,101060200,吉林,吉林
21 | 19,101060300,四平,吉林
22 | 20,101060400,通化,吉林
23 | 21,101060500,白城,吉林
24 | 22,101060600,辽源,吉林
25 | 23,101060700,松原,吉林
26 | 24,101060800,白山,吉林
27 | 25,101060900,延边,吉林
28 | 26,101070100,沈阳,辽宁
29 | 27,101070200,大连,辽宁
30 | 28,101070300,鞍山,辽宁
31 | 29,101070400,抚顺,辽宁
32 | 30,101070500,本溪,辽宁
33 | 31,101070600,丹东,辽宁
34 | 32,101070700,锦州,辽宁
35 | 33,101070800,营口,辽宁
36 | 34,101070900,阜新,辽宁
37 | 35,101071000,辽阳,辽宁
38 | 36,101071100,铁岭,辽宁
39 | 37,101071200,朝阳,辽宁
40 | 38,101071300,盘锦,辽宁
41 | 39,101071400,葫芦岛,辽宁
42 | 40,101080100,呼和浩特,内蒙古
43 | 41,101080200,包头,内蒙古
44 | 42,101080300,乌海,内蒙古
45 | 43,101080400,通辽,内蒙古
46 | 44,101080500,赤峰,内蒙古
47 | 45,101080600,鄂尔多斯,内蒙古
48 | 46,101080700,呼伦贝尔,内蒙古
49 | 47,101080800,巴彦淖尔,内蒙古
50 | 48,101080900,乌兰察布,内蒙古
51 | 49,101081000,锡林郭勒,内蒙古
52 | 50,101081100,兴安盟,内蒙古
53 | 51,101081200,阿拉善,内蒙古
54 | 52,101090100,石家庄,河北
55 | 53,101090200,保定,河北
56 | 54,101090300,张家口,河北
57 | 55,101090400,承德,河北
58 | 56,101090500,唐山,河北
59 | 57,101090600,廊坊,河北
60 | 58,101090700,沧州,河北
61 | 59,101090800,衡水,河北
62 | 60,101090900,邢台,河北
63 | 61,101091000,邯郸,河北
64 | 62,101091100,秦皇岛,河北
65 | 63,101100100,太原,山西
66 | 64,101100200,大同,山西
67 | 65,101100300,阳泉,山西
68 | 66,101100400,晋中,山西
69 | 67,101100500,长治,山西
70 | 68,101100600,晋城,山西
71 | 69,101100700,临汾,山西
72 | 70,101100800,运城,山西
73 | 71,101100900,朔州,山西
74 | 72,101101000,忻州,山西
75 | 73,101101100,吕梁,山西
76 | 74,101110100,西安,陕西
77 | 75,101110200,咸阳,陕西
78 | 76,101110300,延安,陕西
79 | 77,101110400,榆林,陕西
80 | 78,101110500,渭南,陕西
81 | 79,101110600,商洛,陕西
82 | 80,101110700,安康,陕西
83 | 81,101110800,汉中,陕西
84 | 82,101110900,宝鸡,陕西
85 | 83,101111000,铜川,陕西
86 | 84,101120100,济南,山东
87 | 85,101120200,青岛,山东
88 | 86,101120300,淄博,山东
89 | 87,101120400,德州,山东
90 | 88,101120500,烟台,山东
91 | 89,101120600,潍坊,山东
92 | 90,101120700,济宁,山东
93 | 91,101120800,泰安,山东
94 | 92,101120900,临沂,山东
95 | 93,101121000,菏泽,山东
96 | 94,101121100,滨州,山东
97 | 95,101121200,东营,山东
98 | 96,101121300,威海,山东
99 | 97,101121400,枣庄,山东
100 | 98,101121500,日照,山东
101 | 99,101121600,莱芜,山东
102 | 100,101121700,聊城,山东
103 | 101,101130100,乌鲁木齐,新疆
104 | 102,101130200,克拉玛依,新疆
105 | 103,101130300,昌吉,新疆
106 | 104,101130400,巴音郭楞,新疆
107 | 105,101130500,博尔塔拉,新疆
108 | 106,101130600,伊犁,新疆
109 | 107,101130800,吐鲁番,新疆
110 | 108,101130900,哈密,新疆
111 | 109,101131000,阿克苏,新疆
112 | 110,101131100,克孜勒苏柯尔克孜,新疆
113 | 111,101131200,喀什,新疆
114 | 112,101131300,和田,新疆
115 | 113,101131400,塔城,新疆
116 | 114,101131500,阿勒泰,新疆
117 | 115,101131600,石河子,新疆
118 | 116,101131700,阿拉尔,新疆
119 | 117,101131800,图木舒克,新疆
120 | 118,101131900,五家渠,新疆
121 | 119,101132000,铁门关,新疆
122 | 120,101132100,北屯市,新疆
123 | 121,101132200,可克达拉市,新疆
124 | 122,101132300,昆玉市,新疆
125 | 123,101132400,双河市,新疆
126 | 124,101150100,西宁,青海
127 | 125,101150200,海东,青海
128 | 126,101150300,海北,青海
129 | 127,101150400,黄南,青海
130 | 128,101150500,海南,青海
131 | 129,101150600,果洛,青海
132 | 130,101150700,玉树,青海
133 | 131,101150800,海西,青海
134 | 132,101160100,兰州,甘肃
135 | 133,101160200,定西,甘肃
136 | 134,101160300,平凉,甘肃
137 | 135,101160400,庆阳,甘肃
138 | 136,101160500,武威,甘肃
139 | 137,101160600,金昌,甘肃
140 | 138,101160700,张掖,甘肃
141 | 139,101160800,酒泉,甘肃
142 | 140,101160900,天水,甘肃
143 | 141,101161000,白银,甘肃
144 | 142,101161100,陇南,甘肃
145 | 143,101161200,嘉峪关,甘肃
146 | 144,101161300,临夏,甘肃
147 | 145,101161400,甘南,甘肃
148 | 146,101170100,银川,宁夏
149 | 147,101170200,石嘴山,宁夏
150 | 148,101170300,吴忠,宁夏
151 | 149,101170400,固原,宁夏
152 | 150,101170500,中卫,宁夏
153 | 151,101180100,郑州,河南
154 | 152,101180200,安阳,河南
155 | 153,101180300,新乡,河南
156 | 154,101180400,许昌,河南
157 | 155,101180500,平顶山,河南
158 | 156,101180600,信阳,河南
159 | 157,101180700,南阳,河南
160 | 158,101180800,开封,河南
161 | 159,101180900,洛阳,河南
162 | 160,101181000,商丘,河南
163 | 161,101181100,焦作,河南
164 | 162,101181200,鹤壁,河南
165 | 163,101181300,濮阳,河南
166 | 164,101181400,周口,河南
167 | 165,101181500,漯河,河南
168 | 166,101181600,驻马店,河南
169 | 167,101181700,三门峡,河南
170 | 168,101181800,济源,河南
171 | 169,101190100,南京,江苏
172 | 170,101190200,无锡,江苏
173 | 171,101190300,镇江,江苏
174 | 172,101190400,苏州,江苏
175 | 173,101190500,南通,江苏
176 | 174,101190600,扬州,江苏
177 | 175,101190700,盐城,江苏
178 | 176,101190800,徐州,江苏
179 | 177,101190900,淮安,江苏
180 | 178,101191000,连云港,江苏
181 | 179,101191100,常州,江苏
182 | 180,101191200,泰州,江苏
183 | 181,101191300,宿迁,江苏
184 | 182,101200100,武汉,湖北
185 | 183,101200200,襄阳,湖北
186 | 184,101200300,鄂州,湖北
187 | 185,101200400,孝感,湖北
188 | 186,101200500,黄冈,湖北
189 | 187,101200600,黄石,湖北
190 | 188,101200700,咸宁,湖北
191 | 189,101200800,荆州,湖北
192 | 190,101200900,宜昌,湖北
193 | 191,101201000,十堰,湖北
194 | 192,101201100,随州,湖北
195 | 193,101201200,荆门,湖北
196 | 194,101201300,恩施,湖北
197 | 195,101201400,仙桃,湖北
198 | 196,101201500,潜江,湖北
199 | 197,101201600,天门,湖北
200 | 198,101201700,神农架,湖北
201 | 199,101210100,杭州,浙江
202 | 200,101210200,湖州,浙江
203 | 201,101210300,嘉兴,浙江
204 | 202,101210400,宁波,浙江
205 | 203,101210500,绍兴,浙江
206 | 204,101210600,台州,浙江
207 | 205,101210700,温州,浙江
208 | 206,101210800,丽水,浙江
209 | 207,101210900,金华,浙江
210 | 208,101211000,衢州,浙江
211 | 209,101211100,舟山,浙江
212 | 210,101220100,合肥,安徽
213 | 211,101220200,蚌埠,安徽
214 | 212,101220300,芜湖,安徽
215 | 213,101220400,淮南,安徽
216 | 214,101220500,马鞍山,安徽
217 | 215,101220600,安庆,安徽
218 | 216,101220700,宿州,安徽
219 | 217,101220800,阜阳,安徽
220 | 218,101220900,亳州,安徽
221 | 219,101221000,滁州,安徽
222 | 220,101221100,淮北,安徽
223 | 221,101221200,铜陵,安徽
224 | 222,101221300,宣城,安徽
225 | 223,101221400,六安,安徽
226 | 224,101221500,池州,安徽
227 | 225,101221600,黄山,安徽
228 | 226,101230100,福州,福建
229 | 227,101230200,厦门,福建
230 | 228,101230300,宁德,福建
231 | 229,101230400,莆田,福建
232 | 230,101230500,泉州,福建
233 | 231,101230600,漳州,福建
234 | 232,101230700,龙岩,福建
235 | 233,101230800,三明,福建
236 | 234,101230900,南平,福建
237 | 235,101240100,南昌,江西
238 | 236,101240200,九江,江西
239 | 237,101240300,上饶,江西
240 | 238,101240400,抚州,江西
241 | 239,101240500,宜春,江西
242 | 240,101240600,吉安,江西
243 | 241,101240700,赣州,江西
244 | 242,101240800,景德镇,江西
245 | 243,101240900,萍乡,江西
246 | 244,101241000,新余,江西
247 | 245,101241100,鹰潭,江西
248 | 246,101250100,长沙,湖南
249 | 247,101250200,湘潭,湖南
250 | 248,101250300,株洲,湖南
251 | 249,101250400,衡阳,湖南
252 | 250,101250500,郴州,湖南
253 | 251,101250600,常德,湖南
254 | 252,101250700,益阳,湖南
255 | 253,101250800,娄底,湖南
256 | 254,101250900,邵阳,湖南
257 | 255,101251000,岳阳,湖南
258 | 256,101251100,张家界,湖南
259 | 257,101251200,怀化,湖南
260 | 258,101251300,永州,湖南
261 | 259,101251400,湘西,湖南
262 | 260,101260100,贵阳,贵州
263 | 261,101260200,遵义,贵州
264 | 262,101260300,安顺,贵州
265 | 263,101260400,铜仁,贵州
266 | 264,101260500,毕节,贵州
267 | 265,101260600,六盘水,贵州
268 | 266,101260700,黔东南,贵州
269 | 267,101260800,黔南,贵州
270 | 268,101260900,黔西南,贵州
271 | 269,101270100,成都,四川
272 | 270,101270200,攀枝花,四川
273 | 271,101270300,自贡,四川
274 | 272,101270400,绵阳,四川
275 | 273,101270500,南充,四川
276 | 274,101270600,达州,四川
277 | 275,101270700,遂宁,四川
278 | 276,101270800,广安,四川
279 | 277,101270900,巴中,四川
280 | 278,101271000,泸州,四川
281 | 279,101271100,宜宾,四川
282 | 280,101271200,内江,四川
283 | 281,101271300,资阳,四川
284 | 282,101271400,乐山,四川
285 | 283,101271500,眉山,四川
286 | 284,101271600,雅安,四川
287 | 285,101271700,德阳,四川
288 | 286,101271800,广元,四川
289 | 287,101271900,阿坝,四川
290 | 288,101272000,凉山,四川
291 | 289,101272100,甘孜,四川
292 | 290,101280100,广州,广东
293 | 291,101280200,韶关,广东
294 | 292,101280300,惠州,广东
295 | 293,101280400,梅州,广东
296 | 294,101280500,汕头,广东
297 | 295,101280600,深圳,广东
298 | 296,101280700,珠海,广东
299 | 297,101280800,佛山,广东
300 | 298,101280900,肇庆,广东
301 | 299,101281000,湛江,广东
302 | 300,101281100,江门,广东
303 | 301,101281200,河源,广东
304 | 302,101281300,清远,广东
305 | 303,101281400,云浮,广东
306 | 304,101281500,潮州,广东
307 | 305,101281600,东莞,广东
308 | 306,101281700,中山,广东
309 | 307,101281800,阳江,广东
310 | 308,101281900,揭阳,广东
311 | 309,101282000,茂名,广东
312 | 310,101282100,汕尾,广东
313 | 311,101282200,东沙群岛,广东
314 | 312,101290100,昆明,云南
315 | 313,101290200,曲靖,云南
316 | 314,101290300,保山,云南
317 | 315,101290400,玉溪,云南
318 | 316,101290500,普洱,云南
319 | 317,101290700,昭通,云南
320 | 318,101290800,临沧,云南
321 | 319,101290900,丽江,云南
322 | 320,101291000,西双版纳,云南
323 | 321,101291100,文山,云南
324 | 322,101291200,红河,云南
325 | 323,101291300,德宏,云南
326 | 324,101291400,怒江,云南
327 | 325,101291500,迪庆,云南
328 | 326,101291600,大理,云南
329 | 327,101291700,楚雄,云南
330 | 328,101300100,南宁,广西
331 | 329,101300200,崇左,广西
332 | 330,101300300,柳州,广西
333 | 331,101300400,来宾,广西
334 | 332,101300500,桂林,广西
335 | 333,101300600,梧州,广西
336 | 334,101300700,贺州,广西
337 | 335,101300800,贵港,广西
338 | 336,101300900,玉林,广西
339 | 337,101301000,百色,广西
340 | 338,101301100,钦州,广西
341 | 339,101301200,河池,广西
342 | 340,101301300,北海,广西
343 | 341,101301400,防城港,广西
344 | 342,101310100,海口,海南
345 | 343,101310200,三亚,海南
346 | 344,101310300,三沙,海南
347 | 345,101310400,儋州,海南
348 | 346,101310500,五指山,海南
349 | 347,101310600,琼海,海南
350 | 348,101310700,文昌,海南
351 | 349,101310800,万宁,海南
352 | 350,101310900,东方,海南
353 | 351,101311000,定安,海南
354 | 352,101311100,屯昌,海南
355 | 353,101311200,澄迈,海南
356 | 354,101311300,临高,海南
357 | 355,101311400,白沙,海南
358 | 356,101311500,昌江,海南
359 | 357,101311600,乐东,海南
360 | 358,101311700,陵水,海南
361 | 359,101311800,保亭,海南
362 | 360,101311900,琼中,海南
363 | 361,101341100,台湾,台湾
364 | 362,101140100,拉萨,西藏
365 | 363,101140200,日喀则,西藏
366 | 364,101140300,昌都,西藏
367 | 365,101140400,林芝,西藏
368 | 366,101140500,山南,西藏
369 | 367,101140600,那曲,西藏
370 | 368,101140700,阿里,西藏
371 | 369,101320300,香港,香港
372 | 370,101330100,澳门,澳门
373 |
--------------------------------------------------------------------------------
/city_data/city.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 9,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pymongo\n",
12 | "import pandas as pd\n",
13 | "client = pymongo.MongoClient('localhost',27017)\n",
14 | "db = client['Graduation_project']\n",
15 | "table = db['city']\n",
16 | "city = pd.DataFrame(list(table.find()))\n",
17 | "del city['_id']"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 11,
23 | "metadata": {},
24 | "outputs": [
25 | {
26 | "data": {
27 | "text/html": [
28 | "\n",
29 | "\n",
42 | "
\n",
43 | " \n",
44 | " \n",
45 | " | \n",
46 | " code | \n",
47 | " 市 | \n",
48 | " 省 | \n",
49 | "
\n",
50 | " \n",
51 | " \n",
52 | " \n",
53 | " 0 | \n",
54 | " 101010100 | \n",
55 | " 北京 | \n",
56 | " 北京 | \n",
57 | "
\n",
58 | " \n",
59 | " 1 | \n",
60 | " 101020100 | \n",
61 | " 上海 | \n",
62 | " 上海 | \n",
63 | "
\n",
64 | " \n",
65 | " 2 | \n",
66 | " 101030100 | \n",
67 | " 天津 | \n",
68 | " 天津 | \n",
69 | "
\n",
70 | " \n",
71 | " 3 | \n",
72 | " 101040100 | \n",
73 | " 重庆 | \n",
74 | " 重庆 | \n",
75 | "
\n",
76 | " \n",
77 | " 4 | \n",
78 | " 101050100 | \n",
79 | " 哈尔滨 | \n",
80 | " 黑龙江 | \n",
81 | "
\n",
82 | " \n",
83 | "
\n",
84 | "
"
85 | ],
86 | "text/plain": [
87 | " code 市 省\n",
88 | "0 101010100 北京 北京\n",
89 | "1 101020100 上海 上海\n",
90 | "2 101030100 天津 天津\n",
91 | "3 101040100 重庆 重庆\n",
92 | "4 101050100 哈尔滨 黑龙江"
93 | ]
94 | },
95 | "execution_count": 11,
96 | "metadata": {},
97 | "output_type": "execute_result"
98 | }
99 | ],
100 | "source": [
101 | "city.head()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 13,
107 | "metadata": {
108 | "collapsed": true
109 | },
110 | "outputs": [],
111 | "source": [
112 | "city.to_csv('city.csv')"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 16,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/html": [
123 | "\n",
124 | "\n",
137 | "
\n",
138 | " \n",
139 | " \n",
140 | " | \n",
141 | " code | \n",
142 | " 省 | \n",
143 | " 市 | \n",
144 | "
\n",
145 | " \n",
146 | " \n",
147 | " \n",
148 | " 0 | \n",
149 | " 101010100 | \n",
150 | " 北京 | \n",
151 | " 北京 | \n",
152 | "
\n",
153 | " \n",
154 | " 1 | \n",
155 | " 101020100 | \n",
156 | " 上海 | \n",
157 | " 上海 | \n",
158 | "
\n",
159 | " \n",
160 | " 2 | \n",
161 | " 101030100 | \n",
162 | " 天津 | \n",
163 | " 天津 | \n",
164 | "
\n",
165 | " \n",
166 | " 3 | \n",
167 | " 101040100 | \n",
168 | " 重庆 | \n",
169 | " 重庆 | \n",
170 | "
\n",
171 | " \n",
172 | " 4 | \n",
173 | " 101050100 | \n",
174 | " 黑龙江 | \n",
175 | " 哈尔滨 | \n",
176 | "
\n",
177 | " \n",
178 | "
\n",
179 | "
"
180 | ],
181 | "text/plain": [
182 | " code 省 市\n",
183 | "0 101010100 北京 北京\n",
184 | "1 101020100 上海 上海\n",
185 | "2 101030100 天津 天津\n",
186 | "3 101040100 重庆 重庆\n",
187 | "4 101050100 黑龙江 哈尔滨"
188 | ]
189 | },
190 | "execution_count": 16,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | }
194 | ],
195 | "source": [
196 | "import pandas as pd\n",
197 | "\n",
198 | "city = pd.read_csv('city.csv')\n",
199 | "data = city[['code', '省', '市']]\n",
200 | "data.head()"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 20,
206 | "metadata": {
207 | "collapsed": true
208 | },
209 | "outputs": [],
210 | "source": [
211 | "city = data.values"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 21,
217 | "metadata": {},
218 | "outputs": [
219 | {
220 | "data": {
221 | "text/plain": [
222 | "array([[101010100, '北京', '北京'],\n",
223 | " [101020100, '上海', '上海'],\n",
224 | " [101030100, '天津', '天津'],\n",
225 | " ...,\n",
226 | " [101140700, '西藏', '阿里'],\n",
227 | " [101320300, '香港', '香港'],\n",
228 | " [101330100, '澳门', '澳门']], dtype=object)"
229 | ]
230 | },
231 | "execution_count": 21,
232 | "metadata": {},
233 | "output_type": "execute_result"
234 | }
235 | ],
236 | "source": [
237 | "city"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 23,
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "data": {
247 | "text/plain": [
248 | "'上海'"
249 | ]
250 | },
251 | "execution_count": 23,
252 | "metadata": {},
253 | "output_type": "execute_result"
254 | }
255 | ],
256 | "source": [
257 | "city[1][2]"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 36,
263 | "metadata": {
264 | "collapsed": true
265 | },
266 | "outputs": [],
267 | "source": [
268 | "# 连接到MongoDB\n",
269 | "MONGO_URL = 'localhost'\n",
270 | "MONGO_DB = 'Graduation_project'\n",
271 | "MONGO_COLLECTION = 'jobs_info'\n",
272 | "client = pymongo.MongoClient(MONGO_URL, port=27017)\n",
273 | "db = client[MONGO_DB]\n",
274 | "\n",
275 | "\n",
276 | "# 检查是否已爬过\n",
277 | "check = pd.DataFrame(list(db[MONGO_COLLECTION].find()))\n",
278 | "check_list = check[['signal']]\n",
279 | "grouped = check_list.groupby(check['signal'])"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 44,
285 | "metadata": {},
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/plain": [
290 | "True"
291 | ]
292 | },
293 | "execution_count": 44,
294 | "metadata": {},
295 | "output_type": "execute_result"
296 | }
297 | ],
298 | "source": [
299 | "'1' in grouped.size().index"
300 | ]
301 | }
302 | ],
303 | "metadata": {
304 | "kernelspec": {
305 | "display_name": "Python 3",
306 | "language": "python",
307 | "name": "python3"
308 | },
309 | "language_info": {
310 | "codemirror_mode": {
311 | "name": "ipython",
312 | "version": 3
313 | },
314 | "file_extension": ".py",
315 | "mimetype": "text/x-python",
316 | "name": "python",
317 | "nbconvert_exporter": "python",
318 | "pygments_lexer": "ipython3",
319 | "version": "3.6.3"
320 | }
321 | },
322 | "nbformat": 4,
323 | "nbformat_minor": 2
324 | }
325 |
--------------------------------------------------------------------------------
/city_data/city.py:
--------------------------------------------------------------------------------
1 | # version:1.0
2 | # author:brandon
3 | # date:2018/10/20
4 |
5 | # common imports
6 | import pandas as pd
7 |
8 |
def city(path='city.csv'):
    """Load the city lookup table as an array of rows.

    Args:
        path: CSV file to read. Defaults to ``'city.csv'``, which is
            resolved against the current working directory, matching the
            original hard-coded behaviour.

    Returns:
        The ``.values`` array of the selected columns; each row is
        ``[code, 省, 市]`` (city code, province, city), in that order.
    """
    city_data = pd.read_csv(path)
    # Select the columns explicitly so callers can rely on the positional
    # order regardless of the column order in the CSV file.
    return city_data[['code', '省', '市']].values
--------------------------------------------------------------------------------
/pics/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/1.png
--------------------------------------------------------------------------------
/pics/10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/10.png
--------------------------------------------------------------------------------
/pics/11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/11.png
--------------------------------------------------------------------------------
/pics/12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/12.png
--------------------------------------------------------------------------------
/pics/13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/13.png
--------------------------------------------------------------------------------
/pics/14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/14.png
--------------------------------------------------------------------------------
/pics/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/2.png
--------------------------------------------------------------------------------
/pics/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/3.png
--------------------------------------------------------------------------------
/pics/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/4.png
--------------------------------------------------------------------------------
/pics/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/5.png
--------------------------------------------------------------------------------
/pics/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/6.png
--------------------------------------------------------------------------------
/pics/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/7.png
--------------------------------------------------------------------------------
/pics/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/8.png
--------------------------------------------------------------------------------
/pics/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/9.png
--------------------------------------------------------------------------------
/pics/ML部分/为标签编码.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/为标签编码.png
--------------------------------------------------------------------------------
/pics/ML部分/决策树训练集误差.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/决策树训练集误差.png
--------------------------------------------------------------------------------
/pics/ML部分/划分训练集与测试集.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/划分训练集与测试集.png
--------------------------------------------------------------------------------
/pics/ML部分/变量重要性.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/变量重要性.png
--------------------------------------------------------------------------------
/pics/ML部分/属性合并.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/属性合并.png
--------------------------------------------------------------------------------
/pics/ML部分/筛选岗位数量前150.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/筛选岗位数量前150.png
--------------------------------------------------------------------------------
/pics/ML部分/编码分类结果.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/编码分类结果.png
--------------------------------------------------------------------------------
/pics/ML部分/过滤后的分布.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/过滤后的分布.png
--------------------------------------------------------------------------------
/pics/ML部分/过滤薪资.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/过滤薪资.png
--------------------------------------------------------------------------------
/pics/ML部分/重要性分析.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/重要性分析.png
--------------------------------------------------------------------------------
/pics/ML部分/随机森林训练集误差.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/随机森林训练集误差.png
--------------------------------------------------------------------------------
/pics/url.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/url.png
--------------------------------------------------------------------------------
/spiders/citylist_spider.py:
--------------------------------------------------------------------------------
1 | # version:1.0
2 | # author:brandon
3 | # date:2018/10/20
4 |
5 | # common imports
6 | import requests
7 | import pymongo
8 | # -------------
9 |
10 |
# Page fetch function
def get(timeout=10):
    """Download the city list JSON from zhipin.com and return it decoded.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.12 Safari/537.36 '
    }
    url = 'https://www.zhipin.com/common/data/city.json'
    response = requests.get(url, headers=header, timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque JSON decode error
    # when the server answers with an error page.
    response.raise_for_status()
    return response.json()
20 |
21 |
def parse(data):
    """Walk the city JSON payload and store one record per city.

    Provinces are read from ``data['data']['cityList']`` and each
    province's cities from its ``subLevelModelList``. Every city is
    printed and handed to ``save_to_mongo`` as a dict with the keys
    '省' (province name), '市' (city name) and 'code' (city code).
    """
    for province in data['data']['cityList']:
        province_name = province['name']
        for entry in province['subLevelModelList']:
            code, name = entry['code'], entry['name']
            record = {'省': province_name, '市': name, 'code': code}
            print(record)
            save_to_mongo(record)
37 |
38 |
# Connect to MongoDB: a local server on port 27017, database
# 'Graduation_project'; city records are stored in the 'city' collection.
MONGO_URL = 'localhost'
MONGO_DB = 'Graduation_project'
MONGO_COLLECTION = 'city'
client = pymongo.MongoClient(MONGO_URL, port=27017)
db = client[MONGO_DB]
45 |
46 |
def save_to_mongo(data):
    """Insert one document into the configured MongoDB collection.

    Storage is best-effort: any failure is reported on stdout (including
    the reason) and then suppressed so the crawl can continue.

    Args:
        data: The document (dict) to store.
    """
    try:
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        db[MONGO_COLLECTION].insert_one(data)
    except Exception as exc:
        print('存储到 MongoDB 失败', exc)
    else:
        print('存储到 MongoDB 成功')
54 |
55 |
if __name__ == '__main__':
    # Fetch the city list JSON, then parse it and store every city.
    city = get()
    parse(city)
59 |
--------------------------------------------------------------------------------
/spiders/jobs_spider.py:
--------------------------------------------------------------------------------
1 | # version:1.0
2 | # author:brandon
3 | # date:2018/10/20
4 |
5 | # common imports
6 | import requests
7 | import re
8 | from lxml import etree
9 | import time
10 | import random
11 | import pymongo
12 | import pandas as pd
13 | # -------------
14 | # 导入模块
15 | import city
16 |
17 |
# Page fetch function
def get_page(page, city_code, timeout=10):
    """Download one page of job listings for a city and return its HTML.

    Args:
        page: Page number, inserted into the ``page`` and ``ka`` query
            parameters of the URL.
        city_code: City code inserted after ``c`` in the URL path.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.12 Safari/537.36 '
    }
    print('正在爬取第', page, '页')
    url = 'https://www.zhipin.com/c{code}-p100511/?page={page}&ka=page-{page}'.format(code=city_code, page=page)
    response = requests.get(url, headers=header, timeout=timeout)
    # Surface HTTP errors (e.g. a blocked request) instead of handing an
    # error page to the parser as if it were a listing page.
    response.raise_for_status()
    return response.text
28 | # --------------
29 |
30 |
def _first_text(node, path):
    """Return the first result of XPath *path* under *node*, or None."""
    matches = node.xpath(path)
    return matches[0] if matches else None


# Page parsing function
def parse(html, city, provence, page):
    """Extract every job listing from one result page and store it.

    Each ``<li>`` under the listing container yields one document that is
    printed and passed to ``save_to_mongo``. A listing that lacks any of
    the expected fields is skipped on its own, so one malformed entry does
    not discard the rest of the page.

    Args:
        html: HTML text of the result page.
        city: City name; stored under '城市' and used in the signal.
        provence: Province name; stored under '省'.
        page: Page number; combined with ``city`` into the 'signal' field
            that marks this page as already crawled.
    """
    data = etree.HTML(html)
    items = data.xpath('//*[@id="main"]/div/div[2]/ul/li')
    # The signal depends only on the page, so build it once.
    signal = city + str(page)
    for item in items:
        job_title = _first_text(item, './div/div[1]/h3/a/div[1]/text()')
        job_salary = _first_text(item, './div/div[1]/h3/a/span/text()')
        job_company = _first_text(item, './div/div[2]/div/h3/a/text()')
        job_experience = _first_text(item, './div/div[1]/p/text()[2]')
        job_degree = _first_text(item, './div/div[1]/p/text()[3]')
        company_scale = _first_text(item, './div/div[2]/div/p/text()[3]')
        fields = (job_title, job_salary, job_company, job_experience,
                  job_degree, company_scale)
        if any(value is None for value in fields):
            # Previously an IndexError here aborted the whole page after
            # some of its rows had already been stored under `signal`.
            print('跳过字段不完整的职位')
            continue
        # Mean of the salary range; only printed, not stored.
        avg_salary = average(job_salary)
        print(provence, '|', city, '|', job_title, '|', job_salary, '|', job_company, '|', job_experience, '|', job_degree, '|', company_scale,
              '|', avg_salary)
        job = {
            'signal': signal,
            '省': provence,
            '城市': city,
            '职位名称': job_title,
            '职位薪资': job_salary,
            '公司名称': job_company,
            '工作经验': job_experience,
            '学历要求': job_degree,
            '公司规模': company_scale
        }
        save_to_mongo(job)
64 | # ---------------------------------------
65 |
66 |
# Salary mean function
def average(job_salary):
    """Return the midpoint of the salary range found in *job_salary*.

    The first two integers in the text are taken as the lower and upper
    bound (``'10k-20k'`` -> ``15.0``). Any further numbers, such as the
    ``13`` in ``'15-25K·13薪'``, are ignored. A single number is returned
    as-is, and text without digits yields ``0.0``.

    Args:
        job_salary: Salary text scraped from the listing.

    Returns:
        The mean of the range bounds as a float, or ``0`` when the input
        is not a string.
    """
    try:
        # Raw string avoids the invalid-escape warning for '\d'.
        bounds = [int(number) for number in re.findall(r'\d+', job_salary)[:2]]
    except TypeError:
        # Non-string input (e.g. None): keep the original fallback of 0.
        return 0
    if not bounds:
        return 0.0
    # Divide by the number of bounds actually found rather than a fixed 2.
    return sum(bounds) / len(bounds)
84 |
85 |
# Connect to MongoDB: a local server on port 27017, database
# 'Graduation_project'; job records are stored in the 'jobs_info' collection.
MONGO_URL = 'localhost'
MONGO_DB = 'Graduation_project'
MONGO_COLLECTION = 'jobs_info'
client = pymongo.MongoClient(MONGO_URL, port=27017)
db = client[MONGO_DB]
92 |
93 |
# Determine which pages were already crawled: load the stored documents and
# group them by their 'signal' field (city name + page number).
check = pd.DataFrame(list(db[MONGO_COLLECTION].find()))
# On the first run the collection is empty and the frame has no columns;
# add an empty 'signal' column so the selection below does not raise
# KeyError and `grouped` simply contains no signals.
if 'signal' not in check.columns:
    check['signal'] = pd.Series(dtype=object)
check_list = check[['signal']]
grouped = check_list.groupby(check['signal'])
# -----------------
99 |
100 |
def save_to_mongo(data):
    """Insert one document into the configured MongoDB collection.

    Storage is best-effort: any failure is reported on stdout (including
    the reason) and then suppressed so the crawl can continue.

    Args:
        data: The document (dict) to store.
    """
    try:
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        db[MONGO_COLLECTION].insert_one(data)
    except Exception as exc:
        print('存储到 MongoDB 失败', exc)
    else:
        print('存储到 MongoDB 成功')
108 |
109 |
def jobspider(city_code, city, provence):
    """Crawl up to ``MAX_PAGE`` listing pages for one city.

    Pages whose signal (city name + page number) is already present in the
    database snapshot ``grouped`` are skipped. The first page that fails
    to download or parse ends the crawl for this city; the reason is
    printed before giving up.

    Args:
        city_code: City code used to build the listing URL.
        city: City name, used for the signal and stored with each job.
        provence: Province name stored with each job.
    """
    # Maximum number of pages to crawl per city
    MAX_PAGE = 30
    # The set of crawled signals does not change inside the loop, so
    # compute it once instead of re-aggregating on every page.
    crawled = set(grouped.size().index)
    for i in range(1, MAX_PAGE + 1):
        job_signal = city + str(i)
        if job_signal in crawled:
            continue
        try:
            html = get_page(i, city_code)
            # ------------ parse the data ---------------
            parse(html, city, provence, i)
            print('-' * 100)
            # Random pause between requests.
            time.sleep(random.randint(0, 3))
        except Exception as exc:
            # Report why the city was abandoned instead of stopping silently.
            print('爬取', city, '第', i, '页失败:', exc)
            break
127 |
128 |
if __name__ == '__main__':
    # Each row of the city table is (city code, province, city name).
    # Unpack into distinct names so the imported `city` module is not
    # rebound by the loop variable.
    for city_code, provence, city_name in city.city():
        # Crawl the job listings for this city
        jobspider(city_code, city_name, provence)
138 | # -----------------
139 |
--------------------------------------------------------------------------------