├── NLP-analysis
├── Data-Science-NLP_11-19.ipynb
├── data
│ ├── business_analysis.csv
│ ├── business_analysis_with_skills.csv
│ ├── data_analysis.csv
│ ├── data_analysis_with_skills.csv
│ ├── data_mining.csv
│ ├── data_mining_with_skills.csv
│ ├── machine_learning.csv
│ └── machine_learning_with_skills.csv
├── dict
│ ├── Stopword.txt
│ ├── userdict.txt
│ ├── 硬技能.txt
│ └── 软技能.txt
└── images
│ ├── 各区职位数.png
│ ├── 数据科学求职技能.png
│ ├── 数据科学求职硬技能.png
│ └── 数据科学求职软技能.png
├── README.md
├── analysis&forecast
└── Data-Science-Analysis.ipynb
├── data_to_be_analysed
├── business_analysis_with_skills.csv
├── data_analysis_with_skills.csv
├── data_mining_with_skills.csv
└── machine_learning_with_skills.csv
├── output_13_0.png
└── spider
└── shanghai_jobs_discovery.py
/NLP-analysis/dict/Stopword.txt:
--------------------------------------------------------------------------------
1 |
2 | 线
3 | 分
4 | 数据
5 | 分析
6 | 数学
7 | 型
8 | 模型
9 | 率
10 | 度
11 | 统计
12 | 能
13 | 学习
14 | 用
15 | 人
16 | 性
17 | 结构
18 | 采集
19 | 库
20 | 数
21 | 网
22 | 据
23 | 网络
24 | 画像
25 | 机器
26 | 语
27 | 程
28 | 机
29 | 清
30 | 理学
31 | 式
32 | 建
33 | 代数
34 | 概率
35 | 工程
36 | 集
37 | 字
38 | 概率
39 | 深
40 | 营
41 | 运
42 | 学
43 | 仓
44 | 工
45 | 感
46 | 合
47 | 析
48 | 拟
49 | 爬
50 | 代
51 | 户
52 | 掘
53 | 挖掘
54 | 用户
55 | 建模
56 | 逻辑
57 | 敏感
58 | 思维
59 | 深度
60 | 清洗
61 | 测试
62 | 模式
63 | -
64 | 识别
65 | A
66 | B
67 | D
68 | E
69 | F
70 | G
71 | H
72 | I
73 | J
74 | K
75 | L
76 | M
77 | N
78 | O
79 | P
80 | Q
81 | R
82 | S
83 | T
84 | U
85 | V
86 | W
87 | X
88 | Y
89 | Z
90 | Pos
91 | Sa
92 | 1
93 | 2
94 | 3
95 | 4
96 | 5
97 | 6
98 | 7
99 | 8
100 | 9
101 | 0
102 | /
103 | 。
104 | ,
105 | 、
106 | .
107 | ,
108 | /
109 | \
110 | '
111 | "
112 | ;
113 | `
114 | (
115 | )
116 | (
117 | )
118 | ?
119 | ?
120 |
--------------------------------------------------------------------------------
/NLP-analysis/dict/userdict.txt:
--------------------------------------------------------------------------------
1 | 数据结构
2 | 数据库
3 | 数据分析
4 | 数据挖掘
5 | 建模
6 | 商务智能
7 | 商业
8 | 拟合
9 | 开源
10 | 统计
11 | 数学
12 | 决策
13 | 线性代数
14 | 数据报表
15 | 定量分析
16 | 沟通
17 | 业务
18 | 管理学
19 | 英语
20 | 数字
21 | 测试
22 | 机器学习
23 | 人工智能
24 | 深度学习
25 | 爬虫
26 | 概率统计
27 | 决策支持
28 | 概率论
29 | 网络工程
30 | 算法
31 | 经济学
32 | 报表
33 | 报告
34 | 评估
35 | 评价
36 | 咨询
37 | 逻辑思维
38 | 数字敏感
39 | 数据采集
40 | 数据清洗
41 | 模式识别
42 | 用户画像
43 | 用户行为
44 | 数据运营
45 | 数据仓库
46 | 数据模型
--------------------------------------------------------------------------------
/NLP-analysis/dict/硬技能.txt:
--------------------------------------------------------------------------------
1 | Python
2 | C
3 | Java
4 | R
5 | Ppt
6 | Excel
7 | Spss
8 | Word
9 | Powerpoint
10 | Tensorflow
11 | Scikit
12 | Sql
13 | Shell
14 | Hadoop
15 | Stack
16 | Git
17 | Github
18 | Tableau
19 | Linux
20 | Unix
21 | Sas
22 | Matlab
23 | Oracle
24 | Hive
25 | Access
26 | Spark
27 | Powerbi
28 | Mongodb
29 | Redis
30 | PostgreSql
--------------------------------------------------------------------------------
/NLP-analysis/dict/软技能.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/dict/软技能.txt
--------------------------------------------------------------------------------
/NLP-analysis/images/各区职位数.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/images/各区职位数.png
--------------------------------------------------------------------------------
/NLP-analysis/images/数据科学求职技能.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/images/数据科学求职技能.png
--------------------------------------------------------------------------------
/NLP-analysis/images/数据科学求职硬技能.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/images/数据科学求职硬技能.png
--------------------------------------------------------------------------------
/NLP-analysis/images/数据科学求职软技能.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/images/数据科学求职软技能.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ```python
4 | import warnings
5 | warnings.filterwarnings("ignore")
6 | ```
7 |
8 | # Read Data
9 |
10 |
11 | ```python
12 | import pandas as pd
13 | from pandas import Series
14 | ```
15 |
16 |
17 | ```python
18 | data_analysis = pd.read_csv('./data_to_be_analysed/data_analysis_with_skills.csv')
19 | data_mining = pd.read_csv('./data_to_be_analysed/data_mining_with_skills.csv')
20 | machine_learning = pd.read_csv('./data_to_be_analysed/machine_learning_with_skills.csv')
21 | business_analysis = pd.read_csv('./data_to_be_analysed/business_analysis_with_skills.csv')
22 | ```
23 |
24 | ---
25 |
26 | ## 添加薪资均值
27 |
28 |
29 | ```python
30 | import re
31 | # 均值函数
32 | def average(job_salary):
33 | # 取薪资均值----------------
 34 |     pattern = re.compile(r'\d+')
35 | salary = job_salary
36 | try:
37 | res = re.findall(pattern, salary)
38 | avg_salary = 0
39 | sum = 0
40 | for i in res:
41 | a = int(i)
42 | sum = sum + a
43 | avg_salary = sum / 2
44 | except Exception:
45 | avg_salary = 0
46 | # 函数返回值
47 | return avg_salary
48 |
49 | salary_list = []
50 | for i in range(0,data_analysis.shape[0]):
51 | avg_sal = average(data_analysis['职位薪资'][i])
52 | salary_list.append(avg_sal)
53 | sal = Series(salary_list)
54 |
55 | data_analysis.insert(9,'salary',sal)
56 | ```
57 |
58 |
59 | ```python
60 | salary_list = []
61 | for i in range(0,data_mining.shape[0]):
62 | avg_sal = average(data_mining['职位薪资'][i])
63 | salary_list.append(avg_sal)
64 | sal = Series(salary_list)
65 |
66 | data_mining.insert(9,'salary',sal)
67 | ```
68 |
69 |
70 | ```python
71 | salary_list = []
72 | for i in range(0,machine_learning.shape[0]):
73 | avg_sal = average(machine_learning['职位薪资'][i])
74 | salary_list.append(avg_sal)
75 | sal = Series(salary_list)
76 |
77 | machine_learning.insert(9,'salary',sal)
78 | ```
79 |
80 |
81 | ```python
82 | salary_list = []
83 | for i in range(0,business_analysis.shape[0]):
84 | avg_sal = average(business_analysis['职位薪资'][i])
85 | salary_list.append(avg_sal)
86 | sal = Series(salary_list)
87 |
88 | business_analysis.insert(9,'salary',sal)
89 | ```
90 |
91 | ---
92 |
93 | # 薪资分布探索
94 |
95 |
96 | ```python
97 | data_analysis.salary.describe()
98 | ```
99 |
100 |
101 |
102 |
103 | count 575.000000
104 | mean 17.446957
105 | std 8.261053
106 | min 2.500000
107 | 25% 11.500000
108 | 50% 15.000000
109 | 75% 22.500000
110 | max 47.500000
111 | Name: salary, dtype: float64
112 |
113 |
114 |
115 |
116 | ```python
117 | %matplotlib inline
118 | import matplotlib.pyplot as plt
119 | data_analysis.salary.hist(bins=50, figsize=(8,5))
120 |
121 | plt.show()
122 | ```
123 |
124 |
125 | 
126 |
127 |
128 | - 薪资主要分布在**5k-30k**之间
129 |
130 | ---
131 |
132 |
133 | ```python
134 | data_analysis = data_analysis.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)
135 | ```
136 |
137 |
138 | ```python
139 | data_mining = data_mining.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)
140 | ```
141 |
142 |
143 | ```python
144 | machine_learning = machine_learning.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)
145 | ```
146 |
147 |
148 | ```python
149 | business_analysis = business_analysis.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)
150 | ```
151 |
152 | ---
153 |
154 | ## 掌握的软件技能对薪资的影响关系
155 |
156 |
157 | ```python
158 | corr_matrix = data_analysis.corr()
159 | corr_matrix["salary"].sort_values(ascending=False)
160 | ```
161 |
162 |
163 |
164 |
165 | salary 1.000000
166 | Hive 0.292764
167 | Hadoop 0.254448
168 | Python 0.242217
169 | Sql 0.235256
170 | Spark 0.216505
171 | Sas 0.119493
172 | Tableau 0.044519
173 | Spss 0.024708
174 | Ppt -0.042691
175 | Excel -0.140370
176 | Name: salary, dtype: float64
177 |
178 |
179 |
180 | - **Data Analysis**的职位中,`Hive`,`Spark`,`Hadoop`大数据应用方面的软件是**薪资的加分项**。
181 | - 同时,`Python`,`SQL`,`SAS`,`Tableau`,`SPSS`等统计分析软件与可视化软件也是数据分析师**区别于低薪分析专员**的因素。
182 | - `PPT`,`Excel`作为必须的软件技能,对薪资变化**并没有太大的影响**,甚至仅仅会Excel的职位沦落为专员,会是一个减分项。
183 | - 结论:在数据分析领域,拥有**大数据软件技能**并且懂得**Python**这一编程语言的分析师的待遇较好。
184 |
185 |
186 | ```python
187 | corr_matrix = data_mining.corr()
188 | corr_matrix["salary"].sort_values(ascending=False)
189 | ```
190 |
191 |
192 |
193 |
194 | salary 1.000000
195 | Hive 0.195964
196 | Spark 0.180962
197 | Java 0.180336
198 | Hadoop 0.136721
199 | C 0.127518
200 | Python 0.067957
201 | Shell 0.020722
202 | Linux -0.009015
203 | Sql -0.052715
204 | Sas -0.089915
205 | Name: salary, dtype: float64
206 |
207 |
208 |
209 | - **Data Mining**的职位中,`Hive`,`Spark`,`Hadoop`大数据方面的软件是薪资**极大的加分项**。
210 | - `Java`,`C`,`Python`等编程语言对数据挖掘的工作有很大帮助因此也体现在了对薪资的**正面影响**上。
211 | - 分析结论:具备**数据挖掘算法与编码能力**且具备**大数据方面分析技能**的数据挖掘工程师的待遇较好。
212 |
213 |
214 | ```python
215 | corr_matrix = machine_learning.corr()
216 | corr_matrix["salary"].sort_values(ascending=False)
217 | ```
218 |
219 |
220 |
221 |
222 | salary 1.000000
223 | Hive 0.095518
224 | Spark 0.093537
225 | Java 0.064341
226 | Tensorflow 0.059785
227 | Hadoop 0.057670
228 | Sql -0.035192
229 | Linux -0.036466
230 | C -0.052703
231 | Matlab -0.058808
232 | Python -0.104268
233 | Name: salary, dtype: float64
234 |
235 |
236 |
237 | - **Machine Learning**的职位中,没有特别突出的技能加分项,列表中的软件技能基本都是入职必备的技能。
238 | - `Hive`,`Spark`,`Hadoop`等大数据方面的技能会对薪资有一定程度的提升,不过影响较小。
239 | - 分析结论:机器学习工程师入门难度稍高,需要掌握具备的软件技能也较多,没有特别突出的薪资加分项。
240 |
241 |
242 | ```python
243 | corr_matrix = business_analysis.corr()
244 | corr_matrix["salary"].sort_values(ascending=False)
245 | ```
246 |
247 |
248 |
249 |
250 | salary 1.000000
251 | C 0.399615
252 | Python 0.377288
253 | Linux 0.255181
254 | Java 0.237707
255 | Hive 0.153111
256 | Sql 0.115494
257 | Sas 0.085293
258 | Excel 0.059614
259 | Ppt -0.057346
260 | Spss -0.068219
261 | Name: salary, dtype: float64
262 |
263 |
264 |
265 | - **Business Analysis**的职位中,编程语言是**极大的薪资加分项**。如`C`,`Python`,`Java`。
266 | - `Excel`,`PPT`,`SPSS`等软件是这个职位的**必备技能**,因此对职位薪资没有太大的影响。
267 | - 结论:在商业分析领域,拥有**商业分析思维**并且具有**编程能力**的分析师的待遇较好。
268 |
269 | ---
270 |
271 | ---
272 |
273 | # 准备数据
274 |
275 |
276 | ```python
277 | from sklearn.model_selection import train_test_split
278 |
279 | train_set, test_set = train_test_split(data_analysis, test_size=0.2, random_state=52)
280 | ```
281 |
282 |
283 | ```python
284 | data_train = train_set.copy()
285 | data_test = test_set.copy()
286 | ```
287 |
288 | ```python
289 | from sklearn.pipeline import Pipeline
290 | from sklearn.preprocessing import StandardScaler
291 | from sklearn.impute import SimpleImputer  # `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22
292 | from sklearn.compose import ColumnTransformer
293 | from sklearn.preprocessing import OneHotEncoder
294 | ```
295 |
296 |
297 | ```python
298 | data_analysis_num = data_analysis.drop(['公司名称','公司规模','地区','学历要求','工作经验','职位名称','融资情况','salary'], axis=1)
299 | num_attribs = list(data_analysis_num)
300 | cat_attribs = ['公司规模','学历要求','工作经验','融资情况']
301 |
302 | num_pipeline = Pipeline([
303 | ('std_scaler', StandardScaler()),
304 | ])
305 |
306 | full_pipeline = ColumnTransformer([
307 | ("num", num_pipeline, num_attribs),
308 | ("cat", OneHotEncoder(), cat_attribs),
309 | ])
310 |
311 | data_analysis_prepared = full_pipeline.fit_transform(data_train)
312 | data_analysis_test = full_pipeline.transform(data_test)
313 | ```
314 |
315 |
316 | ```python
317 | data_analysis_prepared[:1]
318 | ```
319 |
320 |
321 |
322 |
323 | array([[-1.04902651, -0.99566158, -0.6853091 , -0.58404441, -0.5 ,
324 | -0.4307749 , -0.38729833, 2.68594224, -0.37608404, -0.27343437,
325 | 0. , 0. , 1. , 0. , 0. ,
326 | 0. , 0. , 0. , 1. , 0. ,
327 | 0. , 0. , 0. , 0. , 1. ,
328 | 0. , 0. , 0. , 0. , 0. ,
329 | 0. , 0. , 0. , 0. , 0. ,
330 | 0. , 0. , 0. , 0. , 1. ,
331 | 0. ]])
332 |
333 |
334 |
335 |
336 | ```python
337 | data_analysis_labels = data_train.salary.values
338 | test_labels = data_test.salary.values
339 | ```
340 |
341 | ---
342 |
343 | # 训练模型
344 |
345 | ## Linear
346 |
347 |
348 | ```python
349 | from sklearn.linear_model import LinearRegression
350 |
351 | lin_reg = LinearRegression()
352 | lin_reg.fit(data_analysis_prepared, data_analysis_labels)
353 | ```
354 |
355 |
356 |
357 |
358 | LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
359 | normalize=False)
360 |
361 |
362 |
363 |
364 | ```python
365 | from sklearn.metrics import mean_squared_error
366 | import numpy as np
367 |
368 | salary_predictions = lin_reg.predict(data_analysis_prepared)
369 | lin_mse = mean_squared_error(data_analysis_labels, salary_predictions)
370 | lin_rmse = np.sqrt(lin_mse)
371 | lin_rmse
372 | ```
373 |
374 |
375 |
376 |
377 | 5.402995127278521
378 |
379 |
380 |
381 |
382 | ```python
383 | #salary_predictions[:10]
384 | ```
385 |
386 | ### 测试集
387 |
388 |
389 | ```python
390 | data_test.head(10)
391 | ```
392 |
393 |
394 |
395 |
396 |
397 |
410 |
411 |
412 |
413 | |
414 | 公司名称 |
415 | 公司规模 |
416 | 地区 |
417 | 学历要求 |
418 | 工作经验 |
419 | 职位名称 |
420 | salary |
421 | 融资情况 |
422 | Sql |
423 | Python |
424 | Excel |
425 | Sas |
426 | Spss |
427 | Hive |
428 | Hadoop |
429 | Tableau |
430 | Ppt |
431 | Spark |
432 |
433 |
434 |
435 |
436 | 198 |
437 | 上海克顿文化传媒 |
438 | 100-499人 |
439 | 上海 黄浦区 新天地 |
440 | 本科 |
441 | 3-5年 |
442 | 数据分析师 |
443 | 11.5 |
444 | 已上市 |
445 | 0 |
446 | 0 |
447 | 0 |
448 | 0 |
449 | 1 |
450 | 0 |
451 | 0 |
452 | 0 |
453 | 1 |
454 | 0 |
455 |
456 |
457 | 316 |
458 | 今日头条 |
459 | 10000人以上 |
460 | 上海 徐汇区 上海南站 |
461 | 本科 |
462 | 1-3年 |
463 | 广告数据分析师 |
464 | 11.5 |
465 | D轮及以上 |
466 | 1 |
467 | 0 |
468 | 1 |
469 | 1 |
470 | 1 |
471 | 0 |
472 | 0 |
473 | 0 |
474 | 0 |
475 | 0 |
476 |
477 |
478 | 52 |
479 | 壹米滴答 |
480 | 10000人以上 |
481 | 上海 青浦区 徐泾 |
482 | 本科 |
483 | 1-3年 |
484 | 资深BI数据分析师 |
485 | 16.0 |
486 | C轮 |
487 | 1 |
488 | 0 |
489 | 0 |
490 | 0 |
491 | 0 |
492 | 0 |
493 | 1 |
494 | 0 |
495 | 0 |
496 | 1 |
497 |
498 |
499 | 313 |
500 | 拼多多 |
501 | 1000-9999人 |
502 | 上海 长宁区 天山路 |
503 | 本科 |
504 | 经验不限 |
505 | 数据分析师 |
506 | 22.5 |
507 | 已上市 |
508 | 1 |
509 | 1 |
510 | 0 |
511 | 0 |
512 | 0 |
513 | 1 |
514 | 0 |
515 | 0 |
516 | 0 |
517 | 0 |
518 |
519 |
520 | 19 |
521 | 太平洋房屋 |
522 | 10000人以上 |
523 | 上海 徐汇区 枫林路 |
524 | 本科 |
525 | 1-3年 |
526 | 数据分析师 |
527 | 12.0 |
528 | 已上市 |
529 | 1 |
530 | 1 |
531 | 1 |
532 | 0 |
533 | 0 |
534 | 0 |
535 | 0 |
536 | 1 |
537 | 0 |
538 | 0 |
539 |
540 |
541 | 482 |
542 | 印鹰 |
543 | 100-499人 |
544 | 上海 静安区 汶水路 |
545 | 本科 |
546 | 3-5年 |
547 | 数据分析经理 |
548 | 20.0 |
549 | B轮 |
550 | 1 |
551 | 0 |
552 | 0 |
553 | 0 |
554 | 0 |
555 | 0 |
556 | 1 |
557 | 0 |
558 | 0 |
559 | 0 |
560 |
561 |
562 | 212 |
563 | 复硕正态 |
564 | 20-99人 |
565 | 上海 静安区 大宁路 |
566 | 本科 |
567 | 1-3年 |
568 | 高级数据分析员 |
569 | 10.0 |
570 | 不需要融资 |
571 | 0 |
572 | 0 |
573 | 0 |
574 | 0 |
575 | 1 |
576 | 0 |
577 | 0 |
578 | 0 |
579 | 1 |
580 | 0 |
581 |
582 |
583 | 127 |
584 | 云序生物 |
585 | 20-99人 |
586 | 上海 松江区 新桥 |
587 | 大专 |
588 | 1-3年 |
589 | 生信分析师/数据分析 |
590 | 8.0 |
591 | 不需要融资 |
592 | 0 |
593 | 1 |
594 | 0 |
595 | 0 |
596 | 0 |
597 | 0 |
598 | 0 |
599 | 0 |
600 | 0 |
601 | 0 |
602 |
603 |
604 | 401 |
605 | 上海创蓝文化传播 |
606 | 500-999人 |
607 | 上海 松江区 松江工业区 |
608 | 本科 |
609 | 1-3年 |
610 | 数据分析师 |
611 | 20.0 |
612 | A轮 |
613 | 1 |
614 | 1 |
615 | 1 |
616 | 0 |
617 | 0 |
618 | 0 |
619 | 0 |
620 | 1 |
621 | 1 |
622 | 0 |
623 |
624 |
625 | 260 |
626 | 上海智帛 |
627 | 20-99人 |
628 | 上海 闵行区 漕宝路 |
629 | 大专 |
630 | 1-3年 |
631 | 数据分析 |
632 | 10.0 |
633 | 未融资 |
634 | 1 |
635 | 0 |
636 | 1 |
637 | 0 |
638 | 0 |
639 | 0 |
640 | 0 |
641 | 0 |
642 | 0 |
643 | 0 |
644 |
645 |
646 |
647 |
648 |
649 |
650 |
651 |
652 | ```python
653 | y_test = lin_reg.predict(data_analysis_test)
654 | y_test[:10]
655 | ```
656 |
657 |
658 |
659 |
660 | array([17.01653719, 16.41342787, 21.16768836, 19.22802331, 13.22095168,
661 | 22.22075171, 11.02715534, 7.95300838, 13.12913168, 4.30171607])
662 |
663 |
664 |
665 |
666 | ```python
667 | test_labels[:10]
668 | ```
669 |
670 |
671 |
672 |
673 | array([11.5, 11.5, 16. , 22.5, 12. , 20. , 10. , 8. , 20. , 10. ])
674 |
675 |
676 |
677 |
678 | ```python
679 | lin_mse = mean_squared_error(test_labels, y_test)
680 | lin_rmse = np.sqrt(lin_mse)
681 | lin_rmse
682 | ```
683 |
684 |
685 |
686 |
687 | 6.7698143548675915
688 |
689 |
690 |
691 | - 测试集上误差约为**6.77**
692 |
693 | ### 交叉验证
694 |
695 |
696 | ```python
697 | from sklearn.model_selection import cross_val_score
698 |
699 | scores = cross_val_score(lin_reg, data_analysis_prepared, data_analysis_labels,
700 | scoring="neg_mean_squared_error", cv=10)
701 | lin_rmse_scores = np.sqrt(-scores)
702 | ```
703 |
704 |
705 | ```python
706 | display_scores(lin_rmse_scores)
707 | ```
708 |
709 | Scores: [5.81120933 5.92292919 6.50970607 6.24610706 5.18158564 6.27624993
710 | 7.37315509 6.07787995 5.67585695 4.86847943]
711 | Mean: 5.994315863710689
712 | Standard deviation: 0.6621760708217165
713 |
714 |
715 | ---
716 |
717 | ## D-Tree
718 |
719 | ### 建模训练
720 |
721 |
722 | ```python
723 | from sklearn.tree import DecisionTreeRegressor
724 |
725 | tree_reg = DecisionTreeRegressor(random_state=52)
726 | tree_reg.fit(data_analysis_prepared, data_analysis_labels)
727 | ```
728 |
729 |
730 |
731 |
732 | DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
733 | max_leaf_nodes=None, min_impurity_decrease=0.0,
734 | min_impurity_split=None, min_samples_leaf=1,
735 | min_samples_split=2, min_weight_fraction_leaf=0.0,
736 | presort=False, random_state=52, splitter='best')
737 |
738 |
739 |
740 |
741 | ```python
742 | y_pred_tree = tree_reg.predict(data_analysis_prepared)
743 | ```
744 |
745 |
746 | ```python
747 | from sklearn.metrics import mean_squared_error
748 |
749 | tree_mse = mean_squared_error(data_analysis_labels, y_pred_tree)
750 | tree_rmse = np.sqrt(tree_mse)
751 | tree_rmse
752 | ```
753 |
754 |
755 |
756 |
757 | 1.4079709678742711
758 |
759 |
760 |
761 | ### 测试集
762 |
763 |
764 | ```python
765 | y_test = tree_reg.predict(data_analysis_test)
766 | y_test[:10]
767 | ```
768 |
769 |
770 |
771 |
772 | array([15. , 12.5, 10. , 27. , 30. , 14.5, 8. , 7. , 12.5, 7. ])
773 |
774 |
775 |
776 |
777 | ```python
778 | test_labels[:10]
779 | ```
780 |
781 |
782 |
783 |
784 | array([11.5, 11.5, 16. , 22.5, 12. , 20. , 10. , 8. , 20. , 10. ])
785 |
786 |
787 |
788 |
789 | ```python
790 | tree_mse = mean_squared_error(test_labels, y_test)
791 | tree_rmse = np.sqrt(tree_mse)
792 | tree_rmse
793 | ```
794 |
795 |
796 |
797 |
798 | 8.252411446722123
799 |
800 |
801 |
802 | - 测试集上误差约为**8.25**
803 |
804 | ---
805 |
806 | ## Random Forest
807 |
808 | ### 建模训练
809 |
810 |
811 | ```python
812 | from sklearn.ensemble import RandomForestRegressor
813 |
814 | forest_reg = RandomForestRegressor(random_state=52)
815 | forest_reg.fit(data_analysis_prepared, data_analysis_labels)
816 | ```
817 |
818 |
819 |
820 |
821 | RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
822 | max_features='auto', max_leaf_nodes=None,
823 | min_impurity_decrease=0.0, min_impurity_split=None,
824 | min_samples_leaf=1, min_samples_split=2,
825 | min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
826 | oob_score=False, random_state=52, verbose=0, warm_start=False)
827 |
828 |
829 |
830 |
831 | ```python
832 | y_pred_rf = forest_reg.predict(data_analysis_prepared)
833 | forest_mse = mean_squared_error(data_analysis_labels, y_pred_rf)
834 | forest_rmse = np.sqrt(forest_mse)
835 | forest_rmse
836 | ```
837 |
838 |
839 |
840 |
841 | 2.714777885552381
842 |
843 |
844 |
845 | ### 测试集
846 |
847 |
848 | ```python
849 | data_test[:10]
850 | ```
851 |
852 |
853 |
854 |
855 |
856 |
869 |
870 |
871 |
872 | |
873 | 公司名称 |
874 | 公司规模 |
875 | 地区 |
876 | 学历要求 |
877 | 工作经验 |
878 | 职位名称 |
879 | salary |
880 | 融资情况 |
881 | Sql |
882 | Python |
883 | Excel |
884 | Sas |
885 | Spss |
886 | Hive |
887 | Hadoop |
888 | Tableau |
889 | Ppt |
890 | Spark |
891 |
892 |
893 |
894 |
895 | 198 |
896 | 上海克顿文化传媒 |
897 | 100-499人 |
898 | 上海 黄浦区 新天地 |
899 | 本科 |
900 | 3-5年 |
901 | 数据分析师 |
902 | 11.5 |
903 | 已上市 |
904 | 0 |
905 | 0 |
906 | 0 |
907 | 0 |
908 | 1 |
909 | 0 |
910 | 0 |
911 | 0 |
912 | 1 |
913 | 0 |
914 |
915 |
916 | 316 |
917 | 今日头条 |
918 | 10000人以上 |
919 | 上海 徐汇区 上海南站 |
920 | 本科 |
921 | 1-3年 |
922 | 广告数据分析师 |
923 | 11.5 |
924 | D轮及以上 |
925 | 1 |
926 | 0 |
927 | 1 |
928 | 1 |
929 | 1 |
930 | 0 |
931 | 0 |
932 | 0 |
933 | 0 |
934 | 0 |
935 |
936 |
937 | 52 |
938 | 壹米滴答 |
939 | 10000人以上 |
940 | 上海 青浦区 徐泾 |
941 | 本科 |
942 | 1-3年 |
943 | 资深BI数据分析师 |
944 | 16.0 |
945 | C轮 |
946 | 1 |
947 | 0 |
948 | 0 |
949 | 0 |
950 | 0 |
951 | 0 |
952 | 1 |
953 | 0 |
954 | 0 |
955 | 1 |
956 |
957 |
958 | 313 |
959 | 拼多多 |
960 | 1000-9999人 |
961 | 上海 长宁区 天山路 |
962 | 本科 |
963 | 经验不限 |
964 | 数据分析师 |
965 | 22.5 |
966 | 已上市 |
967 | 1 |
968 | 1 |
969 | 0 |
970 | 0 |
971 | 0 |
972 | 1 |
973 | 0 |
974 | 0 |
975 | 0 |
976 | 0 |
977 |
978 |
979 | 19 |
980 | 太平洋房屋 |
981 | 10000人以上 |
982 | 上海 徐汇区 枫林路 |
983 | 本科 |
984 | 1-3年 |
985 | 数据分析师 |
986 | 12.0 |
987 | 已上市 |
988 | 1 |
989 | 1 |
990 | 1 |
991 | 0 |
992 | 0 |
993 | 0 |
994 | 0 |
995 | 1 |
996 | 0 |
997 | 0 |
998 |
999 |
1000 | 482 |
1001 | 印鹰 |
1002 | 100-499人 |
1003 | 上海 静安区 汶水路 |
1004 | 本科 |
1005 | 3-5年 |
1006 | 数据分析经理 |
1007 | 20.0 |
1008 | B轮 |
1009 | 1 |
1010 | 0 |
1011 | 0 |
1012 | 0 |
1013 | 0 |
1014 | 0 |
1015 | 1 |
1016 | 0 |
1017 | 0 |
1018 | 0 |
1019 |
1020 |
1021 | 212 |
1022 | 复硕正态 |
1023 | 20-99人 |
1024 | 上海 静安区 大宁路 |
1025 | 本科 |
1026 | 1-3年 |
1027 | 高级数据分析员 |
1028 | 10.0 |
1029 | 不需要融资 |
1030 | 0 |
1031 | 0 |
1032 | 0 |
1033 | 0 |
1034 | 1 |
1035 | 0 |
1036 | 0 |
1037 | 0 |
1038 | 1 |
1039 | 0 |
1040 |
1041 |
1042 | 127 |
1043 | 云序生物 |
1044 | 20-99人 |
1045 | 上海 松江区 新桥 |
1046 | 大专 |
1047 | 1-3年 |
1048 | 生信分析师/数据分析 |
1049 | 8.0 |
1050 | 不需要融资 |
1051 | 0 |
1052 | 1 |
1053 | 0 |
1054 | 0 |
1055 | 0 |
1056 | 0 |
1057 | 0 |
1058 | 0 |
1059 | 0 |
1060 | 0 |
1061 |
1062 |
1063 | 401 |
1064 | 上海创蓝文化传播 |
1065 | 500-999人 |
1066 | 上海 松江区 松江工业区 |
1067 | 本科 |
1068 | 1-3年 |
1069 | 数据分析师 |
1070 | 20.0 |
1071 | A轮 |
1072 | 1 |
1073 | 1 |
1074 | 1 |
1075 | 0 |
1076 | 0 |
1077 | 0 |
1078 | 0 |
1079 | 1 |
1080 | 1 |
1081 | 0 |
1082 |
1083 |
1084 | 260 |
1085 | 上海智帛 |
1086 | 20-99人 |
1087 | 上海 闵行区 漕宝路 |
1088 | 大专 |
1089 | 1-3年 |
1090 | 数据分析 |
1091 | 10.0 |
1092 | 未融资 |
1093 | 1 |
1094 | 0 |
1095 | 1 |
1096 | 0 |
1097 | 0 |
1098 | 0 |
1099 | 0 |
1100 | 0 |
1101 | 0 |
1102 | 0 |
1103 |
1104 |
1105 |
1106 |
1107 |
1108 |
1109 |
1110 |
1111 | ```python
1112 | y_test = forest_reg.predict(data_analysis_test)
1113 | y_test[:10]
1114 | ```
1115 |
1116 |
1117 |
1118 |
1119 | array([20.08333333, 10.35 , 18.025 , 26.25 , 19.58333333,
1120 | 20.35 , 9.11666667, 10.825 , 12.55428571, 9.1 ])
1121 |
1122 |
1123 |
1124 |
1125 | ```python
1126 | test_labels[:10]
1127 | ```
1128 |
1129 |
1130 |
1131 |
1132 | array([11.5, 11.5, 16. , 22.5, 12. , 20. , 10. , 8. , 20. , 10. ])
1133 |
1134 |
1135 |
1136 |
1137 | ```python
1138 | forest_mse = mean_squared_error(test_labels, y_test)
1139 | forest_rmse = np.sqrt(forest_mse)
1140 | forest_rmse
1141 | ```
1142 |
1143 |
1144 |
1145 |
1146 | 7.087180783205348
1147 |
1148 |
1149 |
1150 | - 测试集上误差约为**7.09**
1151 |
1152 | ### 交叉验证
1153 |
1154 |
1155 | ```python
1156 | from sklearn.model_selection import cross_val_score
1157 |
1158 | scores = cross_val_score(forest_reg, data_analysis_prepared, data_analysis_labels,
1159 | scoring="neg_mean_squared_error", cv=10)
1160 | forest_rmse_scores = np.sqrt(-scores)
1161 | ```
1162 |
1163 |
1164 | ```python
1165 | def display_scores(scores):
1166 | print("Scores:", scores)
1167 | print("Mean:", scores.mean())
1168 | print("Standard deviation:", scores.std())
1169 |
1170 | display_scores(forest_rmse_scores)
1171 | ```
1172 |
1173 | Scores: [5.92160593 6.47537707 8.01906699 5.64303733 6.39749406 7.22392532
1174 | 6.18275771 6.05244757 6.53447138 4.86251585]
1175 | Mean: 6.331269920627992
1176 | Standard deviation: 0.8130474122018511
1177 |
1178 |
1179 | ---
1180 |
1181 | # 网格搜索调参
1182 |
1183 |
1184 | ```python
1185 | from sklearn.model_selection import GridSearchCV
1186 |
1187 | param_grid = [
1188 | # try 12 (3×4) combinations of hyperparameters
1189 | {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
1190 | # then try 6 (2×3) combinations with bootstrap set as False
1191 | {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
1192 | ]
1193 |
1194 | forest_reg = RandomForestRegressor(random_state=52)
1195 | # train across 5 folds, that's a total of (12+6)*5=90 rounds of training
1196 | grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
1197 | scoring='neg_mean_squared_error', return_train_score=True)
1198 | grid_search.fit(data_analysis_prepared, data_analysis_labels)
1199 | ```
1200 |
1201 |
1202 |
1203 |
1204 | GridSearchCV(cv=5, error_score='raise-deprecating',
1205 | estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
1206 | max_features='auto', max_leaf_nodes=None,
1207 | min_impurity_decrease=0.0, min_impurity_split=None,
1208 | min_samples_leaf=1, min_samples_split=2,
1209 | min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
1210 | oob_score=False, random_state=52, verbose=0, warm_start=False),
1211 | fit_params=None, iid='warn', n_jobs=None,
1212 | param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
1213 | pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
1214 | scoring='neg_mean_squared_error', verbose=0)
1215 |
1216 |
1217 |
1218 |
1219 | ```python
1220 | cvres = grid_search.cv_results_
1221 | for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
1222 | print(np.sqrt(-mean_score), params)
1223 | ```
1224 |
1225 | 7.29061793698431 {'max_features': 2, 'n_estimators': 3}
1226 | 6.413781857864982 {'max_features': 2, 'n_estimators': 10}
1227 | 6.092544533188321 {'max_features': 2, 'n_estimators': 30}
1228 | 7.107886960097701 {'max_features': 4, 'n_estimators': 3}
1229 | 6.315256370330592 {'max_features': 4, 'n_estimators': 10}
1230 | 5.976022358347516 {'max_features': 4, 'n_estimators': 30}
1231 | 7.001163746738424 {'max_features': 6, 'n_estimators': 3}
1232 | 6.1921055299084635 {'max_features': 6, 'n_estimators': 10}
1233 | 5.996739844433075 {'max_features': 6, 'n_estimators': 30}
1234 | 7.088902371406774 {'max_features': 8, 'n_estimators': 3}
1235 | 6.292396346910386 {'max_features': 8, 'n_estimators': 10}
1236 | 5.980558606686522 {'max_features': 8, 'n_estimators': 30}
1237 | 7.0825937380292405 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
1238 | 6.412140716903331 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
1239 | 7.062864506385558 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
1240 | 6.337407579436449 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
1241 | 6.766095704089036 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
1242 | 6.251001206038802 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
1243 |
1244 |
1245 |
1246 | ```python
1247 | grid_search.best_params_
1248 | ```
1249 |
1250 |
1251 |
1252 |
1253 | {'max_features': 4, 'n_estimators': 30}
1254 |
1255 |
1256 |
1257 | ## 变量重要性
1258 |
1259 |
1260 | ```python
1261 | feature_importances = grid_search.best_estimator_.feature_importances_
1262 | #feature_importances
1263 | ```
1264 |
1265 |
1266 | ```python
1267 | num_attribs = list(data_analysis_num)
1268 | cat_attribs = ['公司规模','学历要求','工作经验','融资情况']
1269 | ```
1270 |
1271 |
1272 | ```python
1273 | # 变量重要性排序
1274 | attributes = num_attribs + cat_attribs
1275 | sorted(zip(feature_importances, attributes), reverse=True)
1276 | ```
1277 |
1278 |
1279 |
1280 |
1281 | [(0.05883012920907406, 'Sql'),
1282 | (0.05739271707726927, 'Hadoop'),
1283 | (0.052855819385887605, 'Python'),
1284 | (0.05042672750583558, '工作经验'),
1285 | (0.04841490914313372, 'Hive'),
1286 | (0.03169130633549138, 'Excel'),
1287 | (0.0301026272691416, 'Spark'),
1288 | (0.027897066519544437, 'Sas'),
1289 | (0.02382836465248971, 'Spss'),
1290 | (0.023060368955297835, '学历要求'),
1291 | (0.022374373956317948, 'Tableau'),
1292 | (0.01650026584689836, '融资情况'),
1293 | (0.013561201776627235, 'Ppt'),
1294 | (0.004150442668926646, '公司规模')]
1295 |
1296 |
1297 |
1298 | - **公司规模**对薪资的影响很小。
1299 |
1300 | ---
1301 |
1302 | # 最终模型
1303 |
1304 |
1305 | ```python
1306 | final_model = grid_search.best_estimator_
1307 |
1308 | final_predictions = final_model.predict(data_analysis_test)
1309 |
1310 | final_mse = mean_squared_error(test_labels, final_predictions)
1311 | final_rmse = np.sqrt(final_mse)
1312 | ```
1313 |
1314 |
1315 | ```python
1316 | final_rmse
1317 | ```
1318 |
1319 |
1320 |
1321 |
1322 | 6.867133419234717
1323 |
1324 |
1325 |
1326 | - 误差为6.87
1327 |
1328 | ---
1329 |
1330 | # 薪资预测
1331 |
1332 |
1333 | ```python
1334 | salary_test_series = Series(final_predictions,index=data_test.index)
1335 | ```
1336 |
1337 |
1338 | ```python
1339 | data_test_prediction = data_test.copy()
1340 | data_test_prediction.insert(7,'prediction',salary_test_series)
1341 | ```
1342 |
1343 |
1344 | ```python
1345 | data_test_prediction.sample(10)
1346 | ```
1347 |
1348 |
1349 |
1350 |
1351 |
1352 |
1365 |
1366 |
1367 |
1368 | |
1369 | 公司名称 |
1370 | 公司规模 |
1371 | 地区 |
1372 | 学历要求 |
1373 | 工作经验 |
1374 | 职位名称 |
1375 | salary |
1376 | prediction |
1377 | 融资情况 |
1378 | Sql |
1379 | Python |
1380 | Excel |
1381 | Sas |
1382 | Spss |
1383 | Hive |
1384 | Hadoop |
1385 | Tableau |
1386 | Ppt |
1387 | Spark |
1388 |
1389 |
1390 |
1391 |
1392 | 8 |
1393 | 辰德网络科技 |
1394 | 100-499人 |
1395 | 上海 |
1396 | 本科 |
1397 | 1-3年 |
1398 | 数据分析 |
1399 | 7.0 |
1400 | 8.916667 |
1401 | 未融资 |
1402 | 1 |
1403 | 0 |
1404 | 0 |
1405 | 0 |
1406 | 0 |
1407 | 0 |
1408 | 0 |
1409 | 0 |
1410 | 0 |
1411 | 0 |
1412 |
1413 |
1414 | 224 |
1415 | 安心记加班 |
1416 | 100-499人 |
1417 | 上海 徐汇区 交大 |
1418 | 本科 |
1419 | 3-5年 |
1420 | 高级数据分析师 |
1421 | 22.5 |
1422 | 17.355556 |
1423 | B轮 |
1424 | 1 |
1425 | 1 |
1426 | 1 |
1427 | 0 |
1428 | 0 |
1429 | 1 |
1430 | 1 |
1431 | 0 |
1432 | 0 |
1433 | 1 |
1434 |
1435 |
1436 | 490 |
1437 | 北京海金格医药科技 |
1438 | 100-499人 |
1439 | 上海 静安区 宜川路 |
1440 | 本科 |
1441 | 1年以内 |
1442 | 临床数据分析 |
1443 | 10.0 |
1444 | 9.800000 |
1445 | 已上市 |
1446 | 0 |
1447 | 0 |
1448 | 0 |
1449 | 1 |
1450 | 0 |
1451 | 0 |
1452 | 0 |
1453 | 0 |
1454 | 0 |
1455 | 0 |
1456 |
1457 |
1458 | 230 |
1459 | 任意门科技 |
1460 | 100-499人 |
1461 | 上海 浦东新区 花木 |
1462 | 本科 |
1463 | 1-3年 |
1464 | 数据分析师 |
1465 | 22.5 |
1466 | 18.438889 |
1467 | C轮 |
1468 | 1 |
1469 | 1 |
1470 | 0 |
1471 | 0 |
1472 | 0 |
1473 | 1 |
1474 | 0 |
1475 | 1 |
1476 | 0 |
1477 | 1 |
1478 |
1479 |
1480 | 299 |
1481 | 天壤智能 |
1482 | 100-499人 |
1483 | 上海 徐汇区 龙华 |
1484 | 本科 |
1485 | 3-5年 |
1486 | 数据分析师 |
1487 | 32.5 |
1488 | 18.216667 |
1489 | A轮 |
1490 | 1 |
1491 | 1 |
1492 | 1 |
1493 | 0 |
1494 | 0 |
1495 | 1 |
1496 | 1 |
1497 | 1 |
1498 | 0 |
1499 | 0 |
1500 |
1501 |
1502 | 329 |
1503 | 横援投资 |
1504 | 20-99人 |
1505 | 上海 松江区 九亭 |
1506 | 本科 |
1507 | 3-5年 |
1508 | 数据分析师 |
1509 | 11.5 |
1510 | 12.033333 |
1511 | 未融资 |
1512 | 0 |
1513 | 0 |
1514 | 0 |
1515 | 0 |
1516 | 0 |
1517 | 0 |
1518 | 0 |
1519 | 0 |
1520 | 0 |
1521 | 0 |
1522 |
1523 |
1524 | 351 |
1525 | 雅智捷 |
1526 | 20-99人 |
1527 | 上海 静安区 天目西路 |
1528 | 本科 |
1529 | 3-5年 |
1530 | 高级咨询顾问 - 数据分析 |
1531 | 15.0 |
1532 | 18.466667 |
1533 | 20-99人 |
1534 | 1 |
1535 | 0 |
1536 | 0 |
1537 | 1 |
1538 | 0 |
1539 | 0 |
1540 | 0 |
1541 | 0 |
1542 | 0 |
1543 | 0 |
1544 |
1545 |
1546 | 316 |
1547 | 今日头条 |
1548 | 10000人以上 |
1549 | 上海 徐汇区 上海南站 |
1550 | 本科 |
1551 | 1-3年 |
1552 | 广告数据分析师 |
1553 | 11.5 |
1554 | 18.383333 |
1555 | D轮及以上 |
1556 | 1 |
1557 | 0 |
1558 | 1 |
1559 | 1 |
1560 | 1 |
1561 | 0 |
1562 | 0 |
1563 | 0 |
1564 | 0 |
1565 | 0 |
1566 |
1567 |
1568 | 535 |
1569 | 高顿教育 |
1570 | 1000-9999人 |
1571 | 上海 虹口区 虹口足球场 |
1572 | 本科 |
1573 | 3-5年 |
1574 | 大数据产品经理(数据分析师) |
1575 | 17.5 |
1576 | 19.737222 |
1577 | C轮 |
1578 | 0 |
1579 | 0 |
1580 | 0 |
1581 | 0 |
1582 | 0 |
1583 | 0 |
1584 | 0 |
1585 | 0 |
1586 | 0 |
1587 | 0 |
1588 |
1589 |
1590 | 520 |
1591 | 浅橙科技 |
1592 | 500-999人 |
1593 | 上海 杨浦区 新江湾城 |
1594 | 本科 |
1595 | 经验不限 |
1596 | 数据分析(风险政策) |
1597 | 7.5 |
1598 | 18.483333 |
1599 | B轮 |
1600 | 1 |
1601 | 1 |
1602 | 0 |
1603 | 1 |
1604 | 0 |
1605 | 0 |
1606 | 0 |
1607 | 0 |
1608 | 0 |
1609 | 0 |
1610 |
1611 |
1612 |
1613 |
1614 |
1615 |
1616 |
1617 | ---
1618 |
1619 |
1620 | ```python
1621 |
1622 | ```
1623 |
--------------------------------------------------------------------------------
/analysis&forecast/Data-Science-Analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import warnings\n",
10 | "warnings.filterwarnings(\"ignore\")"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "# Read Data"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 4,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import pandas as pd\n",
27 | "from pandas import Series"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 5,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "data_analysis = pd.read_csv('./data_to_be_analysed/data_analysis_with_skills.csv')\n",
37 | "data_mining = pd.read_csv('./data_to_be_analysed/data_mining_with_skills.csv')\n",
38 | "machine_learning = pd.read_csv('./data_to_be_analysed/machine_learning_with_skills.csv')\n",
39 | "business_analysis = pd.read_csv('./data_to_be_analysed/business_analysis_with_skills.csv')"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "---"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 6,
52 | "metadata": {
53 | "scrolled": true
54 | },
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": [
59 | "(1053, 21)"
60 | ]
61 | },
62 | "execution_count": 6,
63 | "metadata": {},
64 | "output_type": "execute_result"
65 | }
66 | ],
67 | "source": [
68 | "data_analysis.shape"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "---"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## 添加薪资均值"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 7,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "import re\n",
92 | "# 均值函数\n",
93 | "def average(job_salary):\n",
94 | " # 取薪资均值----------------\n",
95 | " pattern = re.compile('\\d+')\n",
96 | " salary = job_salary\n",
97 | " try:\n",
98 | " res = re.findall(pattern, salary)\n",
99 | " avg_salary = 0\n",
100 | " sum = 0\n",
101 | " for i in res:\n",
102 | " a = int(i)\n",
103 | " sum = sum + a\n",
104 | " avg_salary = sum / 2\n",
105 | " except Exception:\n",
106 | " avg_salary = 0\n",
107 | " # 函数返回值\n",
108 | " return avg_salary\n",
109 | "\n",
110 | "salary_list = []\n",
111 | "for i in range(0,data_analysis.shape[0]):\n",
112 | " avg_sal = average(data_analysis['职位薪资'][i])\n",
113 | " salary_list.append(avg_sal)\n",
114 | "sal = Series(salary_list)\n",
115 | "\n",
116 | "data_analysis.insert(9,'salary',sal)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 8,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "salary_list = []\n",
126 | "for i in range(0,data_mining.shape[0]):\n",
127 | " avg_sal = average(data_mining['职位薪资'][i])\n",
128 | " salary_list.append(avg_sal)\n",
129 | "sal = Series(salary_list)\n",
130 | "\n",
131 | "data_mining.insert(9,'salary',sal)"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 9,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "salary_list = []\n",
141 | "for i in range(0,machine_learning.shape[0]):\n",
142 | " avg_sal = average(machine_learning['职位薪资'][i])\n",
143 | " salary_list.append(avg_sal)\n",
144 | "sal = Series(salary_list)\n",
145 | "\n",
146 | "machine_learning.insert(9,'salary',sal)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 10,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "salary_list = []\n",
156 | "for i in range(0,business_analysis.shape[0]):\n",
157 | " avg_sal = average(business_analysis['职位薪资'][i])\n",
158 | " salary_list.append(avg_sal)\n",
159 | "sal = Series(salary_list)\n",
160 | "\n",
161 | "business_analysis.insert(9,'salary',sal)"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "---"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "# 薪资分布探索"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 11,
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "data": {
185 | "text/plain": [
186 | "count 1053.000000\n",
187 | "mean 16.632479\n",
188 | "std 7.925945\n",
189 | "min 1.500000\n",
190 | "25% 11.000000\n",
191 | "50% 15.000000\n",
192 | "75% 22.500000\n",
193 | "max 52.500000\n",
194 | "Name: salary, dtype: float64"
195 | ]
196 | },
197 | "execution_count": 11,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "data_analysis.salary.describe()"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 12,
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "data": {
213 | "text/plain": [
214 | "Index(['Unnamed: 0', 'Keyword', '公司名称', '公司规模', '地区', '学历要求', '工作经验', '职位名称',\n",
215 | " '职位描述', 'salary', '职位薪资', '融资情况', 'Sql', 'Python', 'Excel', 'Sas',\n",
216 | " 'Spss', 'Hive', 'Hadoop', 'Ppt', 'Tableau', 'Spark'],\n",
217 | " dtype='object')"
218 | ]
219 | },
220 | "execution_count": 12,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "data_analysis.columns"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 13,
232 | "metadata": {},
233 | "outputs": [
234 | {
235 | "data": {
236 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAeoAAAEyCAYAAAA1AJN4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAFEJJREFUeJzt3W2MXOV5h/HrLk4ah20xhrCiNu1SxUpD2YSGFaWlanchaghGMR9Cm4imJqWyqlJKG0dhST+gVkJy1JKXqi+SFVAcibAhhBSEaRvLZUsjFVI7oVnASXGJS2yo3SjYySYo0SZ3P8xxO4LdmfWcmd1nZq6fhHbOOc+cc8+NZ/77nJk9E5mJJEkq04+tdgGSJGlpBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYGtWuwCAs88+O8fGxlqO+e53v8vpp5++MgUNGHtXj/3rnL2rx/51rh96t3///m9m5uvajSsiqMfGxti3b1/LMbOzs0xOTq5MQQPG3tVj/zpn7+qxf53rh95FxH8tZ5ynviVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIK1DeqIuCsijkXEk03r/jwivhoRX4mIz0XEuqZtt0bEwYj4WkS8rVeFS5I0DJYzo/4EcOXL1u0BLszMNwH/AdwKEBEXAO8Cfr66z99ExGldq1aSpCHTNqgz81HgWy9b9/nMXKgWHwM2Vre3ADOZ+f3M/DpwELiki/VKkjRUuvEe9e8Af1/d3gB8o2nb4WqdJEnqQK0rk0XEnwALwN0nVy0yLJe47zZgG8Do6Cizs7MtjzU/P992jBZn7+qxf52zd/XYv84NUu86DuqI2ApcDVyRmSfD+DBwXtOwjcDzi90/M3cCOwEmJiay3aXe+uFycKWyd/XYv87Zu3rsX+cGqXcdBXVEXAncAvxaZn6vadODwKci4sPATwGbgC/WrlIDaWx6d8vth3ZsXqFKJKlcbYM6Iu4BJoGzI+IwcBuNT3n/OLAnIgAey8zfy8ynIuJe4Gkap8RvzMwf9qp4SZIGXdugzsx3L7L6zhbjbwdur1OUJElq8MpkkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFaxvUEXFXRByLiCeb1q2PiD0R8Uz188xqfUTEX0bEwYj4SkS8pZfFS5I06NYsY8wngL8CPtm0bhrYm5k7ImK6Wr4FeDuwqfrvF4G/rX6qD41N7265/dCOzStUiSQNr7Yz6sx8FPjWy1ZvAXZVt3cB1zSt/2Q2PAasi4hzu1WsJEnDJjKz/aCIMeChzLywWj6emeuatr+YmWdGxEPAjsz8QrV+L3BLZu5bZJ/bgG0Ao6OjF8/MzLSsYX5+npGRkeU+LjXptHdzR0603D6+4YxOS1qR/XeL//Y6Z+/qsX+d64feTU1N7c/MiXbjlnPq+1TEIusW/U0gM3cCOwEmJiZycnKy5Y5nZ2dpN0aL67R317c79X3dqe9zJfffLf7b65y9q8f+dW6Qetfpp76PnjylXf08Vq0/DJzXNG4j8Hzn5UmSNNw6DeoHga3V7a3AA03rf7v69PelwInMfKFmjZIkDa22p74j4h5gEjg7Ig4DtwE7gHsj4gbgOeDaavjDwFXAQeB7wHt7ULMkSUOjbVBn5ruX2HTFImMTuLFuUZIkqcErk0mSVDCDWpKkghnUkiQVrNt/Ry2piZdhlVSXM2pJkgpmUEuSVDCDWpKkghnUkiQVzKCW
JKlgBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMoJYkqWBrVrsAqWRj07vZPr7A9dO7F91+aMfmFa5I0rBxRi1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqWK2gjog/joinIuLJiLgnIl4TEedHxOMR8UxEfDoiXt2tYiVJGjYdB3VEbAD+EJjIzAuB04B3AR8CPpKZm4AXgRu6UagkScOo7qnvNcDaiFgDvBZ4AbgcuK/avgu4puYxJEkaWh0HdWYeAf4CeI5GQJ8A9gPHM3OhGnYY2FC3SEmShlVkZmd3jDgT+Czwm8Bx4DPV8m2Z+fpqzHnAw5k5vsj9twHbAEZHRy+emZlpebz5+XlGRkY6qnXYddq7uSMnWm4f33BGpyWtyP67Ye7ICUbXwtGXFt/ersZ+eIy95PO2HvvXuX7o3dTU1P7MnGg3rs63Z70V+Hpm/g9ARNwP/DKwLiLWVLPqjcDzi905M3cCOwEmJiZycnKy5cFmZ2dpN0aL67R3S31j1EmHrjv1fa7k/rvh+urbs+6YW/yp0q7GfniMveTzth7717lB6l2d96ifAy6NiNdGRABXAE8DjwDvrMZsBR6oV6IkScOrznvUj9P40NiXgLlqXzuBW4D3RcRB4Czgzi7UKUnSUKpz6pvMvA247WWrnwUuqbNfSZLU4JXJJEkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwdasdgGS+tvY9O4lt20fX2By5UqRBlKtGXVErIuI+yLiqxFxICJ+KSLWR8SeiHim+nlmt4qVJGnY1D31/THgHzLz54A3AweAaWBvZm4C9lbLkiSpAx0HdUT8JPCrwJ0AmfmDzDwObAF2VcN2AdfULVKSpGEVmdnZHSMuAnYCT9OYTe8HbgaOZOa6pnEvZuYrTn9HxDZgG8Do6OjFMzMzLY83Pz/PyMhIR7WWaO7IiZbbxzec0bVjddq7Xte4kj3o1NyRE4yuhaMvLb69XY398BjravUYR9fCOev7/zGulkF73VtJ/dC7qamp/Zk50W5cnaCeAB4DLsvMxyPiY8C3gZuWE9TNJiYmct++fS2PNzs7y+TkZEe1lqjVB3AADu3Y3LVjddq7Xte4kj3o1Nj0braPL3DH3OKfu2xXYz88xrrafZjspuu2rGA1g2XQXvdWUj/0LiKWFdR13qM+DBzOzMer5fuAtwBHI+LcqohzgWM1jiFJ0lDrOKgz87+Bb0TEG6pVV9A4Df4gsLVatxV4oFaFkiQNsbp/R30TcHdEvBp4FngvjfC/NyJuAJ4Drq15DEmShlatoM7MJ4DFzq9fUWe/kiSpwUuISpJUMINakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkgtW91rcK1fzVg9vHF7i+zdctSpLK5IxakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSClY7qCPitIj4ckQ8VC2fHxGPR8QzEfHpiHh1/TIlSRpO3ZhR3wwcaFr+EPCRzNwEvAjc0IVjSJI0lGoFdURsBDYDH6+WA7gcuK8asgu4ps4xJEkaZnVn1B8F
PgD8qFo+CziemQvV8mFgQ81jSJI0tCIzO7tjxNXAVZn5+xExCbwfeC/wr5n5+mrMecDDmTm+yP23AdsARkdHL56ZmWl5vPn5eUZGRjqqtURzR0603D6+4Yyu7X90LRx9qdbuFtXNGnux/26YO3KiZf/a1dgPj7GuVo9xdC2cs77/H+NqGbTXvZXUD72bmpran5kT7catqXGMy4B3RMRVwGuAn6Qxw14XEWuqWfVG4PnF7pyZO4GdABMTEzk5OdnyYLOzs7Qb00+un97dcvuh6ya7tv/t4wvcMVfnf/XiulljL/bfDddP727Zv3Y19sNjrKvVY9w+vsBvDNDzdqUN2uveShqk3nX86p2ZtwK3ApycUWfmdRHxGeCdwAywFXigC3UOnbF2L/A7Nq9QJZKk1dSLv6O+BXhfRByk8Z71nT04hiRJQ6Er50MzcxaYrW4/C1zSjf1qae1m3MPAsw6ShoFXJpMkqWDd/4SRpBXjWQVp8DmjliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMC570iJf4lCR1gzNqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMC8hqp7xMqqSVJ8zakmSCuaMWgOr3Yz+0I7NK1SJJHXOGbUkSQVzRq2O+R60JPWeM2pJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIK1nFQR8R5EfFIRByIiKci4uZq/fqI2BMRz1Q/z+xeuZIkDZc6M+oFYHtmvhG4FLgxIi4ApoG9mbkJ2FstS5KkDnQc1Jn5QmZ+qbr9HeAAsAHYAuyqhu0CrqlbpCRJwyoys/5OIsaAR4ELgecyc13Tthcz8xWnvyNiG7ANYHR09OKZmZmWx5ifn2dkZKR2rStl7siJ1S7h/4yuhaMvrXYVp258wxktt9ftcbv9nzxGq/7VrXE5Nazm/uvWMLoWzlnf+xoGVb+97pWkH3o3NTW1PzMn2o2rHdQRMQL8M3B7Zt4fEceXE9TNJiYmct++fS2PMzs7y+TkZK1aV1JJ3yy1fXyBO+b674vS2n1fdN0eL+f7qMemd7fsX90a634ndgnfud2qhu3jC9x03Zae1zCo+u11ryT90LuIWFZQ13r1johXAZ8F7s7M+6vVRyPi3Mx8ISLOBY7VOYak3lnOLzsrEfaSllbnU98B3AkcyMwPN216ENha3d4KPNB5eZIkDbc6M+rLgPcAcxHxRLXug8AO4N6IuAF4Dri2XomSJA2vjoM6M78AxBKbr+h0v9JKKelzBJK0FK9MJklSwfrvo8AaGs54JckZtSRJRTOoJUkqmEEtSVLBfI96CSVc8UmSJGfUkiQVzBl1h/xEsqSTPAOnXnJGLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYFzyRVDQvJqJh54xakqSCOaOWCualaiU5o5YkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDD/PEuqwT+fUjcs9e9o+/gC10/v9qIuQ84ZtSRJBXNGLWmgeQlS9Ttn1JIkFWwgZ9TLed/Q36JVgn54j7sfapQGmTNqSZIKNpAz6uVwliBJy+P7/KvLGbUkSQUb2hm1NAw8cyT1v57NqCPiyoj4WkQcjIjpXh1HkqRB1pOgjojTgL8G3g5cALw7Ii7oxbEkSRpkvTr1fQlwMDOfBYiIGWAL8HSPjiepUJ5+13Ks9gfWVvv4rfTq1PcG4BtNy4erdZIk6RREZnZ/pxHXAm/LzN+tlt8DXJKZNzWN2QZsqxbfAHytzW7PBr7Z9WKHg72rx/51zt7VY/861w+9+5nMfF27Qb069X0YOK9peSPwfPOAzNwJ7FzuDiNiX2ZOdKe84WLv6rF/nbN39di/zg1S73p16vvfgE0RcX5EvBp4F/Bgj44lSdLA6smMOjMX
IuIPgH8ETgPuysynenEsSZIGWc8ueJKZDwMPd3GXyz5Nrlewd/XYv87Zu3rsX+cGpnc9+TCZJEnqDq/1LUlSwQxqSZIKVnxQe83wUxMRd0XEsYh4smnd+ojYExHPVD/PXM0aSxUR50XEIxFxICKeioibq/X2bxki4jUR8cWI+Peqf39arT8/Ih6v+vfp6i9BtIiIOC0ivhwRD1XL9m6ZIuJQRMxFxBMRsa9aNxDP3aKD2muGd+QTwJUvWzcN7M3MTcDealmvtABsz8w3ApcCN1b/3uzf8nwfuDwz3wxcBFwZEZcCHwI+UvXvReCGVayxdDcDB5qW7d2pmcrMi5r+fnognrtFBzVN1wzPzB8AJ68ZriVk5qPAt162eguwq7q9C7hmRYvqE5n5QmZ+qbr9HRovmBuwf8uSDfPV4quq/xK4HLivWm//lhARG4HNwMer5cDe1TUQz93Sg9prhnfHaGa+AI0wAs5Z5XqKFxFjwC8Aj2P/lq06dfsEcAzYA/wncDwzF6ohPoeX9lHgA8CPquWzsHenIoHPR8T+6hLVMCDP3Z79HXWXxCLr/Hsy9VREjACfBf4oM7/dmNhoOTLzh8BFEbEO+BzwxsWGrWxV5YuIq4Fjmbk/IiZPrl5kqL1b2mWZ+XxEnAPsiYivrnZB3VL6jLrtNcO1LEcj4lyA6uexVa6nWBHxKhohfXdm3l+ttn+nKDOPA7M03utfFxEnJwU+hxd3GfCOiDhE4y2+y2nMsO3dMmXm89XPYzR+SbyEAXnulh7UXjO8Ox4Etla3twIPrGItxareE7wTOJCZH27aZP+WISJeV82kiYi1wFtpvM//CPDOapj9W0Rm3pqZGzNzjMbr3D9l5nXYu2WJiNMj4idO3gZ+HXiSAXnuFn9lsoi4isZvlievGX77KpdUtIi4B5ik8RVvR4HbgL8D7gV+GngOuDYzX/6Bs6EXEb8C/Aswx/+/T/hBGu9T2782IuJNND6wcxqNScC9mflnEfGzNGaJ64EvA7+Vmd9fvUrLVp36fn9mXm3vlqfq0+eqxTXApzLz9og4iwF47hYf1JIkDbPST31LkjTUDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQX7X6yswXm5vaZhAAAAAElFTkSuQmCC\n",
237 | "text/plain": [
238 | ""
239 | ]
240 | },
241 | "metadata": {
242 | "needs_background": "light"
243 | },
244 | "output_type": "display_data"
245 | }
246 | ],
247 | "source": [
248 | "%matplotlib inline\n",
249 | "import matplotlib.pyplot as plt\n",
250 | "data_analysis.salary.hist(bins=50, figsize=(8,5))\n",
251 | "\n",
252 | "plt.show()"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "- 薪资主要分布在**5k-30k**之间"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 14,
265 | "metadata": {},
266 | "outputs": [
267 | {
268 | "data": {
269 | "text/plain": [
270 | "(42, 22)"
271 | ]
272 | },
273 | "execution_count": 14,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "data_analysis[data_analysis.salary>30].shape"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 15,
285 | "metadata": {},
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/plain": [
290 | "(22, 22)"
291 | ]
292 | },
293 | "execution_count": 15,
294 | "metadata": {},
295 | "output_type": "execute_result"
296 | }
297 | ],
298 | "source": [
299 | "data_analysis[data_analysis.salary<5].shape"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 16,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "data_analysis = data_analysis[data_analysis['salary']<30]\n",
309 | "data_analysis = data_analysis[data_analysis['salary']>5]"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "---"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 17,
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "data": {
326 | "text/html": [
327 | "\n",
328 | "\n",
341 | "
\n",
342 | " \n",
343 | " \n",
344 | " | \n",
345 | " Unnamed: 0 | \n",
346 | " Keyword | \n",
347 | " 公司名称 | \n",
348 | " 公司规模 | \n",
349 | " 地区 | \n",
350 | " 学历要求 | \n",
351 | " 工作经验 | \n",
352 | " 职位名称 | \n",
353 | " 职位描述 | \n",
354 | " salary | \n",
355 | " ... | \n",
356 | " Sql | \n",
357 | " Python | \n",
358 | " Excel | \n",
359 | " Sas | \n",
360 | " Spss | \n",
361 | " Hive | \n",
362 | " Hadoop | \n",
363 | " Ppt | \n",
364 | " Tableau | \n",
365 | " Spark | \n",
366 | "
\n",
367 | " \n",
368 | " \n",
369 | " \n",
370 | " 0 | \n",
371 | " 1 | \n",
372 | " 数据分析 | \n",
373 | " 上海恒奕集团 | \n",
374 | " 500-999人 | \n",
375 | " 上海 普陀区 武宁路 | \n",
376 | " 大专 | \n",
377 | " 1-3年 | \n",
378 | " 数据分析 | \n",
379 | " 岗位职责:通过抓取第三方数据来分析在不同地区,我们医院项目的目标人群画像分析,实时网络热度盘... | \n",
380 | " 10.0 | \n",
381 | " ... | \n",
382 | " 0 | \n",
383 | " 0 | \n",
384 | " 0 | \n",
385 | " 0 | \n",
386 | " 0 | \n",
387 | " 0 | \n",
388 | " 0 | \n",
389 | " 0 | \n",
390 | " 0 | \n",
391 | " 0 | \n",
392 | "
\n",
393 | " \n",
394 | " 1 | \n",
395 | " 2 | \n",
396 | " 数据分析 | \n",
397 | " 晶赞科技 | \n",
398 | " 100-499人 | \n",
399 | " 上海 静安区 共和新路 | \n",
400 | " 本科 | \n",
401 | " 3-5年 | \n",
402 | " 数据分析 | \n",
403 | " 工作职责:1、负责政务数据/商业数据的研究和分析。例如,通过政务公开数据的挖掘和分析,形成主... | \n",
404 | " 18.0 | \n",
405 | " ... | \n",
406 | " 0 | \n",
407 | " 1 | \n",
408 | " 1 | \n",
409 | " 1 | \n",
410 | " 1 | \n",
411 | " 0 | \n",
412 | " 0 | \n",
413 | " 0 | \n",
414 | " 0 | \n",
415 | " 0 | \n",
416 | "
\n",
417 | " \n",
418 | "
\n",
419 | "
2 rows × 22 columns
\n",
420 | "
"
421 | ],
422 | "text/plain": [
423 | " Unnamed: 0 Keyword 公司名称 公司规模 地区 学历要求 工作经验 职位名称 \\\n",
424 | "0 1 数据分析 上海恒奕集团 500-999人 上海 普陀区 武宁路 大专 1-3年 数据分析 \n",
425 | "1 2 数据分析 晶赞科技 100-499人 上海 静安区 共和新路 本科 3-5年 数据分析 \n",
426 | "\n",
427 | " 职位描述 salary ... Sql \\\n",
428 | "0 岗位职责:通过抓取第三方数据来分析在不同地区,我们医院项目的目标人群画像分析,实时网络热度盘... 10.0 ... 0 \n",
429 | "1 工作职责:1、负责政务数据/商业数据的研究和分析。例如,通过政务公开数据的挖掘和分析,形成主... 18.0 ... 0 \n",
430 | "\n",
431 | " Python Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n",
432 | "0 0 0 0 0 0 0 0 0 0 \n",
433 | "1 1 1 1 1 0 0 0 0 0 \n",
434 | "\n",
435 | "[2 rows x 22 columns]"
436 | ]
437 | },
438 | "execution_count": 17,
439 | "metadata": {},
440 | "output_type": "execute_result"
441 | }
442 | ],
443 | "source": [
444 | "data_analysis.head(2)"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 18,
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "data_analysis = data_analysis.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": 19,
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "data_mining = data_mining.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 20,
468 | "metadata": {},
469 | "outputs": [],
470 | "source": [
471 | "machine_learning = machine_learning.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": 21,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "business_analysis = business_analysis.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)"
481 | ]
482 | },
483 | {
484 | "cell_type": "markdown",
485 | "metadata": {},
486 | "source": [
487 | "---"
488 | ]
489 | },
490 | {
491 | "cell_type": "markdown",
492 | "metadata": {},
493 | "source": [
494 | "## 掌握的软件技能对薪资的影响关系"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 22,
500 | "metadata": {},
501 | "outputs": [
502 | {
503 | "data": {
504 | "text/plain": [
505 | "salary 1.000000\n",
506 | "Python 0.249000\n",
507 | "Hive 0.248617\n",
508 | "Sql 0.248049\n",
509 | "Spark 0.185153\n",
510 | "Sas 0.164862\n",
511 | "Hadoop 0.159602\n",
512 | "Spss 0.071946\n",
513 | "Tableau 0.068340\n",
514 | "Ppt -0.052048\n",
515 | "Excel -0.077646\n",
516 | "Name: salary, dtype: float64"
517 | ]
518 | },
519 | "execution_count": 22,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | }
523 | ],
524 | "source": [
525 | "corr_matrix = data_analysis.corr()\n",
526 | "corr_matrix[\"salary\"].sort_values(ascending=False)"
527 | ]
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "metadata": {},
532 | "source": [
533 | "- **Data Analysis**的职位中,`Hive`,`Spark`,`Hadoop`大数据应用方面的软件是**薪资的加分项**。\n",
534 | "- 同时,`Python`,`SQL`,`SAS`,`Tableau`,`SPSS`等统计分析软件与可视化软件也是数据分析师**区别于低薪分析专员**的因素。\n",
535 | "- `PPT`,`Excel`作为必须的软件技能,对薪资变化**并没有太大的影响**;甚至仅仅会Excel的职位会沦落为专员,反而是一个减分项。\n",
536 | "- 结论:在数据分析领域,拥有**大数据软件技能**并且懂得**Python**这一编程语言的分析师的待遇较好。"
537 | ]
538 | },
539 | {
540 | "cell_type": "code",
541 | "execution_count": 23,
542 | "metadata": {},
543 | "outputs": [
544 | {
545 | "data": {
546 | "text/plain": [
547 | "salary 1.000000\n",
548 | "Spark 0.199728\n",
549 | "Java 0.196519\n",
550 | "Hive 0.185769\n",
551 | "C 0.159396\n",
552 | "Hadoop 0.155124\n",
553 | "Python 0.102946\n",
554 | "Shell 0.037366\n",
555 | "Linux 0.005632\n",
556 | "Sql -0.050092\n",
557 | "Sas -0.072597\n",
558 | "Name: salary, dtype: float64"
559 | ]
560 | },
561 | "execution_count": 23,
562 | "metadata": {},
563 | "output_type": "execute_result"
564 | }
565 | ],
566 | "source": [
567 | "corr_matrix = data_mining.corr()\n",
568 | "corr_matrix[\"salary\"].sort_values(ascending=False)"
569 | ]
570 | },
571 | {
572 | "cell_type": "markdown",
573 | "metadata": {},
574 | "source": [
575 | "- **Data Mining**的职位中,`Hive`,`Spark`,`Hadoop`大数据方面的软件是薪资**极大的加分项**。\n",
576 | "- `Java`,`C`,`Python`等编程语言对数据挖掘的工作有很大帮助,因此也体现在了对薪资的**正面影响**上。\n",
577 | "- 分析结论:具备**数据挖掘算法与编码能力**且具备**大数据方面分析技能**的数据挖掘工程师的待遇较好。"
578 | ]
579 | },
580 | {
581 | "cell_type": "code",
582 | "execution_count": 24,
583 | "metadata": {
584 | "scrolled": true
585 | },
586 | "outputs": [
587 | {
588 | "data": {
589 | "text/plain": [
590 | "salary 1.000000\n",
591 | "Spark 0.144507\n",
592 | "Hive 0.116132\n",
593 | "Hadoop 0.109608\n",
594 | "Java 0.088857\n",
595 | "Tensorflow 0.072449\n",
596 | "Sql -0.022844\n",
597 | "C -0.032998\n",
598 | "Python -0.054629\n",
599 | "Linux -0.058181\n",
600 | "Matlab -0.062595\n",
601 | "Name: salary, dtype: float64"
602 | ]
603 | },
604 | "execution_count": 24,
605 | "metadata": {},
606 | "output_type": "execute_result"
607 | }
608 | ],
609 | "source": [
610 | "corr_matrix = machine_learning.corr()\n",
611 | "corr_matrix[\"salary\"].sort_values(ascending=False)"
612 | ]
613 | },
614 | {
615 | "cell_type": "markdown",
616 | "metadata": {},
617 | "source": [
618 | "- **Machine Learning**的职位中,没有特别突出的技能加分项,列表中的软件技能基本都是入职必备的技能。\n",
619 | "- `Hive`,`Spark`,`Hadoop`等大数据方面的技能会对薪资有一定程度的提升,不过影响较小。\n",
620 | "- 分析结论:机器学习工程师入门难度稍高,需要掌握的软件技能也较多,没有特别突出的薪资加分项。"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 25,
626 | "metadata": {},
627 | "outputs": [
628 | {
629 | "data": {
630 | "text/plain": [
631 | "salary 1.000000\n",
632 | "Python 0.394292\n",
633 | "C 0.371366\n",
634 | "Java 0.244305\n",
635 | "Linux 0.242700\n",
636 | "Hive 0.168359\n",
637 | "Sql 0.107485\n",
638 | "Sas 0.078156\n",
639 | "Excel 0.068413\n",
640 | "Ppt -0.019429\n",
641 | "Spss -0.091822\n",
642 | "Name: salary, dtype: float64"
643 | ]
644 | },
645 | "execution_count": 25,
646 | "metadata": {},
647 | "output_type": "execute_result"
648 | }
649 | ],
650 | "source": [
651 | "corr_matrix = business_analysis.corr()\n",
652 | "corr_matrix[\"salary\"].sort_values(ascending=False)"
653 | ]
654 | },
655 | {
656 | "cell_type": "markdown",
657 | "metadata": {},
658 | "source": [
659 | "- **Business Analysis**的职位中,编程语言是**极大的薪资加分项**。如`C`,`Python`,`Java`。\n",
660 | "- `Excel`,`PPT`,`SPSS`等软件是这个职位的**必备技能**,因此对职位薪资没有太大的影响。\n",
661 | "- 结论:在商业分析领域,拥有**商业分析思维**并且具有**编程能力**的分析师的待遇较好。"
662 | ]
663 | },
664 | {
665 | "cell_type": "markdown",
666 | "metadata": {},
667 | "source": [
668 | "---"
669 | ]
670 | },
671 | {
672 | "cell_type": "markdown",
673 | "metadata": {},
674 | "source": [
675 | "# 模型方法选择"
676 | ]
677 | },
678 | {
679 | "cell_type": "markdown",
680 | "metadata": {},
681 | "source": [
682 | "\n",
683 | "线性回归\n",
684 | "决策树回归\n",
685 | "随机森林回归\n",
686 | "KNN回归\n",
687 | "Adaboost回归\n",
688 | "GBRT回归\n",
689 | "Bagging回归\n",
690 | "ExtraTree极端随机树回归\n",
691 | "
"
692 | ]
693 | },
694 | {
695 | "cell_type": "markdown",
696 | "metadata": {},
697 | "source": [
698 | "---"
699 | ]
700 | },
701 | {
702 | "cell_type": "markdown",
703 | "metadata": {},
704 | "source": [
705 | "# 准备数据"
706 | ]
707 | },
708 | {
709 | "cell_type": "code",
710 | "execution_count": 26,
711 | "metadata": {},
712 | "outputs": [
713 | {
714 | "data": {
715 | "text/html": [
716 | "\n",
717 | "\n",
730 | "
\n",
731 | " \n",
732 | " \n",
733 | " | \n",
734 | " 公司名称 | \n",
735 | " 公司规模 | \n",
736 | " 地区 | \n",
737 | " 学历要求 | \n",
738 | " 工作经验 | \n",
739 | " 职位名称 | \n",
740 | " salary | \n",
741 | " 融资情况 | \n",
742 | " Sql | \n",
743 | " Python | \n",
744 | " Excel | \n",
745 | " Sas | \n",
746 | " Spss | \n",
747 | " Hive | \n",
748 | " Hadoop | \n",
749 | " Ppt | \n",
750 | " Tableau | \n",
751 | " Spark | \n",
752 | "
\n",
753 | " \n",
754 | " \n",
755 | " \n",
756 | " 0 | \n",
757 | " 上海恒奕集团 | \n",
758 | " 500-999人 | \n",
759 | " 上海 普陀区 武宁路 | \n",
760 | " 大专 | \n",
761 | " 1-3年 | \n",
762 | " 数据分析 | \n",
763 | " 10.0 | \n",
764 | " A轮 | \n",
765 | " 0 | \n",
766 | " 0 | \n",
767 | " 0 | \n",
768 | " 0 | \n",
769 | " 0 | \n",
770 | " 0 | \n",
771 | " 0 | \n",
772 | " 0 | \n",
773 | " 0 | \n",
774 | " 0 | \n",
775 | "
\n",
776 | " \n",
777 | " 1 | \n",
778 | " 晶赞科技 | \n",
779 | " 100-499人 | \n",
780 | " 上海 静安区 共和新路 | \n",
781 | " 本科 | \n",
782 | " 3-5年 | \n",
783 | " 数据分析 | \n",
784 | " 18.0 | \n",
785 | " D轮及以上 | \n",
786 | " 0 | \n",
787 | " 1 | \n",
788 | " 1 | \n",
789 | " 1 | \n",
790 | " 1 | \n",
791 | " 0 | \n",
792 | " 0 | \n",
793 | " 0 | \n",
794 | " 0 | \n",
795 | " 0 | \n",
796 | "
\n",
797 | " \n",
798 | "
\n",
799 | "
"
800 | ],
801 | "text/plain": [
802 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 salary 融资情况 Sql Python \\\n",
803 | "0 上海恒奕集团 500-999人 上海 普陀区 武宁路 大专 1-3年 数据分析 10.0 A轮 0 0 \n",
804 | "1 晶赞科技 100-499人 上海 静安区 共和新路 本科 3-5年 数据分析 18.0 D轮及以上 0 1 \n",
805 | "\n",
806 | " Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n",
807 | "0 0 0 0 0 0 0 0 0 \n",
808 | "1 1 1 1 0 0 0 0 0 "
809 | ]
810 | },
811 | "execution_count": 26,
812 | "metadata": {},
813 | "output_type": "execute_result"
814 | }
815 | ],
816 | "source": [
817 | "data_analysis.head(2)"
818 | ]
819 | },
820 | {
821 | "cell_type": "code",
822 | "execution_count": 27,
823 | "metadata": {},
824 | "outputs": [],
825 | "source": [
826 | "from sklearn.model_selection import train_test_split\n",
827 | "\n",
828 | "train_set, test_set = train_test_split(data_analysis, test_size=0.2, random_state=42)"
829 | ]
830 | },
831 | {
832 | "cell_type": "code",
833 | "execution_count": 28,
834 | "metadata": {},
835 | "outputs": [],
836 | "source": [
837 | "data_train = train_set.copy()\n",
838 | "data_test = test_set.copy()"
839 | ]
840 | },
841 | {
842 | "cell_type": "code",
843 | "execution_count": 29,
844 | "metadata": {},
845 | "outputs": [
846 | {
847 | "data": {
848 | "text/plain": [
849 | "(737, 18)"
850 | ]
851 | },
852 | "execution_count": 29,
853 | "metadata": {},
854 | "output_type": "execute_result"
855 | }
856 | ],
857 | "source": [
858 | "data_train.shape"
859 | ]
860 | },
861 | {
862 | "cell_type": "code",
863 | "execution_count": 30,
864 | "metadata": {},
865 | "outputs": [
866 | {
867 | "data": {
868 | "text/plain": [
869 | "(185, 18)"
870 | ]
871 | },
872 | "execution_count": 30,
873 | "metadata": {},
874 | "output_type": "execute_result"
875 | }
876 | ],
877 | "source": [
878 | "data_test.shape"
879 | ]
880 | },
881 | {
882 | "cell_type": "code",
883 | "execution_count": 31,
884 | "metadata": {},
885 | "outputs": [],
886 | "source": [
887 | "from sklearn.pipeline import Pipeline\n",
888 | "from sklearn.preprocessing import StandardScaler\n",
889 | "from sklearn.preprocessing import Imputer\n",
890 | "from sklearn.compose import ColumnTransformer\n",
891 | "from sklearn.preprocessing import OneHotEncoder"
892 | ]
893 | },
894 | {
895 | "cell_type": "code",
896 | "execution_count": 32,
897 | "metadata": {},
898 | "outputs": [
899 | {
900 | "data": {
901 | "text/html": [
902 | "\n",
903 | "\n",
916 | "
\n",
917 | " \n",
918 | " \n",
919 | " | \n",
920 | " 公司名称 | \n",
921 | " 公司规模 | \n",
922 | " 地区 | \n",
923 | " 学历要求 | \n",
924 | " 工作经验 | \n",
925 | " 职位名称 | \n",
926 | " salary | \n",
927 | " 融资情况 | \n",
928 | " Sql | \n",
929 | " Python | \n",
930 | " Excel | \n",
931 | " Sas | \n",
932 | " Spss | \n",
933 | " Hive | \n",
934 | " Hadoop | \n",
935 | " Ppt | \n",
936 | " Tableau | \n",
937 | " Spark | \n",
938 | "
\n",
939 | " \n",
940 | " \n",
941 | " \n",
942 | " 0 | \n",
943 | " 上海恒奕集团 | \n",
944 | " 500-999人 | \n",
945 | " 上海 普陀区 武宁路 | \n",
946 | " 大专 | \n",
947 | " 1-3年 | \n",
948 | " 数据分析 | \n",
949 | " 10.0 | \n",
950 | " A轮 | \n",
951 | " 0 | \n",
952 | " 0 | \n",
953 | " 0 | \n",
954 | " 0 | \n",
955 | " 0 | \n",
956 | " 0 | \n",
957 | " 0 | \n",
958 | " 0 | \n",
959 | " 0 | \n",
960 | " 0 | \n",
961 | "
\n",
962 | " \n",
963 | "
\n",
964 | "
"
965 | ],
966 | "text/plain": [
967 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 salary 融资情况 Sql Python \\\n",
968 | "0 上海恒奕集团 500-999人 上海 普陀区 武宁路 大专 1-3年 数据分析 10.0 A轮 0 0 \n",
969 | "\n",
970 | " Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n",
971 | "0 0 0 0 0 0 0 0 0 "
972 | ]
973 | },
974 | "execution_count": 32,
975 | "metadata": {},
976 | "output_type": "execute_result"
977 | }
978 | ],
979 | "source": [
980 | "data_analysis.head(1)"
981 | ]
982 | },
983 | {
984 | "cell_type": "code",
985 | "execution_count": 33,
986 | "metadata": {},
987 | "outputs": [],
988 | "source": [
989 | "data_analysis_num = data_analysis.drop(['公司名称','公司规模','地区','学历要求','工作经验','职位名称','融资情况','salary'], axis=1)\n",
990 | "num_attribs = list(data_analysis_num)\n",
991 | "cat_attribs = ['公司规模','学历要求','工作经验']\n",
992 | "\n",
993 | "num_pipeline = Pipeline([\n",
994 | " ('std_scaler', StandardScaler()),\n",
995 | " ])\n",
996 | "\n",
997 | "full_pipeline = ColumnTransformer([\n",
998 | " (\"num\", num_pipeline, num_attribs),\n",
999 | " (\"cat\", OneHotEncoder(), cat_attribs),\n",
1000 | " ])\n",
1001 | "\n",
1002 | "data_analysis_prepared = full_pipeline.fit_transform(data_train)\n",
1003 | "data_analysis_test = full_pipeline.transform(data_test)"
1004 | ]
1005 | },
1006 | {
1007 | "cell_type": "code",
1008 | "execution_count": 34,
1009 | "metadata": {
1010 | "scrolled": true
1011 | },
1012 | "outputs": [
1013 | {
1014 | "data": {
1015 | "text/plain": [
1016 | "array([[ 1.01779743, 1.15424368, -0.70135785, -0.56638197, 2.02058373,\n",
1017 | " -0.40306397, -0.36822985, -0.36584991, -0.33908304, -0.26977806,\n",
1018 | " 1. , 0. , 0. , 0. , 0. ,\n",
1019 | " 0. , 0. , 0. , 0. , 1. ,\n",
1020 | " 0. , 0. , 0. , 0. , 1. ,\n",
1021 | " 0. , 0. , 0. ]])"
1022 | ]
1023 | },
1024 | "execution_count": 34,
1025 | "metadata": {},
1026 | "output_type": "execute_result"
1027 | }
1028 | ],
1029 | "source": [
1030 | "data_analysis_prepared[:1]"
1031 | ]
1032 | },
1033 | {
1034 | "cell_type": "code",
1035 | "execution_count": 35,
1036 | "metadata": {},
1037 | "outputs": [
1038 | {
1039 | "data": {
1040 | "text/html": [
1041 | "\n",
1042 | "\n",
1055 | "
\n",
1056 | " \n",
1057 | " \n",
1058 | " | \n",
1059 | " 公司名称 | \n",
1060 | " 公司规模 | \n",
1061 | " 地区 | \n",
1062 | " 学历要求 | \n",
1063 | " 工作经验 | \n",
1064 | " 职位名称 | \n",
1065 | " salary | \n",
1066 | " 融资情况 | \n",
1067 | " Sql | \n",
1068 | " Python | \n",
1069 | " Excel | \n",
1070 | " Sas | \n",
1071 | " Spss | \n",
1072 | " Hive | \n",
1073 | " Hadoop | \n",
1074 | " Ppt | \n",
1075 | " Tableau | \n",
1076 | " Spark | \n",
1077 | "
\n",
1078 | " \n",
1079 | " \n",
1080 | " \n",
1081 | " 527 | \n",
1082 | " 上海兴致 | \n",
1083 | " 0-20人 | \n",
1084 | " 上海 浦东新区 八佰伴 | \n",
1085 | " 本科 | \n",
1086 | " 3-5年 | \n",
1087 | " 高级数据分析师 | \n",
1088 | " 22.5 | \n",
1089 | " 0-20人 | \n",
1090 | " 1 | \n",
1091 | " 1 | \n",
1092 | " 0 | \n",
1093 | " 0 | \n",
1094 | " 1 | \n",
1095 | " 0 | \n",
1096 | " 0 | \n",
1097 | " 0 | \n",
1098 | " 0 | \n",
1099 | " 0 | \n",
1100 | "
\n",
1101 | " \n",
1102 | "
\n",
1103 | "
"
1104 | ],
1105 | "text/plain": [
1106 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 salary 融资情况 Sql Python \\\n",
1107 | "527 上海兴致 0-20人 上海 浦东新区 八佰伴 本科 3-5年 高级数据分析师 22.5 0-20人 1 1 \n",
1108 | "\n",
1109 | " Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n",
1110 | "527 0 0 1 0 0 0 0 0 "
1111 | ]
1112 | },
1113 | "execution_count": 35,
1114 | "metadata": {},
1115 | "output_type": "execute_result"
1116 | }
1117 | ],
1118 | "source": [
1119 | "data_train.head(1)"
1120 | ]
1121 | },
1122 | {
1123 | "cell_type": "code",
1124 | "execution_count": 36,
1125 | "metadata": {},
1126 | "outputs": [],
1127 | "source": [
1128 | "data_analysis_labels = data_train.salary.values\n",
1129 | "test_labels = data_test.salary.values"
1130 | ]
1131 | },
1132 | {
1133 | "cell_type": "markdown",
1134 | "metadata": {},
1135 | "source": [
1136 | "---"
1137 | ]
1138 | },
1139 | {
1140 | "cell_type": "markdown",
1141 | "metadata": {},
1142 | "source": [
1143 | "# 训练模型"
1144 | ]
1145 | },
1146 | {
1147 | "cell_type": "markdown",
1148 | "metadata": {},
1149 | "source": [
1150 | "## 线性回归"
1151 | ]
1152 | },
1153 | {
1154 | "cell_type": "code",
1155 | "execution_count": 37,
1156 | "metadata": {},
1157 | "outputs": [
1158 | {
1159 | "data": {
1160 | "text/plain": [
1161 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n",
1162 | " normalize=False)"
1163 | ]
1164 | },
1165 | "execution_count": 37,
1166 | "metadata": {},
1167 | "output_type": "execute_result"
1168 | }
1169 | ],
1170 | "source": [
1171 | "from sklearn.linear_model import LinearRegression\n",
1172 | "\n",
1173 | "lin_reg = LinearRegression()\n",
1174 | "lin_reg.fit(data_analysis_prepared, data_analysis_labels)"
1175 | ]
1176 | },
1177 | {
1178 | "cell_type": "code",
1179 | "execution_count": 38,
1180 | "metadata": {},
1181 | "outputs": [
1182 | {
1183 | "data": {
1184 | "text/plain": [
1185 | "4.209936980782373"
1186 | ]
1187 | },
1188 | "execution_count": 38,
1189 | "metadata": {},
1190 | "output_type": "execute_result"
1191 | }
1192 | ],
1193 | "source": [
1194 | "from sklearn.metrics import mean_squared_error\n",
1195 | "import numpy as np\n",
1196 | "\n",
1197 | "salary_predictions = lin_reg.predict(data_analysis_prepared)\n",
1198 | "lin_mse = mean_squared_error(data_analysis_labels, salary_predictions)\n",
1199 | "lin_rmse = np.sqrt(lin_mse)\n",
1200 | "lin_rmse"
1201 | ]
1202 | },
1203 | {
1204 | "cell_type": "code",
1205 | "execution_count": 39,
1206 | "metadata": {},
1207 | "outputs": [],
1208 | "source": [
1209 | "#salary_predictions[:10]"
1210 | ]
1211 | },
1212 | {
1213 | "cell_type": "markdown",
1214 | "metadata": {},
1215 | "source": [
1216 | "### 测试集"
1217 | ]
1218 | },
1219 | {
1220 | "cell_type": "code",
1221 | "execution_count": 40,
1222 | "metadata": {},
1223 | "outputs": [],
1224 | "source": [
1225 | "#data_test.head(10)"
1226 | ]
1227 | },
1228 | {
1229 | "cell_type": "code",
1230 | "execution_count": 41,
1231 | "metadata": {},
1232 | "outputs": [
1233 | {
1234 | "data": {
1235 | "text/plain": [
1236 | "array([19.13476562, 16.91992188, 14.8984375 , 14.0546875 , 20.76367188,\n",
1237 | " 12.19921875, 18.13671875, 16.45507812, 20.40917969, 19.90820312])"
1238 | ]
1239 | },
1240 | "execution_count": 41,
1241 | "metadata": {},
1242 | "output_type": "execute_result"
1243 | }
1244 | ],
1245 | "source": [
1246 | "y_test = lin_reg.predict(data_analysis_test)\n",
1247 | "y_test[:10]"
1248 | ]
1249 | },
1250 | {
1251 | "cell_type": "code",
1252 | "execution_count": 42,
1253 | "metadata": {},
1254 | "outputs": [
1255 | {
1256 | "data": {
1257 | "text/plain": [
1258 | "array([20.5, 13. , 11.5, 12.5, 15. , 12.5, 22.5, 15. , 22.5, 15.5])"
1259 | ]
1260 | },
1261 | "execution_count": 42,
1262 | "metadata": {},
1263 | "output_type": "execute_result"
1264 | }
1265 | ],
1266 | "source": [
1267 | "test_labels[:10]"
1268 | ]
1269 | },
1270 | {
1271 | "cell_type": "code",
1272 | "execution_count": 43,
1273 | "metadata": {},
1274 | "outputs": [
1275 | {
1276 | "data": {
1277 | "text/plain": [
1278 | "4.252707451377156"
1279 | ]
1280 | },
1281 | "execution_count": 43,
1282 | "metadata": {},
1283 | "output_type": "execute_result"
1284 | }
1285 | ],
1286 | "source": [
1287 | "lin_mse = mean_squared_error(test_labels, y_test)\n",
1288 | "lin_rmse = np.sqrt(lin_mse)\n",
1289 | "lin_rmse"
1290 | ]
1291 | },
1292 | {
1293 | "cell_type": "markdown",
1294 | "metadata": {},
1295 | "source": [
1296 | "- 测试集上误差约为**4.25**"
1297 | ]
1298 | },
1299 | {
1300 | "cell_type": "markdown",
1301 | "metadata": {},
1302 | "source": [
1303 | "### 交叉验证"
1304 | ]
1305 | },
1306 | {
1307 | "cell_type": "code",
1308 | "execution_count": 44,
1309 | "metadata": {},
1310 | "outputs": [],
1311 | "source": [
1312 | "from sklearn.model_selection import cross_val_score\n",
1313 | "\n",
1314 | "scores = cross_val_score(lin_reg, data_analysis_prepared, data_analysis_labels,\n",
1315 | " scoring=\"neg_mean_squared_error\", cv=10)\n",
1316 | "lin_rmse_scores = np.sqrt(-scores)"
1317 | ]
1318 | },
1319 | {
1320 | "cell_type": "code",
1321 | "execution_count": 45,
1322 | "metadata": {},
1323 | "outputs": [
1324 | {
1325 | "name": "stdout",
1326 | "output_type": "stream",
1327 | "text": [
1328 | "Scores: [4.54552557 4.54182215 3.94087967 4.42840937 4.4373358 4.62011098\n",
1329 | " 4.22660386 4.38725655 4.42436899 4.60889902]\n",
1330 | "Mean: 4.41612119574774\n",
1331 | "Standard deviation: 0.1935445501197603\n"
1332 | ]
1333 | }
1334 | ],
1335 | "source": [
1336 | "def display_scores(scores):\n",
1337 | " print(\"Scores:\", scores)\n",
1338 | " print(\"Mean:\", scores.mean())\n",
1339 | " print(\"Standard deviation:\", scores.std())\n",
1340 | "\n",
1341 | "display_scores(lin_rmse_scores)"
1342 | ]
1343 | },
1344 | {
1345 | "cell_type": "markdown",
1346 | "metadata": {},
1347 | "source": [
1348 | "---"
1349 | ]
1350 | },
1351 | {
1352 | "cell_type": "markdown",
1353 | "metadata": {},
1354 | "source": [
1355 | "## 决策树回归"
1356 | ]
1357 | },
1358 | {
1359 | "cell_type": "markdown",
1360 | "metadata": {},
1361 | "source": [
1362 | "### 建模训练"
1363 | ]
1364 | },
1365 | {
1366 | "cell_type": "code",
1367 | "execution_count": 46,
1368 | "metadata": {},
1369 | "outputs": [
1370 | {
1371 | "data": {
1372 | "text/plain": [
1373 | "DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n",
1374 | " max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
1375 | " min_impurity_split=None, min_samples_leaf=1,\n",
1376 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
1377 | " presort=False, random_state=42, splitter='best')"
1378 | ]
1379 | },
1380 | "execution_count": 46,
1381 | "metadata": {},
1382 | "output_type": "execute_result"
1383 | }
1384 | ],
1385 | "source": [
1386 | "from sklearn.tree import DecisionTreeRegressor\n",
1387 | "\n",
1388 | "tree_reg = DecisionTreeRegressor(random_state=42)\n",
1389 | "tree_reg.fit(data_analysis_prepared, data_analysis_labels)"
1390 | ]
1391 | },
1392 | {
1393 | "cell_type": "code",
1394 | "execution_count": 47,
1395 | "metadata": {},
1396 | "outputs": [],
1397 | "source": [
1398 | "y_pred_tree = tree_reg.predict(data_analysis_prepared)"
1399 | ]
1400 | },
1401 | {
1402 | "cell_type": "code",
1403 | "execution_count": 48,
1404 | "metadata": {},
1405 | "outputs": [
1406 | {
1407 | "data": {
1408 | "text/plain": [
1409 | "2.404700282979215"
1410 | ]
1411 | },
1412 | "execution_count": 48,
1413 | "metadata": {},
1414 | "output_type": "execute_result"
1415 | }
1416 | ],
1417 | "source": [
1418 | "from sklearn.metrics import mean_squared_error\n",
1419 | "\n",
1420 | "tree_mse = mean_squared_error(data_analysis_labels, y_pred_tree)\n",
1421 | "tree_rmse = np.sqrt(tree_mse)\n",
1422 | "tree_rmse"
1423 | ]
1424 | },
1425 | {
1426 | "cell_type": "markdown",
1427 | "metadata": {},
1428 | "source": [
1429 | "### 测试集"
1430 | ]
1431 | },
1432 | {
1433 | "cell_type": "code",
1434 | "execution_count": 49,
1435 | "metadata": {},
1436 | "outputs": [
1437 | {
1438 | "data": {
1439 | "text/plain": [
1440 | "array([17.5 , 16.54166667, 14.75 , 8. , 18.75 ,\n",
1441 | " 16.5 , 17.5 , 7. , 17.125 , 18.83333333])"
1442 | ]
1443 | },
1444 | "execution_count": 49,
1445 | "metadata": {},
1446 | "output_type": "execute_result"
1447 | }
1448 | ],
1449 | "source": [
1450 | "y_test = tree_reg.predict(data_analysis_test)\n",
1451 | "y_test[:10]"
1452 | ]
1453 | },
1454 | {
1455 | "cell_type": "code",
1456 | "execution_count": 50,
1457 | "metadata": {},
1458 | "outputs": [
1459 | {
1460 | "data": {
1461 | "text/plain": [
1462 | "array([20.5, 13. , 11.5, 12.5, 15. , 12.5, 22.5, 15. , 22.5, 15.5])"
1463 | ]
1464 | },
1465 | "execution_count": 50,
1466 | "metadata": {},
1467 | "output_type": "execute_result"
1468 | }
1469 | ],
1470 | "source": [
1471 | "test_labels[:10]"
1472 | ]
1473 | },
1474 | {
1475 | "cell_type": "code",
1476 | "execution_count": 51,
1477 | "metadata": {
1478 | "scrolled": true
1479 | },
1480 | "outputs": [
1481 | {
1482 | "data": {
1483 | "text/plain": [
1484 | "5.585045537872495"
1485 | ]
1486 | },
1487 | "execution_count": 51,
1488 | "metadata": {},
1489 | "output_type": "execute_result"
1490 | }
1491 | ],
1492 | "source": [
1493 | "tree_mse = mean_squared_error(test_labels, y_test)\n",
1494 | "tree_rmse = np.sqrt(tree_mse)\n",
1495 | "tree_rmse"
1496 | ]
1497 | },
1498 | {
1499 | "cell_type": "markdown",
1500 | "metadata": {},
1501 | "source": [
1502 | "- 测试集上误差约为**5.59**"
1503 | ]
1504 | },
1505 | {
1506 | "cell_type": "markdown",
1507 | "metadata": {},
1508 | "source": [
1509 | "---"
1510 | ]
1511 | },
1512 | {
1513 | "cell_type": "markdown",
1514 | "metadata": {},
1515 | "source": [
1516 | "## Random Forest 随机森林回归"
1517 | ]
1518 | },
1519 | {
1520 | "cell_type": "markdown",
1521 | "metadata": {},
1522 | "source": [
1523 | "### 建模训练"
1524 | ]
1525 | },
1526 | {
1527 | "cell_type": "code",
1528 | "execution_count": 52,
1529 | "metadata": {},
1530 | "outputs": [
1531 | {
1532 | "data": {
1533 | "text/plain": [
1534 | "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
1535 | " max_features='auto', max_leaf_nodes=None,\n",
1536 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
1537 | " min_samples_leaf=1, min_samples_split=2,\n",
1538 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n",
1539 | " oob_score=False, random_state=52, verbose=0, warm_start=False)"
1540 | ]
1541 | },
1542 | "execution_count": 52,
1543 | "metadata": {},
1544 | "output_type": "execute_result"
1545 | }
1546 | ],
1547 | "source": [
1548 | "from sklearn.ensemble import RandomForestRegressor\n",
1549 | "\n",
1550 | "forest_reg = RandomForestRegressor(random_state=52)\n",
1551 | "forest_reg.fit(data_analysis_prepared, data_analysis_labels)"
1552 | ]
1553 | },
1554 | {
1555 | "cell_type": "code",
1556 | "execution_count": 53,
1557 | "metadata": {},
1558 | "outputs": [
1559 | {
1560 | "data": {
1561 | "text/plain": [
1562 | "2.825121630127617"
1563 | ]
1564 | },
1565 | "execution_count": 53,
1566 | "metadata": {},
1567 | "output_type": "execute_result"
1568 | }
1569 | ],
1570 | "source": [
1571 | "y_pred_rf = forest_reg.predict(data_analysis_prepared)\n",
1572 | "forest_mse = mean_squared_error(data_analysis_labels, y_pred_rf)\n",
1573 | "forest_rmse = np.sqrt(forest_mse)\n",
1574 | "forest_rmse"
1575 | ]
1576 | },
1577 | {
1578 | "cell_type": "markdown",
1579 | "metadata": {},
1580 | "source": [
1581 | "### 测试集验证"
1582 | ]
1583 | },
1584 | {
1585 | "cell_type": "code",
1586 | "execution_count": 54,
1587 | "metadata": {},
1588 | "outputs": [],
1589 | "source": [
1590 | "#data_test[:10]"
1591 | ]
1592 | },
1593 | {
1594 | "cell_type": "code",
1595 | "execution_count": 55,
1596 | "metadata": {},
1597 | "outputs": [
1598 | {
1599 | "data": {
1600 | "text/plain": [
1601 | "array([17.55 , 16.76213231, 14.82916667, 11.5 , 23.55 ,\n",
1602 | " 14.2 , 16.40416667, 12.9375 , 18.65738095, 20.11666667])"
1603 | ]
1604 | },
1605 | "execution_count": 55,
1606 | "metadata": {},
1607 | "output_type": "execute_result"
1608 | }
1609 | ],
1610 | "source": [
1611 | "y_test = forest_reg.predict(data_analysis_test)\n",
1612 | "y_test[:10]"
1613 | ]
1614 | },
1615 | {
1616 | "cell_type": "code",
1617 | "execution_count": 56,
1618 | "metadata": {},
1619 | "outputs": [
1620 | {
1621 | "data": {
1622 | "text/plain": [
1623 | "array([20.5, 13. , 11.5, 12.5, 15. , 12.5, 22.5, 15. , 22.5, 15.5])"
1624 | ]
1625 | },
1626 | "execution_count": 56,
1627 | "metadata": {},
1628 | "output_type": "execute_result"
1629 | }
1630 | ],
1631 | "source": [
1632 | "test_labels[:10]"
1633 | ]
1634 | },
1635 | {
1636 | "cell_type": "code",
1637 | "execution_count": 57,
1638 | "metadata": {},
1639 | "outputs": [
1640 | {
1641 | "data": {
1642 | "text/plain": [
1643 | "4.53113932085526"
1644 | ]
1645 | },
1646 | "execution_count": 57,
1647 | "metadata": {},
1648 | "output_type": "execute_result"
1649 | }
1650 | ],
1651 | "source": [
1652 | "forest_mse = mean_squared_error(test_labels, y_test)\n",
1653 | "forest_rmse = np.sqrt(forest_mse)\n",
1654 | "forest_rmse"
1655 | ]
1656 | },
1657 | {
1658 | "cell_type": "markdown",
1659 | "metadata": {},
1660 | "source": [
1661 | "- 测试集上误差约为**4.53**"
1662 | ]
1663 | },
1664 | {
1665 | "cell_type": "markdown",
1666 | "metadata": {},
1667 | "source": [
1668 | "### 交叉验证"
1669 | ]
1670 | },
1671 | {
1672 | "cell_type": "code",
1673 | "execution_count": 58,
1674 | "metadata": {},
1675 | "outputs": [],
1676 | "source": [
1677 | "from sklearn.model_selection import cross_val_score\n",
1678 | "\n",
1679 | "scores = cross_val_score(forest_reg, data_analysis_prepared, data_analysis_labels,\n",
1680 | " scoring=\"neg_mean_squared_error\", cv=10)\n",
1681 | "forest_rmse_scores = np.sqrt(-scores)"
1682 | ]
1683 | },
1684 | {
1685 | "cell_type": "code",
1686 | "execution_count": 59,
1687 | "metadata": {},
1688 | "outputs": [
1689 | {
1690 | "name": "stdout",
1691 | "output_type": "stream",
1692 | "text": [
1693 | "Scores: [4.49936847 5.16831172 3.87212271 5.10449212 4.76825042 4.95531615\n",
1694 | " 4.47363957 5.06157526 4.29351527 4.56866847]\n",
1695 | "Mean: 4.676526015420327\n",
1696 | "Standard deviation: 0.3920907106938965\n"
1697 | ]
1698 | }
1699 | ],
1700 | "source": [
1701 | "display_scores(forest_rmse_scores)"
1702 | ]
1703 | },
1704 | {
1705 | "cell_type": "markdown",
1706 | "metadata": {},
1707 | "source": [
1708 | "---"
1709 | ]
1710 | },
1711 | {
1712 | "cell_type": "markdown",
1713 | "metadata": {},
1714 | "source": [
1715 | "## KNN回归"
1716 | ]
1717 | },
1718 | {
1719 | "cell_type": "code",
1720 | "execution_count": 60,
1721 | "metadata": {},
1722 | "outputs": [
1723 | {
1724 | "data": {
1725 | "text/plain": [
1726 | "KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
1727 | " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
1728 | " weights='uniform')"
1729 | ]
1730 | },
1731 | "execution_count": 60,
1732 | "metadata": {},
1733 | "output_type": "execute_result"
1734 | }
1735 | ],
1736 | "source": [
1737 | "from sklearn.neighbors import KNeighborsRegressor\n",
1738 | "k = 5\n",
1739 | "knn_reg = KNeighborsRegressor(k)\n",
1740 | "knn_reg.fit(data_analysis_prepared, data_analysis_labels)"
1741 | ]
1742 | },
1743 | {
1744 | "cell_type": "code",
1745 | "execution_count": 61,
1746 | "metadata": {},
1747 | "outputs": [
1748 | {
1749 | "data": {
1750 | "text/plain": [
1751 | "4.181154008963998"
1752 | ]
1753 | },
1754 | "execution_count": 61,
1755 | "metadata": {},
1756 | "output_type": "execute_result"
1757 | }
1758 | ],
1759 | "source": [
1760 | "y_pred_knn = knn_reg.predict(data_analysis_prepared)\n",
1761 | "knn_mse = mean_squared_error(data_analysis_labels, y_pred_knn)\n",
1762 | "knn_rmse = np.sqrt(knn_mse)\n",
1763 | "knn_rmse"
1764 | ]
1765 | },
1766 | {
1767 | "cell_type": "markdown",
1768 | "metadata": {},
1769 | "source": [
1770 | "### 交叉验证"
1771 | ]
1772 | },
1773 | {
1774 | "cell_type": "code",
1775 | "execution_count": 62,
1776 | "metadata": {},
1777 | "outputs": [
1778 | {
1779 | "name": "stdout",
1780 | "output_type": "stream",
1781 | "text": [
1782 | "Scores: [4.91189951 5.22354338 4.69300803 5.17542269 5.59779155 6.05515193\n",
1783 | " 5.07427268 5.46580904 4.81125962 4.8222089 ]\n",
1784 | "Mean: 5.183036732034127\n",
1785 | "Standard deviation: 0.40101789362628404\n"
1786 | ]
1787 | }
1788 | ],
1789 | "source": [
1790 | "from sklearn.model_selection import cross_val_score\n",
1791 | "\n",
1792 | "scores = cross_val_score(knn_reg, data_analysis_prepared, data_analysis_labels,\n",
1793 | " scoring=\"neg_mean_squared_error\", cv=10)\n",
1794 | "knn_rmse_scores = np.sqrt(-scores)\n",
1795 | "\n",
1796 | "\n",
1797 | "display_scores(knn_rmse_scores)"
1798 | ]
1799 | },
1800 | {
1801 | "cell_type": "markdown",
1802 | "metadata": {},
1803 | "source": [
1804 | "### 测试集验证"
1805 | ]
1806 | },
1807 | {
1808 | "cell_type": "code",
1809 | "execution_count": 63,
1810 | "metadata": {},
1811 | "outputs": [
1812 | {
1813 | "name": "stdout",
1814 | "output_type": "stream",
1815 | "text": [
1816 | "[18.7 17. 16.5 16.5 18.3 14.2 16.2 11.3 18.2 20.3]\n",
1817 | "[20.5 13. 11.5 12.5 15. 12.5 22.5 15. 22.5 15.5]\n"
1818 | ]
1819 | }
1820 | ],
1821 | "source": [
1822 | "y_test = knn_reg.predict(data_analysis_test)\n",
1823 | "print(y_test[:10])\n",
1824 | "print(test_labels[:10])"
1825 | ]
1826 | },
1827 | {
1828 | "cell_type": "code",
1829 | "execution_count": 64,
1830 | "metadata": {},
1831 | "outputs": [
1832 | {
1833 | "data": {
1834 | "text/plain": [
1835 | "4.933569509808097"
1836 | ]
1837 | },
1838 | "execution_count": 64,
1839 | "metadata": {},
1840 | "output_type": "execute_result"
1841 | }
1842 | ],
1843 | "source": [
1844 | "knn_mse = mean_squared_error(test_labels, y_test)\n",
1845 | "knn_rmse = np.sqrt(knn_mse)\n",
1846 | "knn_rmse"
1847 | ]
1848 | },
1849 | {
1850 | "cell_type": "markdown",
1851 | "metadata": {},
1852 | "source": [
1853 | "---"
1854 | ]
1855 | },
1856 | {
1857 | "cell_type": "markdown",
1858 | "metadata": {},
1859 | "source": [
1860 | "## Adaboost回归"
1861 | ]
1862 | },
1863 | {
1864 | "cell_type": "code",
1865 | "execution_count": 65,
1866 | "metadata": {},
1867 | "outputs": [
1868 | {
1869 | "data": {
1870 | "text/plain": [
1871 | "AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',\n",
1872 | " n_estimators=50, random_state=None)"
1873 | ]
1874 | },
1875 | "execution_count": 65,
1876 | "metadata": {},
1877 | "output_type": "execute_result"
1878 | }
1879 | ],
1880 | "source": [
1881 | "from sklearn.ensemble import AdaBoostRegressor\n",
1882 | "Adaboost_reg = AdaBoostRegressor(n_estimators=50)#这里使用50个决策树\n",
1883 | "Adaboost_reg.fit(data_analysis_prepared, data_analysis_labels)"
1884 | ]
1885 | },
1886 | {
1887 | "cell_type": "markdown",
1888 | "metadata": {},
1889 | "source": [
1890 | "### 交叉验证"
1891 | ]
1892 | },
1893 | {
1894 | "cell_type": "code",
1895 | "execution_count": 66,
1896 | "metadata": {},
1897 | "outputs": [],
1898 | "source": [
1899 | "from sklearn.model_selection import cross_val_score\n",
1900 | "\n",
1901 | "scores = cross_val_score(Adaboost_reg, data_analysis_prepared, data_analysis_labels,\n",
1902 | " scoring=\"neg_mean_squared_error\", cv=10)\n",
1903 | "Adaboost_rmse_scores = np.sqrt(-scores)"
1904 | ]
1905 | },
1906 | {
1907 | "cell_type": "code",
1908 | "execution_count": 67,
1909 | "metadata": {},
1910 | "outputs": [
1911 | {
1912 | "name": "stdout",
1913 | "output_type": "stream",
1914 | "text": [
1915 | "Scores: [4.53692111 4.64122254 4.21608294 4.86740635 5.06359292 4.66208335\n",
1916 | " 4.47708546 4.9059454 4.66957261 4.18020659]\n",
1917 | "Mean: 4.622011927255484\n",
1918 | "Standard deviation: 0.2698669328833233\n"
1919 | ]
1920 | }
1921 | ],
1922 | "source": [
1923 | "display_scores(Adaboost_rmse_scores)"
1924 | ]
1925 | },
1926 | {
1927 | "cell_type": "markdown",
1928 | "metadata": {},
1929 | "source": [
1930 | "### 测试集验证"
1931 | ]
1932 | },
1933 | {
1934 | "cell_type": "code",
1935 | "execution_count": 68,
1936 | "metadata": {},
1937 | "outputs": [
1938 | {
1939 | "name": "stdout",
1940 | "output_type": "stream",
1941 | "text": [
1942 | "[18.77375566 17.37943925 15.03588517 15.14973262 19.29107981 13.7369338\n",
1943 | " 17.37943925 13.7369338 20.0625 18.39631336]\n",
1944 | "[20.5 13. 11.5 12.5 15. 12.5 22.5 15. 22.5 15.5]\n"
1945 | ]
1946 | }
1947 | ],
1948 | "source": [
1949 | "y_test = Adaboost_reg.predict(data_analysis_test)\n",
1950 | "print(y_test[:10])\n",
1951 | "print(test_labels[:10])"
1952 | ]
1953 | },
1954 | {
1955 | "cell_type": "code",
1956 | "execution_count": 69,
1957 | "metadata": {},
1958 | "outputs": [
1959 | {
1960 | "data": {
1961 | "text/plain": [
1962 | "4.503983133229124"
1963 | ]
1964 | },
1965 | "execution_count": 69,
1966 | "metadata": {},
1967 | "output_type": "execute_result"
1968 | }
1969 | ],
1970 | "source": [
1971 | "Adaboost_mse = mean_squared_error(test_labels, y_test)\n",
1972 | "Adaboost_rmse = np.sqrt(Adaboost_mse)\n",
1973 | "Adaboost_rmse"
1974 | ]
1975 | },
1976 | {
1977 | "cell_type": "markdown",
1978 | "metadata": {},
1979 | "source": [
1980 | "---"
1981 | ]
1982 | },
1983 | {
1984 | "cell_type": "markdown",
1985 | "metadata": {},
1986 | "source": [
1987 | "## GBRT回归"
1988 | ]
1989 | },
1990 | {
1991 | "cell_type": "code",
1992 | "execution_count": 70,
1993 | "metadata": {
1994 | "scrolled": true
1995 | },
1996 | "outputs": [
1997 | {
1998 | "data": {
1999 | "text/plain": [
2000 | "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
2001 | " learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n",
2002 | " max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
2003 | " min_impurity_split=None, min_samples_leaf=1,\n",
2004 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
2005 | " n_estimators=100, n_iter_no_change=None, presort='auto',\n",
2006 | " random_state=None, subsample=1.0, tol=0.0001,\n",
2007 | " validation_fraction=0.1, verbose=0, warm_start=False)"
2008 | ]
2009 | },
2010 | "execution_count": 70,
2011 | "metadata": {},
2012 | "output_type": "execute_result"
2013 | }
2014 | ],
2015 | "source": [
2016 | "from sklearn.ensemble import GradientBoostingRegressor\n",
2017 | "grbt_reg = GradientBoostingRegressor(n_estimators=100)#这里使用100个决策树\n",
2018 | "grbt_reg.fit(data_analysis_prepared, data_analysis_labels)"
2019 | ]
2020 | },
2021 | {
2022 | "cell_type": "markdown",
2023 | "metadata": {},
2024 | "source": [
2025 | "### 交叉验证"
2026 | ]
2027 | },
2028 | {
2029 | "cell_type": "code",
2030 | "execution_count": 71,
2031 | "metadata": {},
2032 | "outputs": [
2033 | {
2034 | "name": "stdout",
2035 | "output_type": "stream",
2036 | "text": [
2037 | "Scores: [4.87398002 4.73250604 3.68256834 4.29583401 4.61267134 4.84613495\n",
2038 | " 4.14931613 4.34059763 4.23123641 4.40989281]\n",
2039 | "Mean: 4.417473768953092\n",
2040 | "Standard deviation: 0.34596795982823186\n"
2041 | ]
2042 | }
2043 | ],
2044 | "source": [
2045 | "from sklearn.model_selection import cross_val_score\n",
2046 | "\n",
2047 | "scores = cross_val_score(grbt_reg, data_analysis_prepared, data_analysis_labels,\n",
2048 | " scoring=\"neg_mean_squared_error\", cv=10)\n",
2049 | "grbt_rmse_scores = np.sqrt(-scores)\n",
2050 | "\n",
2051 | "display_scores(grbt_rmse_scores)"
2052 | ]
2053 | },
2054 | {
2055 | "cell_type": "markdown",
2056 | "metadata": {},
2057 | "source": [
2058 | "### 测试集验证"
2059 | ]
2060 | },
2061 | {
2062 | "cell_type": "code",
2063 | "execution_count": 72,
2064 | "metadata": {},
2065 | "outputs": [
2066 | {
2067 | "name": "stdout",
2068 | "output_type": "stream",
2069 | "text": [
2070 | "[18.43702936 17.57478885 14.06200524 14.74538168 21.77043023 14.45351525\n",
2071 | " 18.47068341 14.04148627 19.616117 20.43899639]\n",
2072 | "[20.5 13. 11.5 12.5 15. 12.5 22.5 15. 22.5 15.5]\n"
2073 | ]
2074 | }
2075 | ],
2076 | "source": [
2077 | "y_test = grbt_reg.predict(data_analysis_test)\n",
2078 | "print(y_test[:10])\n",
2079 | "print(test_labels[:10])"
2080 | ]
2081 | },
2082 | {
2083 | "cell_type": "code",
2084 | "execution_count": 73,
2085 | "metadata": {},
2086 | "outputs": [
2087 | {
2088 | "data": {
2089 | "text/plain": [
2090 | "4.290941809182011"
2091 | ]
2092 | },
2093 | "execution_count": 73,
2094 | "metadata": {},
2095 | "output_type": "execute_result"
2096 | }
2097 | ],
2098 | "source": [
2099 | "grbt_mse = mean_squared_error(test_labels, y_test)\n",
2100 | "grbt_rmse = np.sqrt(grbt_mse)\n",
2101 | "grbt_rmse"
2102 | ]
2103 | },
2104 | {
2105 | "cell_type": "markdown",
2106 | "metadata": {},
2107 | "source": [
2108 | "---"
2109 | ]
2110 | },
2111 | {
2112 | "cell_type": "markdown",
2113 | "metadata": {},
2114 | "source": [
2115 | "## Bagging回归"
2116 | ]
2117 | },
2118 | {
2119 | "cell_type": "code",
2120 | "execution_count": 74,
2121 | "metadata": {
2122 | "scrolled": true
2123 | },
2124 | "outputs": [
2125 | {
2126 | "data": {
2127 | "text/plain": [
2128 | "BaggingRegressor(base_estimator=None, bootstrap=True,\n",
2129 | " bootstrap_features=False, max_features=1.0, max_samples=1.0,\n",
2130 | " n_estimators=10, n_jobs=None, oob_score=False, random_state=None,\n",
2131 | " verbose=0, warm_start=False)"
2132 | ]
2133 | },
2134 | "execution_count": 74,
2135 | "metadata": {},
2136 | "output_type": "execute_result"
2137 | }
2138 | ],
2139 | "source": [
2140 | "from sklearn.ensemble import BaggingRegressor\n",
2141 | "bagging_reg = BaggingRegressor()\n",
2142 | "bagging_reg.fit(data_analysis_prepared, data_analysis_labels)"
2143 | ]
2144 | },
2145 | {
2146 | "cell_type": "markdown",
2147 | "metadata": {},
2148 | "source": [
2149 | "### 交叉验证"
2150 | ]
2151 | },
2152 | {
2153 | "cell_type": "code",
2154 | "execution_count": 75,
2155 | "metadata": {},
2156 | "outputs": [
2157 | {
2158 | "name": "stdout",
2159 | "output_type": "stream",
2160 | "text": [
2161 | "Scores: [4.46370865 5.11419751 4.10735786 5.0408061 5.26292997 4.96593935\n",
2162 | " 4.71601577 4.93185359 4.59819981 4.63789738]\n",
2163 | "Mean: 4.783890598234823\n",
2164 | "Standard deviation: 0.32866956869503\n"
2165 | ]
2166 | }
2167 | ],
2168 | "source": [
2169 | "from sklearn.model_selection import cross_val_score\n",
2170 | "\n",
2171 | "scores = cross_val_score(bagging_reg, data_analysis_prepared, data_analysis_labels,\n",
2172 | " scoring=\"neg_mean_squared_error\", cv=10)\n",
2173 | "bagging_rmse_scores = np.sqrt(-scores)\n",
2174 | "\n",
2175 | "display_scores(bagging_rmse_scores)"
2176 | ]
2177 | },
2178 | {
2179 | "cell_type": "markdown",
2180 | "metadata": {},
2181 | "source": [
2182 | "### 测试集验证"
2183 | ]
2184 | },
2185 | {
2186 | "cell_type": "code",
2187 | "execution_count": 76,
2188 | "metadata": {},
2189 | "outputs": [
2190 | {
2191 | "name": "stdout",
2192 | "output_type": "stream",
2193 | "text": [
2194 | "[17.53333333 16.95503081 15.19285714 11.65 19.43928571 14.36\n",
2195 | " 18.40166667 11.55 17.67583333 19.52738095]\n",
2196 | "[20.5 13. 11.5 12.5 15. 12.5 22.5 15. 22.5 15.5]\n"
2197 | ]
2198 | }
2199 | ],
2200 | "source": [
2201 | "y_test = bagging_reg.predict(data_analysis_test)\n",
2202 | "print(y_test[:10])\n",
2203 | "print(test_labels[:10])"
2204 | ]
2205 | },
2206 | {
2207 | "cell_type": "code",
2208 | "execution_count": 77,
2209 | "metadata": {},
2210 | "outputs": [
2211 | {
2212 | "data": {
2213 | "text/plain": [
2214 | "4.851544621491279"
2215 | ]
2216 | },
2217 | "execution_count": 77,
2218 | "metadata": {},
2219 | "output_type": "execute_result"
2220 | }
2221 | ],
2222 | "source": [
2223 | "bagging_mse = mean_squared_error(test_labels, y_test)\n",
2224 | "bagging_rmse = np.sqrt(bagging_mse)\n",
2225 | "bagging_rmse"
2226 | ]
2227 | },
2228 | {
2229 | "cell_type": "markdown",
2230 | "metadata": {},
2231 | "source": [
2232 | "---"
2233 | ]
2234 | },
2235 | {
2236 | "cell_type": "markdown",
2237 | "metadata": {},
2238 | "source": [
2239 | "# 模型拟合效果评价"
2240 | ]
2241 | },
2242 | {
2243 | "cell_type": "code",
2244 | "execution_count": 78,
2245 | "metadata": {},
2246 | "outputs": [
2247 | {
2248 | "name": "stdout",
2249 | "output_type": "stream",
2250 | "text": [
2251 | "linear 在测试集上的误差表现为: 4.252707451377156\n",
2252 | "tree 在测试集上的误差表现为: 5.585045537872495\n",
2253 | "forest 在测试集上的误差表现为: 4.53113932085526\n",
2254 | "knn 在测试集上的误差表现为: 4.933569509808097\n",
2255 | "Adaboost 在测试集上的误差表现为: 4.503983133229124\n",
2256 | "grbt 在测试集上的误差表现为: 4.290941809182011\n",
2257 | "bagging 在测试集上的误差表现为: 4.851544621491279\n"
2258 | ]
2259 | }
2260 | ],
2261 | "source": [
2262 | "model_list = [lin_rmse,tree_rmse,forest_rmse,knn_rmse,Adaboost_rmse,grbt_rmse,bagging_rmse]\n",
2263 | "model_name = ['linear','tree','forest','knn','Adaboost','grbt','bagging']\n",
2264 | "i = 0\n",
2265 | "for model in model_list:\n",
2266 | " print(model_name[i],'在测试集上的误差表现为:',model)\n",
2267 | " i+=1"
2268 | ]
2269 | },
2270 | {
2271 | "cell_type": "markdown",
2272 | "metadata": {},
2273 | "source": [
2274 | "---"
2275 | ]
2276 | },
2277 | {
2278 | "cell_type": "markdown",
2279 | "metadata": {},
2280 | "source": [
2281 | "# 网格搜索调参"
2282 | ]
2283 | },
2284 | {
2285 | "cell_type": "markdown",
2286 | "metadata": {},
2287 | "source": [
2288 | "### 对随机森林进行参数探索调整"
2289 | ]
2290 | },
2291 | {
2292 | "cell_type": "code",
2293 | "execution_count": 79,
2294 | "metadata": {},
2295 | "outputs": [
2296 | {
2297 | "name": "stderr",
2298 | "output_type": "stream",
2299 | "text": [
2300 | "C:\\Users\\13626\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n",
2301 | " DeprecationWarning)\n"
2302 | ]
2303 | },
2304 | {
2305 | "data": {
2306 | "text/plain": [
2307 | "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
2308 | " estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
2309 | " max_features='auto', max_leaf_nodes=None,\n",
2310 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
2311 | " min_samples_leaf=1, min_samples_split=2,\n",
2312 | " min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n",
2313 | " oob_score=False, random_state=42, verbose=0, warm_start=False),\n",
2314 | " fit_params=None, iid='warn', n_jobs=None,\n",
2315 | " param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],\n",
2316 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
2317 | " scoring='neg_mean_squared_error', verbose=0)"
2318 | ]
2319 | },
2320 | "execution_count": 79,
2321 | "metadata": {},
2322 | "output_type": "execute_result"
2323 | }
2324 | ],
2325 | "source": [
2326 | "from sklearn.model_selection import GridSearchCV\n",
2327 | "\n",
2328 | "param_grid = [\n",
2329 | " # try 12 (3×4) combinations of hyperparameters\n",
2330 | " {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
2331 | " # then try 6 (2×3) combinations with bootstrap set as False\n",
2332 | " {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
2333 | " ]\n",
2334 | "\n",
2335 | "forest_reg = RandomForestRegressor(random_state=42)\n",
2336 | "\n",
2337 | "#grbt_reg = GradientBoostingRegressor()\n",
2338 | "\n",
2339 | "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n",
2340 | " scoring='neg_mean_squared_error', return_train_score=True)\n",
2341 | "grid_search.fit(data_analysis_prepared, data_analysis_labels)"
2342 | ]
2343 | },
2344 | {
2345 | "cell_type": "code",
2346 | "execution_count": 80,
2347 | "metadata": {},
2348 | "outputs": [
2349 | {
2350 | "name": "stdout",
2351 | "output_type": "stream",
2352 | "text": [
2353 | "5.230290790238236 {'max_features': 2, 'n_estimators': 3}\n",
2354 | "4.828374365895161 {'max_features': 2, 'n_estimators': 10}\n",
2355 | "4.755202551631041 {'max_features': 2, 'n_estimators': 30}\n",
2356 | "5.064133429590988 {'max_features': 4, 'n_estimators': 3}\n",
2357 | "4.770620452203674 {'max_features': 4, 'n_estimators': 10}\n",
2358 | "4.698809097897846 {'max_features': 4, 'n_estimators': 30}\n",
2359 | "5.09938179972705 {'max_features': 6, 'n_estimators': 3}\n",
2360 | "4.7206651591273845 {'max_features': 6, 'n_estimators': 10}\n",
2361 | "4.687999289996666 {'max_features': 6, 'n_estimators': 30}\n",
2362 | "5.072523426909564 {'max_features': 8, 'n_estimators': 3}\n",
2363 | "4.75724488427645 {'max_features': 8, 'n_estimators': 10}\n",
2364 | "4.708816709801628 {'max_features': 8, 'n_estimators': 30}\n",
2365 | "5.388487805697899 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}\n",
2366 | "5.0528879660893775 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}\n",
2367 | "5.316193794492318 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}\n",
2368 | "5.022162677508516 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}\n",
2369 | "5.281169036748664 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}\n",
2370 | "5.056574946174836 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}\n"
2371 | ]
2372 | }
2373 | ],
2374 | "source": [
2375 | "cvres = grid_search.cv_results_\n",
2376 | "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
2377 | " print(np.sqrt(-mean_score), params)"
2378 | ]
2379 | },
2380 | {
2381 | "cell_type": "code",
2382 | "execution_count": 81,
2383 | "metadata": {},
2384 | "outputs": [
2385 | {
2386 | "data": {
2387 | "text/plain": [
2388 | "{'max_features': 6, 'n_estimators': 30}"
2389 | ]
2390 | },
2391 | "execution_count": 81,
2392 | "metadata": {},
2393 | "output_type": "execute_result"
2394 | }
2395 | ],
2396 | "source": [
2397 | "grid_search.best_params_"
2398 | ]
2399 | },
2400 | {
2401 | "cell_type": "markdown",
2402 | "metadata": {},
2403 | "source": [
2404 | "## 变量重要性"
2405 | ]
2406 | },
2407 | {
2408 | "cell_type": "code",
2409 | "execution_count": 82,
2410 | "metadata": {},
2411 | "outputs": [],
2412 | "source": [
2413 | "feature_importances = grid_search.best_estimator_.feature_importances_\n",
2414 | "#feature_importances"
2415 | ]
2416 | },
2417 | {
2418 | "cell_type": "code",
2419 | "execution_count": 83,
2420 | "metadata": {},
2421 | "outputs": [],
2422 | "source": [
2423 | "num_attribs = list(data_analysis_num)\n",
2424 | "cat_attribs = ['公司规模','学历要求','工作经验']"
2425 | ]
2426 | },
2427 | {
2428 | "cell_type": "code",
2429 | "execution_count": 84,
2430 | "metadata": {},
2431 | "outputs": [
2432 | {
2433 | "data": {
2434 | "text/plain": [
2435 | "[(0.06256050964151738, 'Sql'),\n",
2436 | " (0.05996157379002209, 'Hive'),\n",
2437 | " (0.05833431092085402, 'Python'),\n",
2438 | " (0.05536655983841162, 'Sas'),\n",
2439 | " (0.039090728357453634, 'Excel'),\n",
2440 | " (0.0372204358630469, 'Spss'),\n",
2441 | " (0.03173383704493009, 'Spark'),\n",
2442 | " (0.029334029896999316, '学历要求'),\n",
2443 | " (0.027283707992806678, '工作经验'),\n",
2444 | " (0.026729166721593346, 'Tableau'),\n",
2445 | " (0.025363715544763292, 'Hadoop'),\n",
2446 | " (0.018739070532311868, 'Ppt'),\n",
2447 | " (0.016549484575144015, '公司规模')]"
2448 | ]
2449 | },
2450 | "execution_count": 84,
2451 | "metadata": {},
2452 | "output_type": "execute_result"
2453 | }
2454 | ],
2455 | "source": [
2456 | "# 变量重要性排序\n",
2457 | "attributes = num_attribs + cat_attribs\n",
2458 | "sorted(zip(feature_importances, attributes), reverse=True)"
2459 | ]
2460 | },
2461 | {
2462 | "cell_type": "markdown",
2463 | "metadata": {},
2464 | "source": [
2465 | "- **公司规模**对薪资的影响相比之下比较小。"
2466 | ]
2467 | },
2468 | {
2469 | "cell_type": "markdown",
2470 | "metadata": {},
2471 | "source": [
2472 | "---"
2473 | ]
2474 | },
2475 | {
2476 | "cell_type": "markdown",
2477 | "metadata": {},
2478 | "source": [
2479 | "# 最终模型"
2480 | ]
2481 | },
2482 | {
2483 | "cell_type": "code",
2484 | "execution_count": 85,
2485 | "metadata": {},
2486 | "outputs": [
2487 | {
2488 | "data": {
2489 | "text/plain": [
2490 | "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
2491 | " max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
2492 | " min_impurity_split=None, min_samples_leaf=1,\n",
2493 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
2494 | " n_estimators=30, n_jobs=None, oob_score=False, random_state=42,\n",
2495 | " verbose=0, warm_start=False)"
2496 | ]
2497 | },
2498 | "execution_count": 85,
2499 | "metadata": {},
2500 | "output_type": "execute_result"
2501 | }
2502 | ],
2503 | "source": [
2504 | "final_model = grid_search.best_estimator_\n",
2505 | "final_model"
2506 | ]
2507 | },
2508 | {
2509 | "cell_type": "code",
2510 | "execution_count": 86,
2511 | "metadata": {},
2512 | "outputs": [],
2513 | "source": [
2514 | "scores = cross_val_score(final_model, data_analysis_prepared, data_analysis_labels,\n",
2515 | " scoring=\"neg_mean_squared_error\", cv=10)\n",
2516 | "final_model_rmse_scores = np.sqrt(-scores)"
2517 | ]
2518 | },
2519 | {
2520 | "cell_type": "code",
2521 | "execution_count": 87,
2522 | "metadata": {},
2523 | "outputs": [
2524 | {
2525 | "name": "stdout",
2526 | "output_type": "stream",
2527 | "text": [
2528 | "Scores: [4.53008814 5.08441071 3.97171206 4.80954518 5.01055618 4.71745052\n",
2529 | " 4.66927057 4.74793007 4.18567529 4.34334473]\n",
2530 | "Mean: 4.606998345081946\n",
2531 | "Standard deviation: 0.3353022753836264\n"
2532 | ]
2533 | }
2534 | ],
2535 | "source": [
2536 | "display_scores(final_model_rmse_scores)"
2537 | ]
2538 | },
2539 | {
2540 | "cell_type": "markdown",
2541 | "metadata": {},
2542 | "source": [
2543 |     "- 交叉验证误差为**4.61**"
2544 | ]
2545 | },
2546 | {
2547 | "cell_type": "markdown",
2548 | "metadata": {},
2549 | "source": [
2550 | "---"
2551 | ]
2552 | },
2553 | {
2554 | "cell_type": "markdown",
2555 | "metadata": {},
2556 | "source": [
2557 | "# 薪资预测"
2558 | ]
2559 | },
2560 | {
2561 | "cell_type": "code",
2562 | "execution_count": 88,
2563 | "metadata": {},
2564 | "outputs": [],
2565 | "source": [
2566 | "final_predictions = final_model.predict(data_analysis_test)"
2567 | ]
2568 | },
2569 | {
2570 | "cell_type": "code",
2571 | "execution_count": 89,
2572 | "metadata": {},
2573 | "outputs": [],
2574 | "source": [
2575 | "salary_test_series = Series(final_predictions,index=data_test.index)"
2576 | ]
2577 | },
2578 | {
2579 | "cell_type": "code",
2580 | "execution_count": 90,
2581 | "metadata": {},
2582 | "outputs": [],
2583 | "source": [
2584 | "data_test_prediction = data_test.copy()\n",
2585 | "data_test_prediction.insert(7,'prediction',salary_test_series)"
2586 | ]
2587 | },
2588 | {
2589 | "cell_type": "code",
2590 | "execution_count": 91,
2591 | "metadata": {
2592 | "scrolled": false
2593 | },
2594 | "outputs": [
2595 | {
2596 | "data": {
2597 | "text/html": [
2598 | "\n",
2599 | "\n",
2612 | "
\n",
2613 | " \n",
2614 | " \n",
2615 | " | \n",
2616 | " 公司名称 | \n",
2617 | " 公司规模 | \n",
2618 | " 地区 | \n",
2619 | " 学历要求 | \n",
2620 | " 工作经验 | \n",
2621 | " 职位名称 | \n",
2622 | " salary | \n",
2623 | " prediction | \n",
2624 | " 融资情况 | \n",
2625 | " Sql | \n",
2626 | " Python | \n",
2627 | " Excel | \n",
2628 | " Sas | \n",
2629 | " Spss | \n",
2630 | " Hive | \n",
2631 | " Hadoop | \n",
2632 | " Ppt | \n",
2633 | " Tableau | \n",
2634 | " Spark | \n",
2635 | "
\n",
2636 | " \n",
2637 | " \n",
2638 | " \n",
2639 | " 602 | \n",
2640 | " 旗计智能 | \n",
2641 | " 1000-9999人 | \n",
2642 | " 上海 浦东新区 张江 | \n",
2643 | " 本科 | \n",
2644 | " 3-5年 | \n",
2645 | " 数据分析主管/经理(风险方向) | \n",
2646 | " 25.0 | \n",
2647 | " 17.123016 | \n",
2648 | " 已上市 | \n",
2649 | " 1 | \n",
2650 | " 1 | \n",
2651 | " 1 | \n",
2652 | " 1 | \n",
2653 | " 0 | \n",
2654 | " 0 | \n",
2655 | " 0 | \n",
2656 | " 0 | \n",
2657 | " 0 | \n",
2658 | " 0 | \n",
2659 | "
\n",
2660 | " \n",
2661 | " 260 | \n",
2662 | " 速网电商 | \n",
2663 | " 100-499人 | \n",
2664 | " 上海 闵行区 莘庄 | \n",
2665 | " 本科 | \n",
2666 | " 1-3年 | \n",
2667 | " 数据分析 | \n",
2668 | " 10.0 | \n",
2669 | " 11.195915 | \n",
2670 | " 不需要融资 | \n",
2671 | " 0 | \n",
2672 | " 0 | \n",
2673 | " 0 | \n",
2674 | " 0 | \n",
2675 | " 0 | \n",
2676 | " 0 | \n",
2677 | " 0 | \n",
2678 | " 0 | \n",
2679 | " 0 | \n",
2680 | " 0 | \n",
2681 | "
\n",
2682 | " \n",
2683 | " 512 | \n",
2684 | " Oriente | \n",
2685 | " 500-999人 | \n",
2686 | " 上海 黄浦区 人民广场 | \n",
2687 | " 本科 | \n",
2688 | " 3-5年 | \n",
2689 | " 数据分析师 | \n",
2690 | " 27.5 | \n",
2691 | " 19.633333 | \n",
2692 | " B轮 | \n",
2693 | " 1 | \n",
2694 | " 1 | \n",
2695 | " 0 | \n",
2696 | " 1 | \n",
2697 | " 0 | \n",
2698 | " 0 | \n",
2699 | " 1 | \n",
2700 | " 0 | \n",
2701 | " 0 | \n",
2702 | " 0 | \n",
2703 | "
\n",
2704 | " \n",
2705 | " 1027 | \n",
2706 | " 儒傲会软件定制 | \n",
2707 | " 20-99人 | \n",
2708 | " 上海 嘉定区 江桥 | \n",
2709 | " 本科 | \n",
2710 | " 5-10年 | \n",
2711 | " 系统数据分析师招募要求(儒傲会) | \n",
2712 | " 17.5 | \n",
2713 | " 19.751667 | \n",
2714 | " 天使轮 | \n",
2715 | " 1 | \n",
2716 | " 0 | \n",
2717 | " 0 | \n",
2718 | " 0 | \n",
2719 | " 0 | \n",
2720 | " 0 | \n",
2721 | " 0 | \n",
2722 | " 0 | \n",
2723 | " 0 | \n",
2724 | " 0 | \n",
2725 | "
\n",
2726 | " \n",
2727 | " 787 | \n",
2728 | " 华数康 | \n",
2729 | " 100-499人 | \n",
2730 | " 上海 长宁区 古北 | \n",
2731 | " 学历不限 | \n",
2732 | " 经验不限 | \n",
2733 | " 数据分析师 | \n",
2734 | " 22.5 | \n",
2735 | " 11.050000 | \n",
2736 | " B轮 | \n",
2737 | " 0 | \n",
2738 | " 0 | \n",
2739 | " 0 | \n",
2740 | " 1 | \n",
2741 | " 0 | \n",
2742 | " 0 | \n",
2743 | " 0 | \n",
2744 | " 0 | \n",
2745 | " 0 | \n",
2746 | " 0 | \n",
2747 | "
\n",
2748 | " \n",
2749 | " 85 | \n",
2750 | " 上海深界信息科技 | \n",
2751 | " 0-20人 | \n",
2752 | " 上海 徐汇区 漕河泾 | \n",
2753 | " 大专 | \n",
2754 | " 1-3年 | \n",
2755 | " 数据分析师 | \n",
2756 | " 12.5 | \n",
2757 | " 13.820000 | \n",
2758 | " 不需要融资 | \n",
2759 | " 0 | \n",
2760 | " 1 | \n",
2761 | " 0 | \n",
2762 | " 1 | \n",
2763 | " 0 | \n",
2764 | " 1 | \n",
2765 | " 0 | \n",
2766 | " 0 | \n",
2767 | " 0 | \n",
2768 | " 0 | \n",
2769 | "
\n",
2770 | " \n",
2771 | " 368 | \n",
2772 | " 汉云信息 | \n",
2773 | " 100-499人 | \n",
2774 | " 上海 长宁区 天山路 | \n",
2775 | " 本科 | \n",
2776 | " 1-3年 | \n",
2777 | " 互联网数据分析师 | \n",
2778 | " 9.0 | \n",
2779 | " 10.550000 | \n",
2780 | " 不需要融资 | \n",
2781 | " 1 | \n",
2782 | " 0 | \n",
2783 | " 1 | \n",
2784 | " 0 | \n",
2785 | " 0 | \n",
2786 | " 0 | \n",
2787 | " 0 | \n",
2788 | " 0 | \n",
2789 | " 0 | \n",
2790 | " 0 | \n",
2791 | "
\n",
2792 | " \n",
2793 | " 907 | \n",
2794 | " 饿了么 | \n",
2795 | " 1000-9999人 | \n",
2796 | " 上海 普陀区 金沙江路 | \n",
2797 | " 本科 | \n",
2798 | " 3-5年 | \n",
2799 | " 运力规划/数据分析 | \n",
2800 | " 22.5 | \n",
2801 | " 17.009510 | \n",
2802 | " D轮及以上 | \n",
2803 | " 0 | \n",
2804 | " 0 | \n",
2805 | " 0 | \n",
2806 | " 0 | \n",
2807 | " 0 | \n",
2808 | " 0 | \n",
2809 | " 0 | \n",
2810 | " 0 | \n",
2811 | " 0 | \n",
2812 | " 0 | \n",
2813 | "
\n",
2814 | " \n",
2815 | " 1050 | \n",
2816 | " 上海翔鸢信息科技 | \n",
2817 | " 20-99人 | \n",
2818 | " 上海 青浦区 赵巷 | \n",
2819 | " 大专 | \n",
2820 | " 经验不限 | \n",
2821 | " 王者荣耀数据分析师 | \n",
2822 | " 7.0 | \n",
2823 | " 7.725000 | \n",
2824 | " 20-99人 | \n",
2825 | " 0 | \n",
2826 | " 0 | \n",
2827 | " 1 | \n",
2828 | " 0 | \n",
2829 | " 0 | \n",
2830 | " 0 | \n",
2831 | " 0 | \n",
2832 | " 0 | \n",
2833 | " 0 | \n",
2834 | " 0 | \n",
2835 | "
\n",
2836 | " \n",
2837 | " 917 | \n",
2838 | " 观安信息 | \n",
2839 | " 100-499人 | \n",
2840 | " 上海 普陀区 金沙江路 | \n",
2841 | " 本科 | \n",
2842 | " 5-10年 | \n",
2843 | " 大数据安全产品经理 | \n",
2844 | " 25.0 | \n",
2845 | " 25.044444 | \n",
2846 | " B轮 | \n",
2847 | " 0 | \n",
2848 | " 0 | \n",
2849 | " 0 | \n",
2850 | " 0 | \n",
2851 | " 0 | \n",
2852 | " 0 | \n",
2853 | " 0 | \n",
2854 | " 0 | \n",
2855 | " 0 | \n",
2856 | " 0 | \n",
2857 | "
\n",
2858 | " \n",
2859 | "
\n",
2860 | "
"
2861 | ],
2862 | "text/plain": [
2863 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 \\\n",
2864 | "602 旗计智能 1000-9999人 上海 浦东新区 张江 本科 3-5年 数据分析主管/经理(风险方向) \n",
2865 | "260 速网电商 100-499人 上海 闵行区 莘庄 本科 1-3年 数据分析 \n",
2866 | "512 Oriente 500-999人 上海 黄浦区 人民广场 本科 3-5年 数据分析师 \n",
2867 | "1027 儒傲会软件定制 20-99人 上海 嘉定区 江桥 本科 5-10年 系统数据分析师招募要求(儒傲会) \n",
2868 | "787 华数康 100-499人 上海 长宁区 古北 学历不限 经验不限 数据分析师 \n",
2869 | "85 上海深界信息科技 0-20人 上海 徐汇区 漕河泾 大专 1-3年 数据分析师 \n",
2870 | "368 汉云信息 100-499人 上海 长宁区 天山路 本科 1-3年 互联网数据分析师 \n",
2871 | "907 饿了么 1000-9999人 上海 普陀区 金沙江路 本科 3-5年 运力规划/数据分析 \n",
2872 | "1050 上海翔鸢信息科技 20-99人 上海 青浦区 赵巷 大专 经验不限 王者荣耀数据分析师 \n",
2873 | "917 观安信息 100-499人 上海 普陀区 金沙江路 本科 5-10年 大数据安全产品经理 \n",
2874 | "\n",
2875 | " salary prediction 融资情况 Sql Python Excel Sas Spss Hive Hadoop \\\n",
2876 | "602 25.0 17.123016 已上市 1 1 1 1 0 0 0 \n",
2877 | "260 10.0 11.195915 不需要融资 0 0 0 0 0 0 0 \n",
2878 | "512 27.5 19.633333 B轮 1 1 0 1 0 0 1 \n",
2879 | "1027 17.5 19.751667 天使轮 1 0 0 0 0 0 0 \n",
2880 | "787 22.5 11.050000 B轮 0 0 0 1 0 0 0 \n",
2881 | "85 12.5 13.820000 不需要融资 0 1 0 1 0 1 0 \n",
2882 | "368 9.0 10.550000 不需要融资 1 0 1 0 0 0 0 \n",
2883 | "907 22.5 17.009510 D轮及以上 0 0 0 0 0 0 0 \n",
2884 | "1050 7.0 7.725000 20-99人 0 0 1 0 0 0 0 \n",
2885 | "917 25.0 25.044444 B轮 0 0 0 0 0 0 0 \n",
2886 | "\n",
2887 | " Ppt Tableau Spark \n",
2888 | "602 0 0 0 \n",
2889 | "260 0 0 0 \n",
2890 | "512 0 0 0 \n",
2891 | "1027 0 0 0 \n",
2892 | "787 0 0 0 \n",
2893 | "85 0 0 0 \n",
2894 | "368 0 0 0 \n",
2895 | "907 0 0 0 \n",
2896 | "1050 0 0 0 \n",
2897 | "917 0 0 0 "
2898 | ]
2899 | },
2900 | "execution_count": 91,
2901 | "metadata": {},
2902 | "output_type": "execute_result"
2903 | }
2904 | ],
2905 | "source": [
2906 | "data_test_prediction.sample(10)"
2907 | ]
2908 | },
2909 | {
2910 | "cell_type": "markdown",
2911 | "metadata": {},
2912 | "source": [
2913 | "- 预测结果与实际薪资相比误差在**可接受范围内**。"
2914 | ]
2915 | },
2916 | {
2917 | "cell_type": "markdown",
2918 | "metadata": {},
2919 | "source": [
2920 | "# 预测函数接口"
2921 | ]
2922 | },
2923 | {
2924 | "cell_type": "code",
2925 | "execution_count": 92,
2926 | "metadata": {},
2927 | "outputs": [
2928 | {
2929 | "data": {
2930 | "text/html": [
2931 | "\n",
2932 | "\n",
2945 | "
\n",
2946 | " \n",
2947 | " \n",
2948 | " | \n",
2949 | " 公司名称 | \n",
2950 | " 公司规模 | \n",
2951 | " 地区 | \n",
2952 | " 学历要求 | \n",
2953 | " 工作经验 | \n",
2954 | " 职位名称 | \n",
2955 | " salary | \n",
2956 | " 融资情况 | \n",
2957 | " Sql | \n",
2958 | " Python | \n",
2959 | " Excel | \n",
2960 | " Sas | \n",
2961 | " Spss | \n",
2962 | " Hive | \n",
2963 | " Hadoop | \n",
2964 | " Ppt | \n",
2965 | " Tableau | \n",
2966 | " Spark | \n",
2967 | "
\n",
2968 | " \n",
2969 | " \n",
2970 | " \n",
2971 | " 361 | \n",
2972 | " 携程旅行网 | \n",
2973 | " 10000人以上 | \n",
2974 | " 上海 长宁区 北新泾 | \n",
2975 | " 本科 | \n",
2976 | " 3-5年 | \n",
2977 | " 海外运营数据分析师 | \n",
2978 | " 20.5 | \n",
2979 | " 已上市 | \n",
2980 | " 1 | \n",
2981 | " 1 | \n",
2982 | " 0 | \n",
2983 | " 0 | \n",
2984 | " 0 | \n",
2985 | " 0 | \n",
2986 | " 0 | \n",
2987 | " 0 | \n",
2988 | " 0 | \n",
2989 | " 0 | \n",
2990 | "
\n",
2991 | " \n",
2992 | "
\n",
2993 | "
"
2994 | ],
2995 | "text/plain": [
2996 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 salary 融资情况 Sql \\\n",
2997 | "361 携程旅行网 10000人以上 上海 长宁区 北新泾 本科 3-5年 海外运营数据分析师 20.5 已上市 1 \n",
2998 | "\n",
2999 | " Python Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n",
3000 | "361 1 0 0 0 0 0 0 0 0 "
3001 | ]
3002 | },
3003 | "execution_count": 92,
3004 | "metadata": {},
3005 | "output_type": "execute_result"
3006 | }
3007 | ],
3008 | "source": [
3009 | "data_test.head(1)"
3010 | ]
3011 | },
3012 | {
3013 | "cell_type": "code",
3014 | "execution_count": 93,
3015 | "metadata": {},
3016 | "outputs": [],
3017 | "source": [
3018 | "from pandas import DataFrame"
3019 | ]
3020 | },
3021 | {
3022 | "cell_type": "markdown",
3023 | "metadata": {},
3024 | "source": [
3025 | "### 预测功能函数"
3026 | ]
3027 | },
3028 | {
3029 | "cell_type": "code",
3030 | "execution_count": 94,
3031 | "metadata": {},
3032 | "outputs": [],
3033 | "source": [
3034 | "def prediction_function(scale,degree,experience,v_skills):\n",
3035 | " predict_X = {\n",
3036 | " '公司规模':[scale],\n",
3037 | " '学历要求':[degree],\n",
3038 | " '工作经验':[experience],\n",
3039 | " 'Sql':[v_skills[0]],\n",
3040 | " 'Python':[v_skills[1]],\n",
3041 | " 'Excel':[v_skills[2]],\n",
3042 | " 'Sas':[v_skills[3]],\n",
3043 | " 'Spss':[v_skills[4]],\n",
3044 | " 'Hive':[v_skills[5]],\n",
3045 | " 'Hadoop':[v_skills[6]],\n",
3046 | " 'Ppt':[v_skills[7]],\n",
3047 | " 'Tableau':[v_skills[8]],\n",
3048 | " 'Spark':[v_skills[9]],\n",
3049 | " }\n",
3050 | " predict_tmp = pd.DataFrame(predict_X)\n",
3051 | " X_predict = full_pipeline.transform(predict_tmp)\n",
3052 | " return X_predict"
3053 | ]
3054 | },
3055 | {
3056 | "cell_type": "markdown",
3057 | "metadata": {},
3058 | "source": [
3059 | "### 技能转换函数"
3060 | ]
3061 | },
3062 | {
3063 | "cell_type": "code",
3064 | "execution_count": 95,
3065 | "metadata": {},
3066 | "outputs": [],
3067 | "source": [
3068 | "def skills_switch(skill_list):\n",
3069 | " tmp_list = []\n",
3070 | " skills = ['Sql','Python','Excel','Sas','Spss','Hive','Hadoop','Ppt','Tableau','Spark']\n",
3071 | " for skill in skills:\n",
3072 | " # 大小写转换\n",
3073 | " if skill in skill_list:\n",
3074 | " tmp_list.append(1)\n",
3075 | " else:\n",
3076 | " tmp_list.append(0)\n",
3077 | " return tmp_list"
3078 | ]
3079 | },
3080 | {
3081 | "cell_type": "markdown",
3082 | "metadata": {},
3083 | "source": [
3084 | "### 预测主函数"
3085 | ]
3086 | },
3087 | {
3088 | "cell_type": "code",
3089 | "execution_count": 96,
3090 | "metadata": {},
3091 | "outputs": [],
3092 | "source": [
3093 | "def predict(scale,degree,experience,v_skills):\n",
3094 | " X_predict = prediction_function(scale,degree,experience,v_skills)\n",
3095 | " Y_predict = final_model.predict(X_predict)\n",
3096 | " print('预测薪资为:',Y_predict[0],'k/month')"
3097 | ]
3098 | },
3099 | {
3100 | "cell_type": "markdown",
3101 | "metadata": {},
3102 | "source": [
3103 | "---"
3104 | ]
3105 | },
3106 | {
3107 | "cell_type": "markdown",
3108 | "metadata": {},
3109 | "source": [
3110 | "\n",
3111 | "企业规模:['10000人以上' '1000-9999人' '20-99人' '0-20人' '500-999人' '100-499人']\n",
3112 | "学历:['本科' '大专' '学历不限' '硕士']\n",
3113 | "工作经验:['3-5年' '1-3年' '经验不限' '5-10年' '1年以内' '应届生']\n",
3114 | "Skills:[Sql,Python,Excel,Sas,Spss,Hive,Hadoop,Ppt,Tableau,Spark]\n",
3115 | "
"
3116 | ]
3117 | },
3118 | {
3119 | "cell_type": "markdown",
3120 | "metadata": {},
3121 | "source": [
3122 | "### 预测函数"
3123 | ]
3124 | },
3125 | {
3126 | "cell_type": "code",
3127 | "execution_count": 97,
3128 | "metadata": {},
3129 | "outputs": [
3130 | {
3131 | "name": "stdout",
3132 | "output_type": "stream",
3133 | "text": [
3134 | "预测薪资为: 12.14722222222222 k/month\n"
3135 | ]
3136 | }
3137 | ],
3138 | "source": [
3139 | "#-----------设置变量\n",
3140 | "scale = '10000人以上'\n",
3141 | "degree = '本科'\n",
3142 | "experience = '1-3年'\n",
3143 | "# ------------------\n",
3144 | "# --------设置所掌握的技能(顺序无关)\n",
3145 | "mastered_skills = ['Sql','Python','Excel','Spss','Ppt']\n",
3146 | "v_skills = skills_switch(mastered_skills)\n",
3147 | "# -----------------------------------\n",
3148 | "predict(scale,degree,experience,v_skills)"
3149 | ]
3150 | },
3151 | {
3152 | "cell_type": "code",
3153 | "execution_count": 98,
3154 | "metadata": {},
3155 | "outputs": [
3156 | {
3157 | "name": "stdout",
3158 | "output_type": "stream",
3159 | "text": [
3160 | "10000人以上 | 本科 | 应届生 | Sql,Python,Excel,Spss,Ppt\n",
3161 | "预测薪资为: 13.066666666666666 k/month\n",
3162 | "------------------------------------------------------------\n",
3163 | "10000人以上 | 本科 | 1年以内 | Sql,Python,Excel,Spss,Ppt\n",
3164 | "预测薪资为: 13.308333333333334 k/month\n",
3165 | "------------------------------------------------------------\n",
3166 | "10000人以上 | 本科 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n",
3167 | "预测薪资为: 12.14722222222222 k/month\n",
3168 | "------------------------------------------------------------\n",
3169 | "10000人以上 | 本科 | 3-5年 | Sql,Python,Excel,Spss,Ppt\n",
3170 | "预测薪资为: 20.964444444444446 k/month\n",
3171 | "------------------------------------------------------------\n",
3172 | "10000人以上 | 本科 | 5-10年 | Sql,Python,Excel,Spss,Ppt\n",
3173 | "预测薪资为: 23.97222222222222 k/month\n",
3174 | "------------------------------------------------------------\n"
3175 | ]
3176 | }
3177 | ],
3178 | "source": [
3179 | "experiences = ['应届生','1年以内', '1-3年','3-5年', '5-10年' ]\n",
3180 | "\n",
3181 | "for exp in experiences:\n",
3182 | " print(scale,'|',degree,'|',exp,'|',\",\".join(mastered_skills))\n",
3183 | " predict(scale,degree,exp,v_skills)\n",
3184 | " print('-'*60)"
3185 | ]
3186 | },
3187 | {
3188 | "cell_type": "markdown",
3189 | "metadata": {},
3190 | "source": [
3191 | "---"
3192 | ]
3193 | },
3194 | {
3195 | "cell_type": "code",
3196 | "execution_count": 101,
3197 | "metadata": {},
3198 | "outputs": [
3199 | {
3200 | "name": "stdout",
3201 | "output_type": "stream",
3202 | "text": [
3203 | "500-999人 | 学历不限 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n",
3204 | "预测薪资为: 21.625 k/month\n",
3205 | "------------------------------------------------------------\n",
3206 | "500-999人 | 大专 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n",
3207 | "预测薪资为: 19.925 k/month\n",
3208 | "------------------------------------------------------------\n",
3209 | "500-999人 | 本科 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n",
3210 | "预测薪资为: 23.091666666666665 k/month\n",
3211 | "------------------------------------------------------------\n",
3212 | "500-999人 | 硕士 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n",
3213 | "预测薪资为: 20.791666666666668 k/month\n",
3214 | "------------------------------------------------------------\n"
3215 | ]
3216 | }
3217 | ],
3218 | "source": [
3219 | "experience = '1-3年'\n",
3220 | "scale = '500-999人'\n",
3221 | "degrees = ['学历不限','大专','本科','硕士']\n",
3222 | "\n",
3223 | "for degree in degrees:\n",
3224 | " print(scale,'|',degree,'|',experience,'|',\",\".join(mastered_skills))\n",
3225 | " predict(scale,degree,exp,v_skills)\n",
3226 | " print('-'*60)"
3227 | ]
3228 | },
3229 | {
3230 | "cell_type": "code",
3231 | "execution_count": null,
3232 | "metadata": {},
3233 | "outputs": [],
3234 | "source": []
3235 | }
3236 | ],
3237 | "metadata": {
3238 | "kernelspec": {
3239 | "display_name": "Python 3",
3240 | "language": "python",
3241 | "name": "python3"
3242 | },
3243 | "language_info": {
3244 | "codemirror_mode": {
3245 | "name": "ipython",
3246 | "version": 3
3247 | },
3248 | "file_extension": ".py",
3249 | "mimetype": "text/x-python",
3250 | "name": "python",
3251 | "nbconvert_exporter": "python",
3252 | "pygments_lexer": "ipython3",
3253 | "version": "3.6.3"
3254 | }
3255 | },
3256 | "nbformat": 4,
3257 | "nbformat_minor": 2
3258 | }
3259 |
--------------------------------------------------------------------------------
/output_13_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/output_13_0.png
--------------------------------------------------------------------------------
/spider/shanghai_jobs_discovery.py:
--------------------------------------------------------------------------------
1 | # common imports
2 | import requests
3 | from lxml import etree
4 | import time
5 | import random
6 | import pymongo
7 | from retrying import retry
8 |
9 |
10 | # ---------------------
11 |
12 | # 页面获取函数
def get_page(page, keyword):
    """Fetch one Boss Zhipin (Shanghai) search-result page for *keyword*.

    Returns the raw HTML text of the listing page number *page*.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.12 Safari/537.36 '
    }
    print('正在爬取第', page, '页')
    # c101020100 is the city code for Shanghai; `ka` mirrors the page number.
    url = ('https://www.zhipin.com/c101020100/'
           '?query={k}&page={page}&ka=page-{page}').format(page=page, k=keyword)
    return requests.get(url, headers=headers).text
22 |
23 |
24 | # --------------
@retry(wait_fixed=8000)
def job_detail(link):
    """Fetch a job-detail page and return its job-description text.

    Raises IOError when the site serves a CAPTCHA page instead of a posting;
    @retry catches it, waits 8 s, and re-fetches until the real page loads.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.12 Safari/537.36 '
    }
    response = requests.get(link, headers=header)
    data = etree.HTML(response.text)

    # --- CAPTCHA detection: the anti-bot page carries a distinctive <title>.
    tips = data.xpath('/html/head/title/text()')
    tips_title = 'BOSS直聘验证码'
    # Guard `tips` being empty: the original indexed tips[0] unconditionally,
    # which raised IndexError instead of triggering the retry loop when the
    # response had no <title> text.
    if tips and tips[0] == tips_title:
        print('检查是否弹出验证码')
        # Raise IOError so that @retry re-fetches after the fixed wait.
        raise IOError
    # ----------------------
    job_desc = data.xpath('//*[@id="main"]/div[3]/div/div[2]/div[3]/div[@class="job-sec"][1]/div/text()')

    jd = "".join(job_desc).strip()
    return jd
46 |
47 |
def parse_page(html, keyword, page):
    """Parse one search-result page and persist every job posting.

    For each listing: extract the summary fields via XPath, fetch the full
    job description with job_detail(), and store the record via
    save_to_mongo(). Relies on the module-level `host` set in __main__.
    """
    # Derived by inspecting the page DOM; page 1 uses a slightly different
    # layout than subsequent pages.
    data = etree.HTML(html)
    if page == 1:
        items = data.xpath('//*[@id="main"]/div/div[3]/ul/li')
    else:
        items = data.xpath('//*[@id="main"]/div/div[2]/ul/li')
    for item in items:
        district = item.xpath('./div/div[1]/p/text()[1]')[0]
        job_links = item.xpath('./div/div[1]/h3/a/@href')[0]
        job_title = item.xpath('./div/div[1]/h3/a/div[1]/text()')[0]
        job_salary = item.xpath('./div/div[1]/h3/a/span/text()')[0]
        job_company = item.xpath('./div/div[2]/div/h3/a/text()')[0]
        job_experience = item.xpath('./div/div[1]/p/text()[2]')[0]
        job_degree = item.xpath('./div/div[1]/p/text()[3]')[0]
        fin_status = item.xpath('./div/div[2]/div/p/text()[2]')[0]
        # Some listings omit the funding-status field, which shifts the
        # company-scale text from position 3 to position 2. Catch only the
        # IndexError from the [0] lookup (was a bare `except Exception`).
        try:
            company_scale = item.xpath('./div/div[2]/div/p/text()[3]')[0]
        except IndexError:
            company_scale = item.xpath('./div/div[2]/div/p/text()[2]')[0]
        job_link = host + job_links
        # print(job_link)
        # Fetch the full job description from the detail page.
        detail = job_detail(job_link)
        # ---------------
        job = {
            'Keyword': keyword,
            '地区': district,
            '职位名称': job_title,
            '职位薪资': job_salary,
            '公司名称': job_company,
            '工作经验': job_experience,
            '学历要求': job_degree,
            '公司规模': company_scale,
            '融资情况': fin_status,
            '职位描述': detail,
        }
        print(job)
        save_to_mongo(job)
        # Random pause between detail fetches to avoid the anti-bot wall.
        time.sleep(random.randint(6, 9))
88 | # ---------------------------------------
89 |
90 |
# MongoDB connection setup: a local instance on the default-style port;
# records go to the `shanghai_discovery` collection of the
# `Graduation_project` database (used by save_to_mongo below).
MONGO_URL = 'localhost'
MONGO_DB = 'Graduation_project'
MONGO_COLLECTION = 'shanghai_discovery'
client = pymongo.MongoClient(MONGO_URL, port=27017)
db = client[MONGO_DB]
97 |
98 |
def save_to_mongo(data):
    """Insert one job record into the configured MongoDB collection.

    Best-effort: a failed insert is reported but never aborts the crawl.
    """
    try:
        # insert_one() replaces the deprecated Collection.insert()
        # (deprecated in pymongo 3.x, removed in 4.x); its truthy
        # InsertOneResult keeps the success check working unchanged.
        if db[MONGO_COLLECTION].insert_one(data):
            print('存储到 MongoDB 成功')
    except Exception as err:
        # Keep crawling, but surface the reason instead of swallowing it.
        print('存储到 MongoDB 失败', err)
106 |
107 |
if __name__ == '__main__':
    MAX_PAGE = 10
    host = 'https://www.zhipin.com'  # read as a global by parse_page()
    keywords = ['数据分析', '数据挖掘', '商业分析', '机器学习']
    for keyword in keywords:
        for i in range(1, MAX_PAGE + 1):
            html = get_page(i, keyword)
            # ------------ parse the listing page ---------------
            parse_page(html, keyword, i)
            print('-' * 100)
            # -----------------
            # Announce the pause BEFORE sleeping (the original printed the
            # "waiting N seconds" message only after the wait had finished).
            timewait = random.randint(15, 18)
            print('等待', timewait, '秒')
            time.sleep(timewait)
122 |
--------------------------------------------------------------------------------