├── NLP-analysis ├── Data-Science-NLP_11-19.ipynb ├── data │ ├── business_analysis.csv │ ├── business_analysis_with_skills.csv │ ├── data_analysis.csv │ ├── data_analysis_with_skills.csv │ ├── data_mining.csv │ ├── data_mining_with_skills.csv │ ├── machine_learning.csv │ └── machine_learning_with_skills.csv ├── dict │ ├── Stopword.txt │ ├── userdict.txt │ ├── 硬技能.txt │ └── 软技能.txt └── images │ ├── 各区职位数.png │ ├── 数据科学求职技能.png │ ├── 数据科学求职硬技能.png │ └── 数据科学求职软技能.png ├── README.md ├── analysis&forecast └── Data-Science-Analysis.ipynb ├── data_to_be_analysed ├── business_analysis_with_skills.csv ├── data_analysis_with_skills.csv ├── data_mining_with_skills.csv └── machine_learning_with_skills.csv ├── output_13_0.png └── spider └── shanghai_jobs_discovery.py /NLP-analysis/dict/Stopword.txt: -------------------------------------------------------------------------------- 1 |  2 | 线 3 | 分 4 | 数据 5 | 分析 6 | 数学 7 | 型 8 | 模型 9 | 率 10 | 度 11 | 统计 12 | 能 13 | 学习 14 | 用 15 | 人 16 | 性 17 | 结构 18 | 采集 19 | 库 20 | 数 21 | 网 22 | 据 23 | 网络 24 | 画像 25 | 机器 26 | 语 27 | 程 28 | 机 29 | 清 30 | 理学 31 | 式 32 | 建 33 | 代数 34 | 概率 35 | 工程 36 | 集 37 | 字 38 | 概率 39 | 深 40 | 营 41 | 运 42 | 学 43 | 仓 44 | 工 45 | 感 46 | 合 47 | 析 48 | 拟 49 | 爬 50 | 代 51 | 户 52 | 掘 53 | 挖掘 54 | 用户 55 | 建模 56 | 逻辑 57 | 敏感 58 | 思维 59 | 深度 60 | 清洗 61 | 测试 62 | 模式 63 | - 64 | 识别 65 | A 66 | B 67 | D 68 | E 69 | F 70 | G 71 | H 72 | I 73 | J 74 | K 75 | L 76 | M 77 | N 78 | O 79 | P 80 | Q 81 | R 82 | S 83 | T 84 | U 85 | V 86 | W 87 | X 88 | Y 89 | Z 90 | Pos 91 | Sa 92 | 1 93 | 2 94 | 3 95 | 4 96 | 5 97 | 6 98 | 7 99 | 8 100 | 9 101 | 0 102 | / 103 | 。 104 | , 105 | 、 106 | . 107 | , 108 | / 109 | \ 110 | ' 111 | " 112 | ; 113 | ` 114 | ( 115 | ) 116 | ( 117 | ) 118 | ? 119 | ? 
120 | -------------------------------------------------------------------------------- /NLP-analysis/dict/userdict.txt: -------------------------------------------------------------------------------- 1 | 数据结构 2 | 数据库 3 | 数据分析 4 | 数据挖掘 5 | 建模 6 | 商务智能 7 | 商业 8 | 拟合 9 | 开源 10 | 统计 11 | 数学 12 | 决策 13 | 线性代数 14 | 数据报表 15 | 定量分析 16 | 沟通 17 | 业务 18 | 管理学 19 | 英语 20 | 数字 21 | 测试 22 | 机器学习 23 | 人工智能 24 | 深度学习 25 | 爬虫 26 | 概率统计 27 | 决策支持 28 | 概率论 29 | 网络工程 30 | 算法 31 | 经济学 32 | 报表 33 | 报告 34 | 评估 35 | 评价 36 | 咨询 37 | 逻辑思维 38 | 数字敏感 39 | 数据采集 40 | 数据清洗 41 | 模式识别 42 | 用户画像 43 | 用户行为 44 | 数据运营 45 | 数据仓库 46 | 数据模型 -------------------------------------------------------------------------------- /NLP-analysis/dict/硬技能.txt: -------------------------------------------------------------------------------- 1 | Python 2 | C 3 | Java 4 | R 5 | Ppt 6 | Excel 7 | Spss 8 | Word 9 | Powerpoint 10 | Tensorflow 11 | Scikit 12 | Sql 13 | Shell 14 | Hadoop 15 | Stack 16 | Git 17 | Github 18 | Tableau 19 | Linux 20 | Unix 21 | Sas 22 | Matlab 23 | Oracle 24 | Hive 25 | Access 26 | Spark 27 | Powerbi 28 | Mongodb 29 | Redis 30 | PostgreSql -------------------------------------------------------------------------------- /NLP-analysis/dict/软技能.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/dict/软技能.txt -------------------------------------------------------------------------------- /NLP-analysis/images/各区职位数.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/images/各区职位数.png -------------------------------------------------------------------------------- /NLP-analysis/images/数据科学求职技能.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/images/数据科学求职技能.png -------------------------------------------------------------------------------- /NLP-analysis/images/数据科学求职硬技能.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/images/数据科学求职硬技能.png -------------------------------------------------------------------------------- /NLP-analysis/images/数据科学求职软技能.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/NLP-analysis/images/数据科学求职软技能.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ```python 4 | import warnings 5 | warnings.filterwarnings("ignore") 6 | ``` 7 | 8 | # Read Data 9 | 10 | 11 | ```python 12 | import pandas as pd 13 | from pandas import Series 14 | ``` 15 | 16 | 17 | ```python 18 | data_analysis = pd.read_csv('./data_to_be_analysed/data_analysis_with_skills.csv') 19 | data_mining = pd.read_csv('./data_to_be_analysed/data_mining_with_skills.csv') 20 | machine_learning = pd.read_csv('./data_to_be_analysed/machine_learning_with_skills.csv') 21 | business_analysis = pd.read_csv('./data_to_be_analysed/business_analysis_with_skills.csv') 22 | ``` 23 | 24 | --- 25 | 26 | ## 添加薪资均值 27 | 28 | 29 | ```python 30 | import re 31 | # 均值函数 32 | def average(job_salary): 33 | # 取薪资均值---------------- 34 | pattern = re.compile('\d+') 35 | salary = job_salary 36 | try: 37 | res = re.findall(pattern, salary) 38 | avg_salary = 0 39 | sum = 0 40 | for 
i in res: 41 | a = int(i) 42 | sum = sum + a 43 | avg_salary = sum / 2 44 | except Exception: 45 | avg_salary = 0 46 | # 函数返回值 47 | return avg_salary 48 | 49 | salary_list = [] 50 | for i in range(0,data_analysis.shape[0]): 51 | avg_sal = average(data_analysis['职位薪资'][i]) 52 | salary_list.append(avg_sal) 53 | sal = Series(salary_list) 54 | 55 | data_analysis.insert(9,'salary',sal) 56 | ``` 57 | 58 | 59 | ```python 60 | salary_list = [] 61 | for i in range(0,data_mining.shape[0]): 62 | avg_sal = average(data_mining['职位薪资'][i]) 63 | salary_list.append(avg_sal) 64 | sal = Series(salary_list) 65 | 66 | data_mining.insert(9,'salary',sal) 67 | ``` 68 | 69 | 70 | ```python 71 | salary_list = [] 72 | for i in range(0,machine_learning.shape[0]): 73 | avg_sal = average(machine_learning['职位薪资'][i]) 74 | salary_list.append(avg_sal) 75 | sal = Series(salary_list) 76 | 77 | machine_learning.insert(9,'salary',sal) 78 | ``` 79 | 80 | 81 | ```python 82 | salary_list = [] 83 | for i in range(0,business_analysis.shape[0]): 84 | avg_sal = average(business_analysis['职位薪资'][i]) 85 | salary_list.append(avg_sal) 86 | sal = Series(salary_list) 87 | 88 | business_analysis.insert(9,'salary',sal) 89 | ``` 90 | 91 | --- 92 | 93 | # 薪资分布探索 94 | 95 | 96 | ```python 97 | data_analysis.salary.describe() 98 | ``` 99 | 100 | 101 | 102 | 103 | count 575.000000 104 | mean 17.446957 105 | std 8.261053 106 | min 2.500000 107 | 25% 11.500000 108 | 50% 15.000000 109 | 75% 22.500000 110 | max 47.500000 111 | Name: salary, dtype: float64 112 | 113 | 114 | 115 | 116 | ```python 117 | %matplotlib inline 118 | import matplotlib.pyplot as plt 119 | data_analysis.salary.hist(bins=50, figsize=(8,5)) 120 | 121 | plt.show() 122 | ``` 123 | 124 | 125 | ![png](output_13_0.png) 126 | 127 | 128 | - 薪资主要分布在**5k-30k**之间 129 | 130 | --- 131 | 132 | 133 | ```python 134 | data_analysis = data_analysis.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1) 135 | ``` 136 | 137 | 138 | ```python 139 | data_mining = 
data_mining.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1) 140 | ``` 141 | 142 | 143 | ```python 144 | machine_learning = machine_learning.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1) 145 | ``` 146 | 147 | 148 | ```python 149 | business_analysis = business_analysis.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1) 150 | ``` 151 | 152 | --- 153 | 154 | ## 掌握的软件技能对薪资的影响关系 155 | 156 | 157 | ```python 158 | corr_matrix = data_analysis.corr() 159 | corr_matrix["salary"].sort_values(ascending=False) 160 | ``` 161 | 162 | 163 | 164 | 165 | salary 1.000000 166 | Hive 0.292764 167 | Hadoop 0.254448 168 | Python 0.242217 169 | Sql 0.235256 170 | Spark 0.216505 171 | Sas 0.119493 172 | Tableau 0.044519 173 | Spss 0.024708 174 | Ppt -0.042691 175 | Excel -0.140370 176 | Name: salary, dtype: float64 177 | 178 | 179 | 180 | - **Data Analysis**的职位中,`Hive`,`Spark`,`Hadoop`大数据应用方面的软件是**薪资的加分项**。 181 | - 同时,`Python`,`SQL`,`SAS`,`Tableau`,`SPSS`等统计分析软件与可视化软件也是数据分析师**区别于低薪分析专员**的因素。 182 | - `PPT`,`Excel`作为必须的软件技能,对薪资变化**并没有太大的影响**,甚至仅仅会Excel的职位沦落为专员,会是一个减分项。 183 | - 结论:在数据分析领域,拥有**大数据软件技能**并且懂得**Python**这一编程语言的分析师的待遇较好。 184 | 185 | 186 | ```python 187 | corr_matrix = data_mining.corr() 188 | corr_matrix["salary"].sort_values(ascending=False) 189 | ``` 190 | 191 | 192 | 193 | 194 | salary 1.000000 195 | Hive 0.195964 196 | Spark 0.180962 197 | Java 0.180336 198 | Hadoop 0.136721 199 | C 0.127518 200 | Python 0.067957 201 | Shell 0.020722 202 | Linux -0.009015 203 | Sql -0.052715 204 | Sas -0.089915 205 | Name: salary, dtype: float64 206 | 207 | 208 | 209 | - **Data Mining**的职位中,`Hive`,`Spark`,`Hadoop`大数据方面的软件是薪资**极大的加分项**。 210 | - `Java`,`C`,`Python`等编程语言对数据挖掘的工作有很大帮助因此也体现在了对薪资的**正面影响**上。 211 | - 分析结论:具备**数据挖掘算法与编码能力**且具备**大数据方面分析技能**的数据挖掘工程师的待遇较好。 212 | 213 | 214 | ```python 215 | corr_matrix = machine_learning.corr() 216 | corr_matrix["salary"].sort_values(ascending=False) 217 | ``` 218 | 219 | 220 | 221 | 222 | salary 1.000000 223 | Hive 0.095518 224 | Spark 0.093537 
225 | Java 0.064341 226 | Tensorflow 0.059785 227 | Hadoop 0.057670 228 | Sql -0.035192 229 | Linux -0.036466 230 | C -0.052703 231 | Matlab -0.058808 232 | Python -0.104268 233 | Name: salary, dtype: float64 234 | 235 | 236 | 237 | - **Machine Learning**的职位中,没有特别突出的技能加分项,列表中的软件技能基本都是入职必备的技能。 238 | - `Hive`,`Spark`,`Hadoop`等大数据方面的技能会对薪资有一定程度的提升,不过影响较小。 239 | - 分析结论:机器学习工程师入门难度稍高,需要掌握具备的软件技能也较多,没有特别突出的薪资加分项。 240 | 241 | 242 | ```python 243 | corr_matrix = business_analysis.corr() 244 | corr_matrix["salary"].sort_values(ascending=False) 245 | ``` 246 | 247 | 248 | 249 | 250 | salary 1.000000 251 | C 0.399615 252 | Python 0.377288 253 | Linux 0.255181 254 | Java 0.237707 255 | Hive 0.153111 256 | Sql 0.115494 257 | Sas 0.085293 258 | Excel 0.059614 259 | Ppt -0.057346 260 | Spss -0.068219 261 | Name: salary, dtype: float64 262 | 263 | 264 | 265 | - **Business Analysis**的职位中,编程语言是**极大的薪资加分项**。如`C`,`Python`,`Java`。 266 | - `Excel`,`PPT`,`SPSS`等软件是这个职位的**必备技能**,因此对职位薪资没有太大的影响。 267 | - 结论:在商业分析领域,拥有**商业分析思维**并且具有**编程能力**的分析师的待遇较好。 268 | 269 | --- 270 | 271 | --- 272 | 273 | # 准备数据 274 | 275 | 276 | ```python 277 | from sklearn.model_selection import train_test_split 278 | 279 | train_set, test_set = train_test_split(data_analysis, test_size=0.2, random_state=52) 280 | ``` 281 | 282 | 283 | ```python 284 | data_train = train_set.copy() 285 | data_test = test_set.copy() 286 | ``` 287 | 288 | ```python 289 | from sklearn.pipeline import Pipeline 290 | from sklearn.preprocessing import StandardScaler 291 | from sklearn.preprocessing import Imputer 292 | from sklearn.compose import ColumnTransformer 293 | from sklearn.preprocessing import OneHotEncoder 294 | ``` 295 | 296 | 297 | ```python 298 | data_analysis_num = data_analysis.drop(['公司名称','公司规模','地区','学历要求','工作经验','职位名称','融资情况','salary'], axis=1) 299 | num_attribs = list(data_analysis_num) 300 | cat_attribs = ['公司规模','学历要求','工作经验','融资情况'] 301 | 302 | num_pipeline = Pipeline([ 303 | ('std_scaler', StandardScaler()), 304 | ]) 
305 | 306 | full_pipeline = ColumnTransformer([ 307 | ("num", num_pipeline, num_attribs), 308 | ("cat", OneHotEncoder(), cat_attribs), 309 | ]) 310 | 311 | data_analysis_prepared = full_pipeline.fit_transform(data_train) 312 | data_analysis_test = full_pipeline.transform(data_test) 313 | ``` 314 | 315 | 316 | ```python 317 | data_analysis_prepared[:1] 318 | ``` 319 | 320 | 321 | 322 | 323 | array([[-1.04902651, -0.99566158, -0.6853091 , -0.58404441, -0.5 , 324 | -0.4307749 , -0.38729833, 2.68594224, -0.37608404, -0.27343437, 325 | 0. , 0. , 1. , 0. , 0. , 326 | 0. , 0. , 0. , 1. , 0. , 327 | 0. , 0. , 0. , 0. , 1. , 328 | 0. , 0. , 0. , 0. , 0. , 329 | 0. , 0. , 0. , 0. , 0. , 330 | 0. , 0. , 0. , 0. , 1. , 331 | 0. ]]) 332 | 333 | 334 | 335 | 336 | ```python 337 | data_analysis_labels = data_train.salary.values 338 | test_labels = data_test.salary.values 339 | ``` 340 | 341 | --- 342 | 343 | # 训练模型 344 | 345 | ## Linear 346 | 347 | 348 | ```python 349 | from sklearn.linear_model import LinearRegression 350 | 351 | lin_reg = LinearRegression() 352 | lin_reg.fit(data_analysis_prepared, data_analysis_labels) 353 | ``` 354 | 355 | 356 | 357 | 358 | LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, 359 | normalize=False) 360 | 361 | 362 | 363 | 364 | ```python 365 | from sklearn.metrics import mean_squared_error 366 | import numpy as np 367 | 368 | salary_predictions = lin_reg.predict(data_analysis_prepared) 369 | lin_mse = mean_squared_error(data_analysis_labels, salary_predictions) 370 | lin_rmse = np.sqrt(lin_mse) 371 | lin_rmse 372 | ``` 373 | 374 | 375 | 376 | 377 | 5.402995127278521 378 | 379 | 380 | 381 | 382 | ```python 383 | #salary_predictions[:10] 384 | ``` 385 | 386 | ### 测试集 387 | 388 | 389 | ```python 390 | data_test.head(10) 391 | ``` 392 | 393 | 394 | 395 | 396 |
397 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 |
公司名称公司规模地区学历要求工作经验职位名称salary融资情况SqlPythonExcelSasSpssHiveHadoopTableauPptSpark
198上海克顿文化传媒100-499人上海 黄浦区 新天地本科3-5年数据分析师11.5已上市0000100010
316今日头条10000人以上上海 徐汇区 上海南站本科1-3年广告数据分析师11.5D轮及以上1011100000
52壹米滴答10000人以上上海 青浦区 徐泾本科1-3年资深BI数据分析师16.0C轮1000001001
313拼多多1000-9999人上海 长宁区 天山路本科经验不限数据分析师22.5已上市1100010000
19太平洋房屋10000人以上上海 徐汇区 枫林路本科1-3年数据分析师12.0已上市1110000100
482印鹰100-499人上海 静安区 汶水路本科3-5年数据分析经理20.0B轮1000001000
212复硕正态20-99人上海 静安区 大宁路本科1-3年高级数据分析员10.0不需要融资0000100010
127云序生物20-99人上海 松江区 新桥大专1-3年生信分析师/数据分析8.0不需要融资0100000000
401上海创蓝文化传播500-999人上海 松江区 松江工业区本科1-3年数据分析师20.0A轮1110000110
260上海智帛20-99人上海 闵行区 漕宝路大专1-3年数据分析10.0未融资1010000000
647 |
648 | 649 | 650 | 651 | 652 | ```python 653 | y_test = lin_reg.predict(data_analysis_test) 654 | y_test[:10] 655 | ``` 656 | 657 | 658 | 659 | 660 | array([17.01653719, 16.41342787, 21.16768836, 19.22802331, 13.22095168, 661 | 22.22075171, 11.02715534, 7.95300838, 13.12913168, 4.30171607]) 662 | 663 | 664 | 665 | 666 | ```python 667 | test_labels[:10] 668 | ``` 669 | 670 | 671 | 672 | 673 | array([11.5, 11.5, 16. , 22.5, 12. , 20. , 10. , 8. , 20. , 10. ]) 674 | 675 | 676 | 677 | 678 | ```python 679 | lin_mse = mean_squared_error(test_labels, y_test) 680 | lin_rmse = np.sqrt(lin_mse) 681 | lin_rmse 682 | ``` 683 | 684 | 685 | 686 | 687 | 6.7698143548675915 688 | 689 | 690 | 691 | - 测试集上误差约为**6.77** 692 | 693 | ### 交叉验证 694 | 695 | 696 | ```python 697 | from sklearn.model_selection import cross_val_score 698 | 699 | scores = cross_val_score(lin_reg, data_analysis_prepared, data_analysis_labels, 700 | scoring="neg_mean_squared_error", cv=10) 701 | lin_rmse_scores = np.sqrt(-scores) 702 | ``` 703 | 704 | 705 | ```python 706 | display_scores(lin_rmse_scores) 707 | ``` 708 | 709 | Scores: [5.81120933 5.92292919 6.50970607 6.24610706 5.18158564 6.27624993 710 | 7.37315509 6.07787995 5.67585695 4.86847943] 711 | Mean: 5.994315863710689 712 | Standard deviation: 0.6621760708217165 713 | 714 | 715 | --- 716 | 717 | ## D-Tree 718 | 719 | ### 建模训练 720 | 721 | 722 | ```python 723 | from sklearn.tree import DecisionTreeRegressor 724 | 725 | tree_reg = DecisionTreeRegressor(random_state=52) 726 | tree_reg.fit(data_analysis_prepared, data_analysis_labels) 727 | ``` 728 | 729 | 730 | 731 | 732 | DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None, 733 | max_leaf_nodes=None, min_impurity_decrease=0.0, 734 | min_impurity_split=None, min_samples_leaf=1, 735 | min_samples_split=2, min_weight_fraction_leaf=0.0, 736 | presort=False, random_state=52, splitter='best') 737 | 738 | 739 | 740 | 741 | ```python 742 | y_pred_tree = tree_reg.predict(data_analysis_prepared) 
743 | ``` 744 | 745 | 746 | ```python 747 | from sklearn.metrics import mean_squared_error 748 | 749 | tree_mse = mean_squared_error(data_analysis_labels, y_pred_tree) 750 | tree_rmse = np.sqrt(tree_mse) 751 | tree_rmse 752 | ``` 753 | 754 | 755 | 756 | 757 | 1.4079709678742711 758 | 759 | 760 | 761 | ### 测试集 762 | 763 | 764 | ```python 765 | y_test = tree_reg.predict(data_analysis_test) 766 | y_test[:10] 767 | ``` 768 | 769 | 770 | 771 | 772 | array([15. , 12.5, 10. , 27. , 30. , 14.5, 8. , 7. , 12.5, 7. ]) 773 | 774 | 775 | 776 | 777 | ```python 778 | test_labels[:10] 779 | ``` 780 | 781 | 782 | 783 | 784 | array([11.5, 11.5, 16. , 22.5, 12. , 20. , 10. , 8. , 20. , 10. ]) 785 | 786 | 787 | 788 | 789 | ```python 790 | tree_mse = mean_squared_error(test_labels, y_test) 791 | tree_rmse = np.sqrt(tree_mse) 792 | tree_rmse 793 | ``` 794 | 795 | 796 | 797 | 798 | 8.252411446722123 799 | 800 | 801 | 802 | - 测试集上误差约为**8.25** 803 | 804 | --- 805 | 806 | ## Random Forest 807 | 808 | ### 建模训练 809 | 810 | 811 | ```python 812 | from sklearn.ensemble import RandomForestRegressor 813 | 814 | forest_reg = RandomForestRegressor(random_state=52) 815 | forest_reg.fit(data_analysis_prepared, data_analysis_labels) 816 | ``` 817 | 818 | 819 | 820 | 821 | RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, 822 | max_features='auto', max_leaf_nodes=None, 823 | min_impurity_decrease=0.0, min_impurity_split=None, 824 | min_samples_leaf=1, min_samples_split=2, 825 | min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None, 826 | oob_score=False, random_state=52, verbose=0, warm_start=False) 827 | 828 | 829 | 830 | 831 | ```python 832 | y_pred_rf = forest_reg.predict(data_analysis_prepared) 833 | forest_mse = mean_squared_error(data_analysis_labels, y_pred_rf) 834 | forest_rmse = np.sqrt(forest_mse) 835 | forest_rmse 836 | ``` 837 | 838 | 839 | 840 | 841 | 2.714777885552381 842 | 843 | 844 | 845 | ### 测试集 846 | 847 | 848 | ```python 849 | data_test[:10] 850 | ``` 851 | 
852 | 853 | 854 | 855 |
856 | 869 | 870 | 871 | 872 | 873 | 874 | 875 | 876 | 877 | 878 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 | 888 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | 922 | 923 | 924 | 925 | 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | 950 | 951 | 952 | 953 | 954 | 955 | 956 | 957 | 958 | 959 | 960 | 961 | 962 | 963 | 964 | 965 | 966 | 967 | 968 | 969 | 970 | 971 | 972 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 980 | 981 | 982 | 983 | 984 | 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | 1014 | 1015 | 1016 | 1017 | 1018 | 1019 | 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | 1026 | 1027 | 1028 | 1029 | 1030 | 1031 | 1032 | 1033 | 1034 | 1035 | 1036 | 1037 | 1038 | 1039 | 1040 | 1041 | 1042 | 1043 | 1044 | 1045 | 1046 | 1047 | 1048 | 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | 1055 | 1056 | 1057 | 1058 | 1059 | 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | 1066 | 1067 | 1068 | 1069 | 1070 | 1071 | 1072 | 1073 | 1074 | 1075 | 1076 | 1077 | 1078 | 1079 | 1080 | 1081 | 1082 | 1083 | 1084 | 1085 | 1086 | 1087 | 1088 | 1089 | 1090 | 1091 | 1092 | 1093 | 1094 | 1095 | 1096 | 1097 | 1098 | 1099 | 1100 | 1101 | 1102 | 1103 | 1104 | 1105 |
公司名称公司规模地区学历要求工作经验职位名称salary融资情况SqlPythonExcelSasSpssHiveHadoopTableauPptSpark
198上海克顿文化传媒100-499人上海 黄浦区 新天地本科3-5年数据分析师11.5已上市0000100010
316今日头条10000人以上上海 徐汇区 上海南站本科1-3年广告数据分析师11.5D轮及以上1011100000
52壹米滴答10000人以上上海 青浦区 徐泾本科1-3年资深BI数据分析师16.0C轮1000001001
313拼多多1000-9999人上海 长宁区 天山路本科经验不限数据分析师22.5已上市1100010000
19太平洋房屋10000人以上上海 徐汇区 枫林路本科1-3年数据分析师12.0已上市1110000100
482印鹰100-499人上海 静安区 汶水路本科3-5年数据分析经理20.0B轮1000001000
212复硕正态20-99人上海 静安区 大宁路本科1-3年高级数据分析员10.0不需要融资0000100010
127云序生物20-99人上海 松江区 新桥大专1-3年生信分析师/数据分析8.0不需要融资0100000000
401上海创蓝文化传播500-999人上海 松江区 松江工业区本科1-3年数据分析师20.0A轮1110000110
260上海智帛20-99人上海 闵行区 漕宝路大专1-3年数据分析10.0未融资1010000000
1106 |
1107 | 1108 | 1109 | 1110 | 1111 | ```python 1112 | y_test = forest_reg.predict(data_analysis_test) 1113 | y_test[:10] 1114 | ``` 1115 | 1116 | 1117 | 1118 | 1119 | array([20.08333333, 10.35 , 18.025 , 26.25 , 19.58333333, 1120 | 20.35 , 9.11666667, 10.825 , 12.55428571, 9.1 ]) 1121 | 1122 | 1123 | 1124 | 1125 | ```python 1126 | test_labels[:10] 1127 | ``` 1128 | 1129 | 1130 | 1131 | 1132 | array([11.5, 11.5, 16. , 22.5, 12. , 20. , 10. , 8. , 20. , 10. ]) 1133 | 1134 | 1135 | 1136 | 1137 | ```python 1138 | forest_mse = mean_squared_error(test_labels, y_test) 1139 | forest_rmse = np.sqrt(forest_mse) 1140 | forest_rmse 1141 | ``` 1142 | 1143 | 1144 | 1145 | 1146 | 7.087180783205348 1147 | 1148 | 1149 | 1150 | - 测试集上误差约为**7.09** 1151 | 1152 | ### 交叉验证 1153 | 1154 | 1155 | ```python 1156 | from sklearn.model_selection import cross_val_score 1157 | 1158 | scores = cross_val_score(forest_reg, data_analysis_prepared, data_analysis_labels, 1159 | scoring="neg_mean_squared_error", cv=10) 1160 | forest_rmse_scores = np.sqrt(-scores) 1161 | ``` 1162 | 1163 | 1164 | ```python 1165 | def display_scores(scores): 1166 | print("Scores:", scores) 1167 | print("Mean:", scores.mean()) 1168 | print("Standard deviation:", scores.std()) 1169 | 1170 | display_scores(forest_rmse_scores) 1171 | ``` 1172 | 1173 | Scores: [5.92160593 6.47537707 8.01906699 5.64303733 6.39749406 7.22392532 1174 | 6.18275771 6.05244757 6.53447138 4.86251585] 1175 | Mean: 6.331269920627992 1176 | Standard deviation: 0.8130474122018511 1177 | 1178 | 1179 | --- 1180 | 1181 | # 网格搜索调参 1182 | 1183 | 1184 | ```python 1185 | from sklearn.model_selection import GridSearchCV 1186 | 1187 | param_grid = [ 1188 | # try 12 (3×4) combinations of hyperparameters 1189 | {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, 1190 | # then try 6 (2×3) combinations with bootstrap set as False 1191 | {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}, 1192 | ] 1193 | 1194 | forest_reg = 
RandomForestRegressor(random_state=52) 1195 | # train across 5 folds, that's a total of (12+6)*5=90 rounds of training 1196 | grid_search = GridSearchCV(forest_reg, param_grid, cv=5, 1197 | scoring='neg_mean_squared_error', return_train_score=True) 1198 | grid_search.fit(data_analysis_prepared, data_analysis_labels) 1199 | ``` 1200 | 1201 | 1202 | 1203 | 1204 | GridSearchCV(cv=5, error_score='raise-deprecating', 1205 | estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, 1206 | max_features='auto', max_leaf_nodes=None, 1207 | min_impurity_decrease=0.0, min_impurity_split=None, 1208 | min_samples_leaf=1, min_samples_split=2, 1209 | min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None, 1210 | oob_score=False, random_state=52, verbose=0, warm_start=False), 1211 | fit_params=None, iid='warn', n_jobs=None, 1212 | param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}], 1213 | pre_dispatch='2*n_jobs', refit=True, return_train_score=True, 1214 | scoring='neg_mean_squared_error', verbose=0) 1215 | 1216 | 1217 | 1218 | 1219 | ```python 1220 | cvres = grid_search.cv_results_ 1221 | for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]): 1222 | print(np.sqrt(-mean_score), params) 1223 | ``` 1224 | 1225 | 7.29061793698431 {'max_features': 2, 'n_estimators': 3} 1226 | 6.413781857864982 {'max_features': 2, 'n_estimators': 10} 1227 | 6.092544533188321 {'max_features': 2, 'n_estimators': 30} 1228 | 7.107886960097701 {'max_features': 4, 'n_estimators': 3} 1229 | 6.315256370330592 {'max_features': 4, 'n_estimators': 10} 1230 | 5.976022358347516 {'max_features': 4, 'n_estimators': 30} 1231 | 7.001163746738424 {'max_features': 6, 'n_estimators': 3} 1232 | 6.1921055299084635 {'max_features': 6, 'n_estimators': 10} 1233 | 5.996739844433075 {'max_features': 6, 'n_estimators': 30} 1234 | 7.088902371406774 {'max_features': 8, 'n_estimators': 
3} 1235 | 6.292396346910386 {'max_features': 8, 'n_estimators': 10} 1236 | 5.980558606686522 {'max_features': 8, 'n_estimators': 30} 1237 | 7.0825937380292405 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3} 1238 | 6.412140716903331 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10} 1239 | 7.062864506385558 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3} 1240 | 6.337407579436449 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10} 1241 | 6.766095704089036 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3} 1242 | 6.251001206038802 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10} 1243 | 1244 | 1245 | 1246 | ```python 1247 | grid_search.best_params_ 1248 | ``` 1249 | 1250 | 1251 | 1252 | 1253 | {'max_features': 4, 'n_estimators': 30} 1254 | 1255 | 1256 | 1257 | ## 变量重要性 1258 | 1259 | 1260 | ```python 1261 | feature_importances = grid_search.best_estimator_.feature_importances_ 1262 | #feature_importances 1263 | ``` 1264 | 1265 | 1266 | ```python 1267 | num_attribs = list(data_analysis_num) 1268 | cat_attribs = ['公司规模','学历要求','工作经验','融资情况'] 1269 | ``` 1270 | 1271 | 1272 | ```python 1273 | # 变量重要性排序 1274 | attributes = num_attribs + cat_attribs 1275 | sorted(zip(feature_importances, attributes), reverse=True) 1276 | ``` 1277 | 1278 | 1279 | 1280 | 1281 | [(0.05883012920907406, 'Sql'), 1282 | (0.05739271707726927, 'Hadoop'), 1283 | (0.052855819385887605, 'Python'), 1284 | (0.05042672750583558, '工作经验'), 1285 | (0.04841490914313372, 'Hive'), 1286 | (0.03169130633549138, 'Excel'), 1287 | (0.0301026272691416, 'Spark'), 1288 | (0.027897066519544437, 'Sas'), 1289 | (0.02382836465248971, 'Spss'), 1290 | (0.023060368955297835, '学历要求'), 1291 | (0.022374373956317948, 'Tableau'), 1292 | (0.01650026584689836, '融资情况'), 1293 | (0.013561201776627235, 'Ppt'), 1294 | (0.004150442668926646, '公司规模')] 1295 | 1296 | 1297 | 1298 | - **公司规模**对薪资的影响很小。 1299 | 1300 | --- 1301 | 1302 | # 最终模型 1303 | 1304 | 1305 | ```python 1306 | final_model 
= grid_search.best_estimator_ 1307 | 1308 | final_predictions = final_model.predict(data_analysis_test) 1309 | 1310 | final_mse = mean_squared_error(test_labels, final_predictions) 1311 | final_rmse = np.sqrt(final_mse) 1312 | ``` 1313 | 1314 | 1315 | ```python 1316 | final_rmse 1317 | ``` 1318 | 1319 | 1320 | 1321 | 1322 | 6.867133419234717 1323 | 1324 | 1325 | 1326 | - 误差为6.87 1327 | 1328 | --- 1329 | 1330 | # 薪资预测 1331 | 1332 | 1333 | ```python 1334 | salary_test_series = Series(final_predictions,index=data_test.index) 1335 | ``` 1336 | 1337 | 1338 | ```python 1339 | data_test_prediction = data_test.copy() 1340 | data_test_prediction.insert(7,'prediction',salary_test_series) 1341 | ``` 1342 | 1343 | 1344 | ```python 1345 | data_test_prediction.sample(10) 1346 | ``` 1347 | 1348 | 1349 | 1350 | 1351 |
1352 | 1365 | 1366 | 1367 | 1368 | 1369 | 1370 | 1371 | 1372 | 1373 | 1374 | 1375 | 1376 | 1377 | 1378 | 1379 | 1380 | 1381 | 1382 | 1383 | 1384 | 1385 | 1386 | 1387 | 1388 | 1389 | 1390 | 1391 | 1392 | 1393 | 1394 | 1395 | 1396 | 1397 | 1398 | 1399 | 1400 | 1401 | 1402 | 1403 | 1404 | 1405 | 1406 | 1407 | 1408 | 1409 | 1410 | 1411 | 1412 | 1413 | 1414 | 1415 | 1416 | 1417 | 1418 | 1419 | 1420 | 1421 | 1422 | 1423 | 1424 | 1425 | 1426 | 1427 | 1428 | 1429 | 1430 | 1431 | 1432 | 1433 | 1434 | 1435 | 1436 | 1437 | 1438 | 1439 | 1440 | 1441 | 1442 | 1443 | 1444 | 1445 | 1446 | 1447 | 1448 | 1449 | 1450 | 1451 | 1452 | 1453 | 1454 | 1455 | 1456 | 1457 | 1458 | 1459 | 1460 | 1461 | 1462 | 1463 | 1464 | 1465 | 1466 | 1467 | 1468 | 1469 | 1470 | 1471 | 1472 | 1473 | 1474 | 1475 | 1476 | 1477 | 1478 | 1479 | 1480 | 1481 | 1482 | 1483 | 1484 | 1485 | 1486 | 1487 | 1488 | 1489 | 1490 | 1491 | 1492 | 1493 | 1494 | 1495 | 1496 | 1497 | 1498 | 1499 | 1500 | 1501 | 1502 | 1503 | 1504 | 1505 | 1506 | 1507 | 1508 | 1509 | 1510 | 1511 | 1512 | 1513 | 1514 | 1515 | 1516 | 1517 | 1518 | 1519 | 1520 | 1521 | 1522 | 1523 | 1524 | 1525 | 1526 | 1527 | 1528 | 1529 | 1530 | 1531 | 1532 | 1533 | 1534 | 1535 | 1536 | 1537 | 1538 | 1539 | 1540 | 1541 | 1542 | 1543 | 1544 | 1545 | 1546 | 1547 | 1548 | 1549 | 1550 | 1551 | 1552 | 1553 | 1554 | 1555 | 1556 | 1557 | 1558 | 1559 | 1560 | 1561 | 1562 | 1563 | 1564 | 1565 | 1566 | 1567 | 1568 | 1569 | 1570 | 1571 | 1572 | 1573 | 1574 | 1575 | 1576 | 1577 | 1578 | 1579 | 1580 | 1581 | 1582 | 1583 | 1584 | 1585 | 1586 | 1587 | 1588 | 1589 | 1590 | 1591 | 1592 | 1593 | 1594 | 1595 | 1596 | 1597 | 1598 | 1599 | 1600 | 1601 | 1602 | 1603 | 1604 | 1605 | 1606 | 1607 | 1608 | 1609 | 1610 | 1611 | 1612 |
公司名称公司规模地区学历要求工作经验职位名称salaryprediction融资情况SqlPythonExcelSasSpssHiveHadoopTableauPptSpark
8辰德网络科技100-499人上海本科1-3年数据分析7.08.916667未融资1000000000
224安心记加班100-499人上海 徐汇区 交大本科3-5年高级数据分析师22.517.355556B轮1110011001
490北京海金格医药科技100-499人上海 静安区 宜川路本科1年以内临床数据分析10.09.800000已上市0001000000
230任意门科技100-499人上海 浦东新区 花木本科1-3年数据分析师22.518.438889C轮1100010101
299天壤智能100-499人上海 徐汇区 龙华本科3-5年数据分析师32.518.216667A轮1110011100
329横援投资20-99人上海 松江区 九亭本科3-5年数据分析师11.512.033333未融资0000000000
351雅智捷20-99人上海 静安区 天目西路本科3-5年高级咨询顾问 - 数据分析15.018.46666720-99人1001000000
316今日头条10000人以上上海 徐汇区 上海南站本科1-3年广告数据分析师11.518.383333D轮及以上1011100000
535高顿教育1000-9999人上海 虹口区 虹口足球场本科3-5年大数据产品经理(数据分析师)17.519.737222C轮0000000000
520浅橙科技500-999人上海 杨浦区 新江湾城本科经验不限数据分析(风险政策)7.518.483333B轮1101000000
1613 |
1614 | 1615 | 1616 | 1617 | --- 1618 | 1619 | 1620 | ```python 1621 | 1622 | ``` 1623 | -------------------------------------------------------------------------------- /analysis&forecast/Data-Science-Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings(\"ignore\")" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Read Data" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 4, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "from pandas import Series" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 5, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "data_analysis = pd.read_csv('./data_to_be_analysed/data_analysis_with_skills.csv')\n", 37 | "data_mining = pd.read_csv('./data_to_be_analysed/data_mining_with_skills.csv')\n", 38 | "machine_learning = pd.read_csv('./data_to_be_analysed/machine_learning_with_skills.csv')\n", 39 | "business_analysis = pd.read_csv('./data_to_be_analysed/business_analysis_with_skills.csv')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "---" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "metadata": { 53 | "scrolled": true 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "(1053, 21)" 60 | ] 61 | }, 62 | "execution_count": 6, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "data_analysis.shape" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "---" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## 添加薪资均值" 83 | ] 84 | 
}, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 7, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "import re\n", 92 | "# 均值函数\n", 93 | "def average(job_salary):\n", 94 | " # 取薪资均值----------------\n", 95 | " pattern = re.compile('\\d+')\n", 96 | " salary = job_salary\n", 97 | " try:\n", 98 | " res = re.findall(pattern, salary)\n", 99 | " avg_salary = 0\n", 100 | " sum = 0\n", 101 | " for i in res:\n", 102 | " a = int(i)\n", 103 | " sum = sum + a\n", 104 | " avg_salary = sum / 2\n", 105 | " except Exception:\n", 106 | " avg_salary = 0\n", 107 | " # 函数返回值\n", 108 | " return avg_salary\n", 109 | "\n", 110 | "salary_list = []\n", 111 | "for i in range(0,data_analysis.shape[0]):\n", 112 | " avg_sal = average(data_analysis['职位薪资'][i])\n", 113 | " salary_list.append(avg_sal)\n", 114 | "sal = Series(salary_list)\n", 115 | "\n", 116 | "data_analysis.insert(9,'salary',sal)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "salary_list = []\n", 126 | "for i in range(0,data_mining.shape[0]):\n", 127 | " avg_sal = average(data_mining['职位薪资'][i])\n", 128 | " salary_list.append(avg_sal)\n", 129 | "sal = Series(salary_list)\n", 130 | "\n", 131 | "data_mining.insert(9,'salary',sal)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 9, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "salary_list = []\n", 141 | "for i in range(0,machine_learning.shape[0]):\n", 142 | " avg_sal = average(machine_learning['职位薪资'][i])\n", 143 | " salary_list.append(avg_sal)\n", 144 | "sal = Series(salary_list)\n", 145 | "\n", 146 | "machine_learning.insert(9,'salary',sal)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 10, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "salary_list = []\n", 156 | "for i in range(0,business_analysis.shape[0]):\n", 157 | " avg_sal = 
average(business_analysis['职位薪资'][i])\n", 158 | " salary_list.append(avg_sal)\n", 159 | "sal = Series(salary_list)\n", 160 | "\n", 161 | "business_analysis.insert(9,'salary',sal)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "---" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "# 薪资分布探索" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "count 1053.000000\n", 187 | "mean 16.632479\n", 188 | "std 7.925945\n", 189 | "min 1.500000\n", 190 | "25% 11.000000\n", 191 | "50% 15.000000\n", 192 | "75% 22.500000\n", 193 | "max 52.500000\n", 194 | "Name: salary, dtype: float64" 195 | ] 196 | }, 197 | "execution_count": 11, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "data_analysis.salary.describe()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 12, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "Index(['Unnamed: 0', 'Keyword', '公司名称', '公司规模', '地区', '学历要求', '工作经验', '职位名称',\n", 215 | " '职位描述', 'salary', '职位薪资', '融资情况', 'Sql', 'Python', 'Excel', 'Sas',\n", 216 | " 'Spss', 'Hive', 'Hadoop', 'Ppt', 'Tableau', 'Spark'],\n", 217 | " dtype='object')" 218 | ] 219 | }, 220 | "execution_count": 12, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "data_analysis.columns" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 13, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAeoAAAEyCAYAAAA1AJN4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAFEJJREFUeJzt3W2MXOV5h/HrLk4ah20xhrCiNu1SxUpD2YSGFaWlanchaghGMR9Cm4imJqWyqlJKG0dhST+gVkJy1JKXqi+SFVAcibAhhBSEaRvLZUsjFVI7oVnASXGJS2yo3SjYySYo0SZ3P8xxO4LdmfWcmd1nZq6fhHbOOc+cc8+NZ/77nJk9E5mJJEkq04+tdgGSJGlpBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYGtWuwCAs88+O8fGxlqO+e53v8vpp5++MgUNGHtXj/3rnL2rx/51rh96t3///m9m5uvajSsiqMfGxti3b1/LMbOzs0xOTq5MQQPG3tVj/zpn7+qxf53rh95FxH8tZ5ynviVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIK1DeqIuCsijkXEk03r/jwivhoRX4mIz0XEuqZtt0bEwYj4WkS8rVeFS5I0DJYzo/4EcOXL1u0BLszMNwH/AdwKEBEXAO8Cfr66z99ExGldq1aSpCHTNqgz81HgWy9b9/nMXKgWHwM2Vre3ADOZ+f3M/DpwELiki/VKkjRUuvEe9e8Af1/d3gB8o2nb4WqdJEnqQK0rk0XEnwALwN0nVy0yLJe47zZgG8Do6Cizs7MtjzU/P992jBZn7+qxf52zd/XYv84NUu86DuqI2ApcDVyRmSfD+DBwXtOwjcDzi90/M3cCOwEmJiay3aXe+uFycKWyd/XYv87Zu3rsX+cGqXcdBXVEXAncAvxaZn6vadODwKci4sPATwGbgC/WrlIDaWx6d8vth3ZsXqFKJKlcbYM6Iu4BJoGzI+IwcBuNT3n/OLAnIgAey8zfy8ynIuJe4Gkap8RvzMwf9qp4SZIGXdugzsx3L7L6zhbjbwdur1OUJElq8MpkkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFaxvUEXFXRByLiCeb1q2PiD0R8Uz188xqfUTEX0bEwYj4SkS8pZfFS5I06NYsY8wngL8CPtm0bhrYm5k7ImK6Wr4FeDuwqfrvF4G/rX6qD41N7265/dCOzStUiSQNr7Yz6sx8FPjWy1ZvAXZVt3cB1zSt/2Q2PAasi4hzu1WsJEnDJjKz/aCIMeChzLywWj6emeuatr+YmWdGxEPAjsz8QrV+L3BLZu5bZJ/bgG0Ao6OjF8/MzLSsYX5+npGRkeU+LjXptHdzR0603D6+4YxOS1qR/XeL//Y6Z+/qsX+d64feTU1N7c/MiXbjlnPq+1TEIusW/U0gM3cCOwEmJiZycnKy5Y5nZ2dpN0aL67R317c79X3dqe9zJfffLf7b65y9q8f+dW6Qetfpp76PnjylXf08Vq0/DJzXNG4j8Hzn5UmSNNw6DeoHga3V7a3AA03rf7v69PelwInMfKFmjZIkDa22p74j4h5gEjg7Ig4DtwE7gHsj4gbgOeDaavjDwFXAQeB7wHt7ULMkSUOjbVBn5ruX2HTFImMTuLFuUZIkqcErk0mSVDCDWpKkghnUkiQVrNt/Ry2piZdhlVSXM2pJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKph
BLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMoJYkqWBrVrsAqWRj07vZPr7A9dO7F91+aMfmFa5I0rBxRi1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqWK2gjog/joinIuLJiLgnIl4TEedHxOMR8UxEfDoiXt2tYiVJGjYdB3VEbAD+EJjIzAuB04B3AR8CPpKZm4AXgRu6UagkScOo7qnvNcDaiFgDvBZ4AbgcuK/avgu4puYxJEkaWh0HdWYeAf4CeI5GQJ8A9gPHM3OhGnYY2FC3SEmShlVkZmd3jDgT+Czwm8Bx4DPV8m2Z+fpqzHnAw5k5vsj9twHbAEZHRy+emZlpebz5+XlGRkY6qnXYddq7uSMnWm4f33BGpyWtyP67Ye7ICUbXwtGXFt/ersZ+eIy95PO2HvvXuX7o3dTU1P7MnGg3rs63Z70V+Hpm/g9ARNwP/DKwLiLWVLPqjcDzi905M3cCOwEmJiZycnKy5cFmZ2dpN0aL67R3S31j1EmHrjv1fa7k/rvh+urbs+6YW/yp0q7GfniMveTzth7717lB6l2d96ifAy6NiNdGRABXAE8DjwDvrMZsBR6oV6IkScOrznvUj9P40NiXgLlqXzuBW4D3RcRB4Czgzi7UKUnSUKpz6pvMvA247WWrnwUuqbNfSZLU4JXJJEkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwdasdgGS+tvY9O4lt20fX2By5UqRBlKtGXVErIuI+yLiqxFxICJ+KSLWR8SeiHim+nlmt4qVJGnY1D31/THgHzLz54A3AweAaWBvZm4C9lbLkiSpAx0HdUT8JPCrwJ0AmfmDzDwObAF2VcN2AdfULVKSpGEVmdnZHSMuAnYCT9OYTe8HbgaOZOa6pnEvZuYrTn9HxDZgG8Do6OjFMzMzLY83Pz/PyMhIR7WWaO7IiZbbxzec0bVjddq7Xte4kj3o1NyRE4yuhaMvLb69XY398BjravUYR9fCOev7/zGulkF73VtJ/dC7qamp/Zk50W5cnaCeAB4DLsvMxyPiY8C3gZuWE9TNJiYmct++fS2PNzs7y+TkZEe1lqjVB3AADu3Y3LVjddq7Xte4kj3o1Nj0braPL3DH3OKfu2xXYz88xrrafZjspuu2rGA1g2XQXvdWUj/0LiKWFdR13qM+DBzOzMer5fuAtwBHI+LcqohzgWM1jiFJ0lDrOKgz87+Bb0TEG6pVV9A4Df4gsLVatxV4oFaFkiQNsbp/R30TcHdEvBp4FngvjfC/NyJuAJ4Drq15DEmShlatoM7MJ4DFzq9fUWe/kiSpwUuISpJUMINakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkgtW91rcK1fzVg9vHF7i+zdctSpLK5IxakqSCGdSSJBXMoJYkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSClY7qCPitIj4ckQ8VC2fHxGPR8QzEfHpiHh1/TIlSRpO3ZhR3wwcaFr+EPCRzNwEvAjc0IVjSJI0lGoFdURsBDYDH6+WA7gcuK8asgu4ps4xJEkaZnVn1B8FPgD8qFo+CziemQvV8mF
gQ81jSJI0tCIzO7tjxNXAVZn5+xExCbwfeC/wr5n5+mrMecDDmTm+yP23AdsARkdHL56ZmWl5vPn5eUZGRjqqtURzR0603D6+4Yyu7X90LRx9qdbuFtXNGnux/26YO3KiZf/a1dgPj7GuVo9xdC2cs77/H+NqGbTXvZXUD72bmpran5kT7catqXGMy4B3RMRVwGuAn6Qxw14XEWuqWfVG4PnF7pyZO4GdABMTEzk5OdnyYLOzs7Qb00+un97dcvuh6ya7tv/t4wvcMVfnf/XiulljL/bfDddP727Zv3Y19sNjrKvVY9w+vsBvDNDzdqUN2uveShqk3nX86p2ZtwK3ApycUWfmdRHxGeCdwAywFXigC3UOnbF2L/A7Nq9QJZKk1dSLv6O+BXhfRByk8Z71nT04hiRJQ6Er50MzcxaYrW4/C1zSjf1qae1m3MPAsw6ShoFXJpMkqWDd/4SRpBXjWQVp8DmjliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMINakqSCGdSSJBXMC570iJf4lCR1gzNqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYQS1JUsEMakmSCmZQS5JUMC8hqp7xMqqSVJ8zakmSCuaMWgOr3Yz+0I7NK1SJJHXOGbUkSQVzRq2O+R60JPWeM2pJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQUzqCVJKphBLUlSwQxqSZIK1nFQR8R5EfFIRByIiKci4uZq/fqI2BMRz1Q/z+xeuZIkDZc6M+oFYHtmvhG4FLgxIi4ApoG9mbkJ2FstS5KkDnQc1Jn5QmZ+qbr9HeAAsAHYAuyqhu0CrqlbpCRJwyoys/5OIsaAR4ELgecyc13Tthcz8xWnvyNiG7ANYHR09OKZmZmWx5ifn2dkZKR2rStl7siJ1S7h/4yuhaMvrXYVp258wxktt9ftcbv9nzxGq/7VrXE5Nazm/uvWMLoWzlnf+xoGVb+97pWkH3o3NTW1PzMn2o2rHdQRMQL8M3B7Zt4fEceXE9TNJiYmct++fS2PMzs7y+TkZK1aV1JJ3yy1fXyBO+b674vS2n1fdN0eL+f7qMemd7fsX90a634ndgnfud2qhu3jC9x03Zae1zCo+u11ryT90LuIWFZQ13r1johXAZ8F7s7M+6vVRyPi3Mx8ISLOBY7VOYak3lnOLzsrEfaSllbnU98B3AkcyMwPN216ENha3d4KPNB5eZIkDbc6M+rLgPcAcxHxRLXug8AO4N6IuAF4Dri2XomSJA2vjoM6M78AxBKbr+h0v9JKKelzBJK0FK9MJklSwfrvo8AaGs54JckZtSRJRTOoJUkqmEEtSVLBfI96CSVc8UmSJGfUkiQVzBl1h/xEsqSTPAOnXnJGLUlSwQxqSZIKZlBLklQwg1qSpIIZ1JIkFcygliSpYAa1JEkFM6glSSqYFzyRVDQvJqJh54xakqSCOaOWCualaiU5o5YkqWAGtSRJBTOoJUkqmEEtSVLBDGpJkgpmUEuSVDD/PEuqwT+fUjcs9e9o+/gC10/v9qIuQ84ZtSRJBXNGLWmgeQlS9Ttn1JIkFWwgZ9TLed/Q36JVgn54j7sfapQGmTNqSZIKNpAz6uVwliBJy+P7/KvLGbUkSQUb2hm1NAw8cyT1v57NqCPiyoj4WkQcjIjpXh1HkqRB1pOgjojTgL8G3g5cALw7Ii7oxbEkSRpkvTr1fQlwMDOfBYiIGWAL8HSPjiepUJ5+13Ks9gfWVvv4rfTq1PcG4BtNy4erdZIk6RREZnZ/pxHXAm/LzN+tlt8DXJKZNzWN2QZsqxbfAHytzW7PBr7Z9WKHg72rx/51zt7VY/861w+9+5nMfF27Qb069X0YOK9peSPwfPOAzNwJ7FzuDiNiX2ZOdKe84WLv6rF/nbN39di/zg1S73p16vvfgE0RcX5EvBp4F/Bgj44lSdLA6smMOjMXIuIPgH8ETgPuysynenE
sSZIGWc8ueJKZDwMPd3GXyz5Nrlewd/XYv87Zu3rsX+cGpnc9+TCZJEnqDq/1LUlSwQxqSZIKVnxQe83wUxMRd0XEsYh4smnd+ojYExHPVD/PXM0aSxUR50XEIxFxICKeioibq/X2bxki4jUR8cWI+Peqf39arT8/Ih6v+vfp6i9BtIiIOC0ivhwRD1XL9m6ZIuJQRMxFxBMRsa9aNxDP3aKD2muGd+QTwJUvWzcN7M3MTcDealmvtABsz8w3ApcCN1b/3uzf8nwfuDwz3wxcBFwZEZcCHwI+UvXvReCGVayxdDcDB5qW7d2pmcrMi5r+fnognrtFBzVN1wzPzB8AJ68ZriVk5qPAt162eguwq7q9C7hmRYvqE5n5QmZ+qbr9HRovmBuwf8uSDfPV4quq/xK4HLivWm//lhARG4HNwMer5cDe1TUQz93Sg9prhnfHaGa+AI0wAs5Z5XqKFxFjwC8Aj2P/lq06dfsEcAzYA/wncDwzF6ohPoeX9lHgA8CPquWzsHenIoHPR8T+6hLVMCDP3Z79HXWXxCLr/Hsy9VREjACfBf4oM7/dmNhoOTLzh8BFEbEO+BzwxsWGrWxV5YuIq4Fjmbk/IiZPrl5kqL1b2mWZ+XxEnAPsiYivrnZB3VL6jLrtNcO1LEcj4lyA6uexVa6nWBHxKhohfXdm3l+ttn+nKDOPA7M03utfFxEnJwU+hxd3GfCOiDhE4y2+y2nMsO3dMmXm89XPYzR+SbyEAXnulh7UXjO8Ox4Etla3twIPrGItxareE7wTOJCZH27aZP+WISJeV82kiYi1wFtpvM//CPDOapj9W0Rm3pqZGzNzjMbr3D9l5nXYu2WJiNMj4idO3gZ+HXiSAXnuFn9lsoi4isZvlievGX77KpdUtIi4B5ik8RVvR4HbgL8D7gV+GngOuDYzX/6Bs6EXEb8C/Aswx/+/T/hBGu9T2782IuJNND6wcxqNScC9mflnEfGzNGaJ64EvA7+Vmd9fvUrLVp36fn9mXm3vlqfq0+eqxTXApzLz9og4iwF47hYf1JIkDbPST31LkjTUDGpJkgpmUEuSVDCDWpKkghnUkiQVzKCWJKlgBrUkSQX7X6yswXm5vaZhAAAAAElFTkSuQmCC\n", 237 | "text/plain": [ 238 | "" 239 | ] 240 | }, 241 | "metadata": { 242 | "needs_background": "light" 243 | }, 244 | "output_type": "display_data" 245 | } 246 | ], 247 | "source": [ 248 | "%matplotlib inline\n", 249 | "import matplotlib.pyplot as plt\n", 250 | "data_analysis.salary.hist(bins=50, figsize=(8,5))\n", 251 | "\n", 252 | "plt.show()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "- 薪资主要分布在**5k-30k**之间" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 14, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "(42, 22)" 271 | ] 272 | }, 273 | "execution_count": 14, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "data_analysis[data_analysis.salary>30].shape" 280 | ] 
281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 15, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "(22, 22)" 291 | ] 292 | }, 293 | "execution_count": 15, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "data_analysis[data_analysis.salary<5].shape" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 16, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "data_analysis = data_analysis[data_analysis['salary']<30]\n", 309 | "data_analysis = data_analysis[data_analysis['salary']>5]" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "---" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 17, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/html": [ 327 | "
\n", 328 | "\n", 341 | "\n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | "
Unnamed: 0Keyword公司名称公司规模地区学历要求工作经验职位名称职位描述salary...SqlPythonExcelSasSpssHiveHadoopPptTableauSpark
01数据分析上海恒奕集团500-999人上海 普陀区 武宁路大专1-3年数据分析岗位职责:通过抓取第三方数据来分析在不同地区,我们医院项目的目标人群画像分析,实时网络热度盘...10.0...0000000000
12数据分析晶赞科技100-499人上海 静安区 共和新路本科3-5年数据分析工作职责:1、负责政务数据/商业数据的研究和分析。例如,通过政务公开数据的挖掘和分析,形成主...18.0...0111100000
\n", 419 | "

2 rows × 22 columns

\n", 420 | "
" 421 | ], 422 | "text/plain": [ 423 | " Unnamed: 0 Keyword 公司名称 公司规模 地区 学历要求 工作经验 职位名称 \\\n", 424 | "0 1 数据分析 上海恒奕集团 500-999人 上海 普陀区 武宁路 大专 1-3年 数据分析 \n", 425 | "1 2 数据分析 晶赞科技 100-499人 上海 静安区 共和新路 本科 3-5年 数据分析 \n", 426 | "\n", 427 | " 职位描述 salary ... Sql \\\n", 428 | "0 岗位职责:通过抓取第三方数据来分析在不同地区,我们医院项目的目标人群画像分析,实时网络热度盘... 10.0 ... 0 \n", 429 | "1 工作职责:1、负责政务数据/商业数据的研究和分析。例如,通过政务公开数据的挖掘和分析,形成主... 18.0 ... 0 \n", 430 | "\n", 431 | " Python Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n", 432 | "0 0 0 0 0 0 0 0 0 0 \n", 433 | "1 1 1 1 1 0 0 0 0 0 \n", 434 | "\n", 435 | "[2 rows x 22 columns]" 436 | ] 437 | }, 438 | "execution_count": 17, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "data_analysis.head(2)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 18, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "data_analysis = data_analysis.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 19, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "data_mining = data_mining.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 20, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "machine_learning = machine_learning.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 21, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "business_analysis = business_analysis.drop(['Unnamed: 0','Keyword','职位描述','职位薪资'],axis=1)" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "---" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "## 掌握的软件技能对薪资的影响关系" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | 
"execution_count": 22, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "salary 1.000000\n", 506 | "Python 0.249000\n", 507 | "Hive 0.248617\n", 508 | "Sql 0.248049\n", 509 | "Spark 0.185153\n", 510 | "Sas 0.164862\n", 511 | "Hadoop 0.159602\n", 512 | "Spss 0.071946\n", 513 | "Tableau 0.068340\n", 514 | "Ppt -0.052048\n", 515 | "Excel -0.077646\n", 516 | "Name: salary, dtype: float64" 517 | ] 518 | }, 519 | "execution_count": 22, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [ 525 | "corr_matrix = data_analysis.corr()\n", 526 | "corr_matrix[\"salary\"].sort_values(ascending=False)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "- **Data Analysis**的职位中,`Hive`,`Spark`,`Hadoop`大数据应用方面的软件是**薪资的加分项**。\n", 534 | "- 同时,`Python`,`SQL`,`SAS`,`Tableau`,`SPSS`等统计分析软件与可视化软件也是数据分析师**区别于低薪分析专员**的因素。\n", 535 | "- `PPT`,`Excel`作为必须的软件技能,对薪资变化**并没有太大的影响**,甚至仅仅会Excel的职位沦落为专员,会是一个减分项。\n", 536 | "- 结论:在数据分析领域,拥有**大数据软件技能**并且懂得**Python**这一编程语言的分析师的待遇较好。" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 23, 542 | "metadata": {}, 543 | "outputs": [ 544 | { 545 | "data": { 546 | "text/plain": [ 547 | "salary 1.000000\n", 548 | "Spark 0.199728\n", 549 | "Java 0.196519\n", 550 | "Hive 0.185769\n", 551 | "C 0.159396\n", 552 | "Hadoop 0.155124\n", 553 | "Python 0.102946\n", 554 | "Shell 0.037366\n", 555 | "Linux 0.005632\n", 556 | "Sql -0.050092\n", 557 | "Sas -0.072597\n", 558 | "Name: salary, dtype: float64" 559 | ] 560 | }, 561 | "execution_count": 23, 562 | "metadata": {}, 563 | "output_type": "execute_result" 564 | } 565 | ], 566 | "source": [ 567 | "corr_matrix = data_mining.corr()\n", 568 | "corr_matrix[\"salary\"].sort_values(ascending=False)" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "- **Data 
Mining**的职位中,`Hive`,`Spark`,`Hadoop`大数据方面的软件是薪资**极大的加分项**。\n", 576 | "- `Java`,`C`,`Python`等编程语言对数据挖掘的工作有很大帮助因此也体现在了对薪资的**正面影响**上。\n", 577 | "- 分析结论:具备**数据挖掘算法与编码能力**且具备**大数据方面分析技能**的数据挖掘工程师的待遇较好。" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 24, 583 | "metadata": { 584 | "scrolled": true 585 | }, 586 | "outputs": [ 587 | { 588 | "data": { 589 | "text/plain": [ 590 | "salary 1.000000\n", 591 | "Spark 0.144507\n", 592 | "Hive 0.116132\n", 593 | "Hadoop 0.109608\n", 594 | "Java 0.088857\n", 595 | "Tensorflow 0.072449\n", 596 | "Sql -0.022844\n", 597 | "C -0.032998\n", 598 | "Python -0.054629\n", 599 | "Linux -0.058181\n", 600 | "Matlab -0.062595\n", 601 | "Name: salary, dtype: float64" 602 | ] 603 | }, 604 | "execution_count": 24, 605 | "metadata": {}, 606 | "output_type": "execute_result" 607 | } 608 | ], 609 | "source": [ 610 | "corr_matrix = machine_learning.corr()\n", 611 | "corr_matrix[\"salary\"].sort_values(ascending=False)" 612 | ] 613 | }, 614 | { 615 | "cell_type": "markdown", 616 | "metadata": {}, 617 | "source": [ 618 | "- **Machine Learning**的职位中,没有特别突出的技能加分项,列表中的软件技能基本都是入职必备的技能。\n", 619 | "- `Hive`,`Spark`,`Hadoop`等大数据方面的技能会对薪资有一定程度的提升,不过影响较小。\n", 620 | "- 分析结论:机器学习工程师入门难度稍高,需要掌握具备的软件技能也较多,没有特别突出的薪资加分项。" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 25, 626 | "metadata": {}, 627 | "outputs": [ 628 | { 629 | "data": { 630 | "text/plain": [ 631 | "salary 1.000000\n", 632 | "Python 0.394292\n", 633 | "C 0.371366\n", 634 | "Java 0.244305\n", 635 | "Linux 0.242700\n", 636 | "Hive 0.168359\n", 637 | "Sql 0.107485\n", 638 | "Sas 0.078156\n", 639 | "Excel 0.068413\n", 640 | "Ppt -0.019429\n", 641 | "Spss -0.091822\n", 642 | "Name: salary, dtype: float64" 643 | ] 644 | }, 645 | "execution_count": 25, 646 | "metadata": {}, 647 | "output_type": "execute_result" 648 | } 649 | ], 650 | "source": [ 651 | "corr_matrix = business_analysis.corr()\n", 652 | 
"corr_matrix[\"salary\"].sort_values(ascending=False)" 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": {}, 658 | "source": [ 659 | "- **Business Analysis**的职位中,编程语言是**极大的薪资加分项**。如`C`,`Python`,`Java`。\n", 660 | "- `Excel`,`PPT`,`SPSS`等软件是这个职位的**必备技能**,因此对职位薪资没有太大的影响。\n", 661 | "- 结论:在商业分析领域,拥有**商业分析思维**并且具有**编程能力**的分析师的待遇较好。" 662 | ] 663 | }, 664 | { 665 | "cell_type": "markdown", 666 | "metadata": {}, 667 | "source": [ 668 | "---" 669 | ] 670 | }, 671 | { 672 | "cell_type": "markdown", 673 | "metadata": {}, 674 | "source": [ 675 | "# 模型方法选择" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "
\n",
 683 |     "线性回归\n",
 684 |     "决策树回归\n",
 685 |     "随机森林回归\n",
 686 |     "KNN回归\n",
 687 |     "Adaboost回归\n",
 688 |     "GBRT回归\n",
 689 |     "Bagging回归\n",
 690 |     "ExtraTree极端随机树回归\n",
 691 |     "
" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "---" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "# 准备数据" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 26, 711 | "metadata": {}, 712 | "outputs": [ 713 | { 714 | "data": { 715 | "text/html": [ 716 | "
\n", 717 | "\n", 730 | "\n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | "
公司名称公司规模地区学历要求工作经验职位名称salary融资情况SqlPythonExcelSasSpssHiveHadoopPptTableauSpark
0上海恒奕集团500-999人上海 普陀区 武宁路大专1-3年数据分析10.0A轮0000000000
1晶赞科技100-499人上海 静安区 共和新路本科3-5年数据分析18.0D轮及以上0111100000
\n", 799 | "
" 800 | ], 801 | "text/plain": [ 802 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 salary 融资情况 Sql Python \\\n", 803 | "0 上海恒奕集团 500-999人 上海 普陀区 武宁路 大专 1-3年 数据分析 10.0 A轮 0 0 \n", 804 | "1 晶赞科技 100-499人 上海 静安区 共和新路 本科 3-5年 数据分析 18.0 D轮及以上 0 1 \n", 805 | "\n", 806 | " Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n", 807 | "0 0 0 0 0 0 0 0 0 \n", 808 | "1 1 1 1 0 0 0 0 0 " 809 | ] 810 | }, 811 | "execution_count": 26, 812 | "metadata": {}, 813 | "output_type": "execute_result" 814 | } 815 | ], 816 | "source": [ 817 | "data_analysis.head(2)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": 27, 823 | "metadata": {}, 824 | "outputs": [], 825 | "source": [ 826 | "from sklearn.model_selection import train_test_split\n", 827 | "\n", 828 | "train_set, test_set = train_test_split(data_analysis, test_size=0.2, random_state=42)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 28, 834 | "metadata": {}, 835 | "outputs": [], 836 | "source": [ 837 | "data_train = train_set.copy()\n", 838 | "data_test = test_set.copy()" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 29, 844 | "metadata": {}, 845 | "outputs": [ 846 | { 847 | "data": { 848 | "text/plain": [ 849 | "(737, 18)" 850 | ] 851 | }, 852 | "execution_count": 29, 853 | "metadata": {}, 854 | "output_type": "execute_result" 855 | } 856 | ], 857 | "source": [ 858 | "data_train.shape" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 30, 864 | "metadata": {}, 865 | "outputs": [ 866 | { 867 | "data": { 868 | "text/plain": [ 869 | "(185, 18)" 870 | ] 871 | }, 872 | "execution_count": 30, 873 | "metadata": {}, 874 | "output_type": "execute_result" 875 | } 876 | ], 877 | "source": [ 878 | "data_test.shape" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": 31, 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [ 887 | "from sklearn.pipeline import Pipeline\n", 888 | "from sklearn.preprocessing import 
StandardScaler\n", 889 | "from sklearn.preprocessing import Imputer\n", 890 | "from sklearn.compose import ColumnTransformer\n", 891 | "from sklearn.preprocessing import OneHotEncoder" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": 32, 897 | "metadata": {}, 898 | "outputs": [ 899 | { 900 | "data": { 901 | "text/html": [ 902 | "
\n", 903 | "\n", 916 | "\n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | "
公司名称公司规模地区学历要求工作经验职位名称salary融资情况SqlPythonExcelSasSpssHiveHadoopPptTableauSpark
0上海恒奕集团500-999人上海 普陀区 武宁路大专1-3年数据分析10.0A轮0000000000
\n", 964 | "
" 965 | ], 966 | "text/plain": [ 967 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 salary 融资情况 Sql Python \\\n", 968 | "0 上海恒奕集团 500-999人 上海 普陀区 武宁路 大专 1-3年 数据分析 10.0 A轮 0 0 \n", 969 | "\n", 970 | " Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n", 971 | "0 0 0 0 0 0 0 0 0 " 972 | ] 973 | }, 974 | "execution_count": 32, 975 | "metadata": {}, 976 | "output_type": "execute_result" 977 | } 978 | ], 979 | "source": [ 980 | "data_analysis.head(1)" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 33, 986 | "metadata": {}, 987 | "outputs": [], 988 | "source": [ 989 | "data_analysis_num = data_analysis.drop(['公司名称','公司规模','地区','学历要求','工作经验','职位名称','融资情况','salary'], axis=1)\n", 990 | "num_attribs = list(data_analysis_num)\n", 991 | "cat_attribs = ['公司规模','学历要求','工作经验']\n", 992 | "\n", 993 | "num_pipeline = Pipeline([\n", 994 | " ('std_scaler', StandardScaler()),\n", 995 | " ])\n", 996 | "\n", 997 | "full_pipeline = ColumnTransformer([\n", 998 | " (\"num\", num_pipeline, num_attribs),\n", 999 | " (\"cat\", OneHotEncoder(), cat_attribs),\n", 1000 | " ])\n", 1001 | "\n", 1002 | "data_analysis_prepared = full_pipeline.fit_transform(data_train)\n", 1003 | "data_analysis_test = full_pipeline.transform(data_test)" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": 34, 1009 | "metadata": { 1010 | "scrolled": true 1011 | }, 1012 | "outputs": [ 1013 | { 1014 | "data": { 1015 | "text/plain": [ 1016 | "array([[ 1.01779743, 1.15424368, -0.70135785, -0.56638197, 2.02058373,\n", 1017 | " -0.40306397, -0.36822985, -0.36584991, -0.33908304, -0.26977806,\n", 1018 | " 1. , 0. , 0. , 0. , 0. ,\n", 1019 | " 0. , 0. , 0. , 0. , 1. ,\n", 1020 | " 0. , 0. , 0. , 0. , 1. ,\n", 1021 | " 0. , 0. , 0. 
]])" 1022 | ] 1023 | }, 1024 | "execution_count": 34, 1025 | "metadata": {}, 1026 | "output_type": "execute_result" 1027 | } 1028 | ], 1029 | "source": [ 1030 | "data_analysis_prepared[:1]" 1031 | ] 1032 | }, 1033 | { 1034 | "cell_type": "code", 1035 | "execution_count": 35, 1036 | "metadata": {}, 1037 | "outputs": [ 1038 | { 1039 | "data": { 1040 | "text/html": [ 1041 | "
\n", 1042 | "\n", 1055 | "\n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | "
公司名称公司规模地区学历要求工作经验职位名称salary融资情况SqlPythonExcelSasSpssHiveHadoopPptTableauSpark
527上海兴致0-20人上海 浦东新区 八佰伴本科3-5年高级数据分析师22.50-20人1100100000
\n", 1103 | "
" 1104 | ], 1105 | "text/plain": [ 1106 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 salary 融资情况 Sql Python \\\n", 1107 | "527 上海兴致 0-20人 上海 浦东新区 八佰伴 本科 3-5年 高级数据分析师 22.5 0-20人 1 1 \n", 1108 | "\n", 1109 | " Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n", 1110 | "527 0 0 1 0 0 0 0 0 " 1111 | ] 1112 | }, 1113 | "execution_count": 35, 1114 | "metadata": {}, 1115 | "output_type": "execute_result" 1116 | } 1117 | ], 1118 | "source": [ 1119 | "data_train.head(1)" 1120 | ] 1121 | }, 1122 | { 1123 | "cell_type": "code", 1124 | "execution_count": 36, 1125 | "metadata": {}, 1126 | "outputs": [], 1127 | "source": [ 1128 | "data_analysis_labels = data_train.salary.values\n", 1129 | "test_labels = data_test.salary.values" 1130 | ] 1131 | }, 1132 | { 1133 | "cell_type": "markdown", 1134 | "metadata": {}, 1135 | "source": [ 1136 | "---" 1137 | ] 1138 | }, 1139 | { 1140 | "cell_type": "markdown", 1141 | "metadata": {}, 1142 | "source": [ 1143 | "# 训练模型" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "markdown", 1148 | "metadata": {}, 1149 | "source": [ 1150 | "## 线性回归" 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "code", 1155 | "execution_count": 37, 1156 | "metadata": {}, 1157 | "outputs": [ 1158 | { 1159 | "data": { 1160 | "text/plain": [ 1161 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n", 1162 | " normalize=False)" 1163 | ] 1164 | }, 1165 | "execution_count": 37, 1166 | "metadata": {}, 1167 | "output_type": "execute_result" 1168 | } 1169 | ], 1170 | "source": [ 1171 | "from sklearn.linear_model import LinearRegression\n", 1172 | "\n", 1173 | "lin_reg = LinearRegression()\n", 1174 | "lin_reg.fit(data_analysis_prepared, data_analysis_labels)" 1175 | ] 1176 | }, 1177 | { 1178 | "cell_type": "code", 1179 | "execution_count": 38, 1180 | "metadata": {}, 1181 | "outputs": [ 1182 | { 1183 | "data": { 1184 | "text/plain": [ 1185 | "4.209936980782373" 1186 | ] 1187 | }, 1188 | "execution_count": 38, 1189 | "metadata": {}, 1190 | "output_type": "execute_result" 1191 | } 
1192 | ], 1193 | "source": [ 1194 | "from sklearn.metrics import mean_squared_error\n", 1195 | "import numpy as np\n", 1196 | "\n", 1197 | "salary_predictions = lin_reg.predict(data_analysis_prepared)\n", 1198 | "lin_mse = mean_squared_error(data_analysis_labels, salary_predictions)\n", 1199 | "lin_rmse = np.sqrt(lin_mse)\n", 1200 | "lin_rmse" 1201 | ] 1202 | }, 1203 | { 1204 | "cell_type": "code", 1205 | "execution_count": 39, 1206 | "metadata": {}, 1207 | "outputs": [], 1208 | "source": [ 1209 | "#salary_predictions[:10]" 1210 | ] 1211 | }, 1212 | { 1213 | "cell_type": "markdown", 1214 | "metadata": {}, 1215 | "source": [ 1216 | "### 测试集" 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "code", 1221 | "execution_count": 40, 1222 | "metadata": {}, 1223 | "outputs": [], 1224 | "source": [ 1225 | "#data_test.head(10)" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "code", 1230 | "execution_count": 41, 1231 | "metadata": {}, 1232 | "outputs": [ 1233 | { 1234 | "data": { 1235 | "text/plain": [ 1236 | "array([19.13476562, 16.91992188, 14.8984375 , 14.0546875 , 20.76367188,\n", 1237 | " 12.19921875, 18.13671875, 16.45507812, 20.40917969, 19.90820312])" 1238 | ] 1239 | }, 1240 | "execution_count": 41, 1241 | "metadata": {}, 1242 | "output_type": "execute_result" 1243 | } 1244 | ], 1245 | "source": [ 1246 | "y_test = lin_reg.predict(data_analysis_test)\n", 1247 | "y_test[:10]" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "code", 1252 | "execution_count": 42, 1253 | "metadata": {}, 1254 | "outputs": [ 1255 | { 1256 | "data": { 1257 | "text/plain": [ 1258 | "array([20.5, 13. , 11.5, 12.5, 15. , 12.5, 22.5, 15. 
, 22.5, 15.5])" 1259 | ] 1260 | }, 1261 | "execution_count": 42, 1262 | "metadata": {}, 1263 | "output_type": "execute_result" 1264 | } 1265 | ], 1266 | "source": [ 1267 | "test_labels[:10]" 1268 | ] 1269 | }, 1270 | { 1271 | "cell_type": "code", 1272 | "execution_count": 43, 1273 | "metadata": {}, 1274 | "outputs": [ 1275 | { 1276 | "data": { 1277 | "text/plain": [ 1278 | "4.252707451377156" 1279 | ] 1280 | }, 1281 | "execution_count": 43, 1282 | "metadata": {}, 1283 | "output_type": "execute_result" 1284 | } 1285 | ], 1286 | "source": [ 1287 | "lin_mse = mean_squared_error(test_labels, y_test)\n", 1288 | "lin_rmse = np.sqrt(lin_mse)\n", 1289 | "lin_rmse" 1290 | ] 1291 | }, 1292 | { 1293 | "cell_type": "markdown", 1294 | "metadata": {}, 1295 | "source": [ 1296 | "- 测试集上误差约为**4.25**" 1297 | ] 1298 | }, 1299 | { 1300 | "cell_type": "markdown", 1301 | "metadata": {}, 1302 | "source": [ 1303 | "### 交叉验证" 1304 | ] 1305 | }, 1306 | { 1307 | "cell_type": "code", 1308 | "execution_count": 44, 1309 | "metadata": {}, 1310 | "outputs": [], 1311 | "source": [ 1312 | "from sklearn.model_selection import cross_val_score\n", 1313 | "\n", 1314 | "scores = cross_val_score(lin_reg, data_analysis_prepared, data_analysis_labels,\n", 1315 | " scoring=\"neg_mean_squared_error\", cv=10)\n", 1316 | "lin_rmse_scores = np.sqrt(-scores)" 1317 | ] 1318 | }, 1319 | { 1320 | "cell_type": "code", 1321 | "execution_count": 45, 1322 | "metadata": {}, 1323 | "outputs": [ 1324 | { 1325 | "name": "stdout", 1326 | "output_type": "stream", 1327 | "text": [ 1328 | "Scores: [4.54552557 4.54182215 3.94087967 4.42840937 4.4373358 4.62011098\n", 1329 | " 4.22660386 4.38725655 4.42436899 4.60889902]\n", 1330 | "Mean: 4.41612119574774\n", 1331 | "Standard deviation: 0.1935445501197603\n" 1332 | ] 1333 | } 1334 | ], 1335 | "source": [ 1336 | "def display_scores(scores):\n", 1337 | " print(\"Scores:\", scores)\n", 1338 | " print(\"Mean:\", scores.mean())\n", 1339 | " print(\"Standard deviation:\", 
scores.std())\n", 1340 | "\n", 1341 | "display_scores(lin_rmse_scores)" 1342 | ] 1343 | }, 1344 | { 1345 | "cell_type": "markdown", 1346 | "metadata": {}, 1347 | "source": [ 1348 | "---" 1349 | ] 1350 | }, 1351 | { 1352 | "cell_type": "markdown", 1353 | "metadata": {}, 1354 | "source": [ 1355 | "## 决策树回归" 1356 | ] 1357 | }, 1358 | { 1359 | "cell_type": "markdown", 1360 | "metadata": {}, 1361 | "source": [ 1362 | "### 建模训练" 1363 | ] 1364 | }, 1365 | { 1366 | "cell_type": "code", 1367 | "execution_count": 46, 1368 | "metadata": {}, 1369 | "outputs": [ 1370 | { 1371 | "data": { 1372 | "text/plain": [ 1373 | "DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n", 1374 | " max_leaf_nodes=None, min_impurity_decrease=0.0,\n", 1375 | " min_impurity_split=None, min_samples_leaf=1,\n", 1376 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 1377 | " presort=False, random_state=42, splitter='best')" 1378 | ] 1379 | }, 1380 | "execution_count": 46, 1381 | "metadata": {}, 1382 | "output_type": "execute_result" 1383 | } 1384 | ], 1385 | "source": [ 1386 | "from sklearn.tree import DecisionTreeRegressor\n", 1387 | "\n", 1388 | "tree_reg = DecisionTreeRegressor(random_state=42)\n", 1389 | "tree_reg.fit(data_analysis_prepared, data_analysis_labels)" 1390 | ] 1391 | }, 1392 | { 1393 | "cell_type": "code", 1394 | "execution_count": 47, 1395 | "metadata": {}, 1396 | "outputs": [], 1397 | "source": [ 1398 | "y_pred_tree = tree_reg.predict(data_analysis_prepared)" 1399 | ] 1400 | }, 1401 | { 1402 | "cell_type": "code", 1403 | "execution_count": 48, 1404 | "metadata": {}, 1405 | "outputs": [ 1406 | { 1407 | "data": { 1408 | "text/plain": [ 1409 | "2.404700282979215" 1410 | ] 1411 | }, 1412 | "execution_count": 48, 1413 | "metadata": {}, 1414 | "output_type": "execute_result" 1415 | } 1416 | ], 1417 | "source": [ 1418 | "from sklearn.metrics import mean_squared_error\n", 1419 | "\n", 1420 | "tree_mse = mean_squared_error(data_analysis_labels, 
y_pred_tree)\n", 1421 | "tree_rmse = np.sqrt(tree_mse)\n", 1422 | "tree_rmse" 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "markdown", 1427 | "metadata": {}, 1428 | "source": [ 1429 | "### 测试集" 1430 | ] 1431 | }, 1432 | { 1433 | "cell_type": "code", 1434 | "execution_count": 49, 1435 | "metadata": {}, 1436 | "outputs": [ 1437 | { 1438 | "data": { 1439 | "text/plain": [ 1440 | "array([17.5 , 16.54166667, 14.75 , 8. , 18.75 ,\n", 1441 | " 16.5 , 17.5 , 7. , 17.125 , 18.83333333])" 1442 | ] 1443 | }, 1444 | "execution_count": 49, 1445 | "metadata": {}, 1446 | "output_type": "execute_result" 1447 | } 1448 | ], 1449 | "source": [ 1450 | "y_test = tree_reg.predict(data_analysis_test)\n", 1451 | "y_test[:10]" 1452 | ] 1453 | }, 1454 | { 1455 | "cell_type": "code", 1456 | "execution_count": 50, 1457 | "metadata": {}, 1458 | "outputs": [ 1459 | { 1460 | "data": { 1461 | "text/plain": [ 1462 | "array([20.5, 13. , 11.5, 12.5, 15. , 12.5, 22.5, 15. , 22.5, 15.5])" 1463 | ] 1464 | }, 1465 | "execution_count": 50, 1466 | "metadata": {}, 1467 | "output_type": "execute_result" 1468 | } 1469 | ], 1470 | "source": [ 1471 | "test_labels[:10]" 1472 | ] 1473 | }, 1474 | { 1475 | "cell_type": "code", 1476 | "execution_count": 51, 1477 | "metadata": { 1478 | "scrolled": true 1479 | }, 1480 | "outputs": [ 1481 | { 1482 | "data": { 1483 | "text/plain": [ 1484 | "5.585045537872495" 1485 | ] 1486 | }, 1487 | "execution_count": 51, 1488 | "metadata": {}, 1489 | "output_type": "execute_result" 1490 | } 1491 | ], 1492 | "source": [ 1493 | "tree_mse = mean_squared_error(test_labels, y_test)\n", 1494 | "tree_rmse = np.sqrt(tree_mse)\n", 1495 | "tree_rmse" 1496 | ] 1497 | }, 1498 | { 1499 | "cell_type": "markdown", 1500 | "metadata": {}, 1501 | "source": [ 1502 | "- 测试集上误差约为**5.59**" 1503 | ] 1504 | }, 1505 | { 1506 | "cell_type": "markdown", 1507 | "metadata": {}, 1508 | "source": [ 1509 | "---" 1510 | ] 1511 | }, 1512 | { 1513 | "cell_type": "markdown", 1514 | "metadata": {}, 1515 | "source": 
[ 1516 | "## Random Forest 随机森林回归" 1517 | ] 1518 | }, 1519 | { 1520 | "cell_type": "markdown", 1521 | "metadata": {}, 1522 | "source": [ 1523 | "### 建模训练" 1524 | ] 1525 | }, 1526 | { 1527 | "cell_type": "code", 1528 | "execution_count": 52, 1529 | "metadata": {}, 1530 | "outputs": [ 1531 | { 1532 | "data": { 1533 | "text/plain": [ 1534 | "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", 1535 | " max_features='auto', max_leaf_nodes=None,\n", 1536 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 1537 | " min_samples_leaf=1, min_samples_split=2,\n", 1538 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n", 1539 | " oob_score=False, random_state=52, verbose=0, warm_start=False)" 1540 | ] 1541 | }, 1542 | "execution_count": 52, 1543 | "metadata": {}, 1544 | "output_type": "execute_result" 1545 | } 1546 | ], 1547 | "source": [ 1548 | "from sklearn.ensemble import RandomForestRegressor\n", 1549 | "\n", 1550 | "forest_reg = RandomForestRegressor(random_state=52)\n", 1551 | "forest_reg.fit(data_analysis_prepared, data_analysis_labels)" 1552 | ] 1553 | }, 1554 | { 1555 | "cell_type": "code", 1556 | "execution_count": 53, 1557 | "metadata": {}, 1558 | "outputs": [ 1559 | { 1560 | "data": { 1561 | "text/plain": [ 1562 | "2.825121630127617" 1563 | ] 1564 | }, 1565 | "execution_count": 53, 1566 | "metadata": {}, 1567 | "output_type": "execute_result" 1568 | } 1569 | ], 1570 | "source": [ 1571 | "y_pred_rf = forest_reg.predict(data_analysis_prepared)\n", 1572 | "forest_mse = mean_squared_error(data_analysis_labels, y_pred_rf)\n", 1573 | "forest_rmse = np.sqrt(forest_mse)\n", 1574 | "forest_rmse" 1575 | ] 1576 | }, 1577 | { 1578 | "cell_type": "markdown", 1579 | "metadata": {}, 1580 | "source": [ 1581 | "### 测试集验证" 1582 | ] 1583 | }, 1584 | { 1585 | "cell_type": "code", 1586 | "execution_count": 54, 1587 | "metadata": {}, 1588 | "outputs": [], 1589 | "source": [ 1590 | "#data_test[:10]" 1591 | ] 1592 | }, 1593 | { 1594 | 
"cell_type": "code", 1595 | "execution_count": 55, 1596 | "metadata": {}, 1597 | "outputs": [ 1598 | { 1599 | "data": { 1600 | "text/plain": [ 1601 | "array([17.55 , 16.76213231, 14.82916667, 11.5 , 23.55 ,\n", 1602 | " 14.2 , 16.40416667, 12.9375 , 18.65738095, 20.11666667])" 1603 | ] 1604 | }, 1605 | "execution_count": 55, 1606 | "metadata": {}, 1607 | "output_type": "execute_result" 1608 | } 1609 | ], 1610 | "source": [ 1611 | "y_test = forest_reg.predict(data_analysis_test)\n", 1612 | "y_test[:10]" 1613 | ] 1614 | }, 1615 | { 1616 | "cell_type": "code", 1617 | "execution_count": 56, 1618 | "metadata": {}, 1619 | "outputs": [ 1620 | { 1621 | "data": { 1622 | "text/plain": [ 1623 | "array([20.5, 13. , 11.5, 12.5, 15. , 12.5, 22.5, 15. , 22.5, 15.5])" 1624 | ] 1625 | }, 1626 | "execution_count": 56, 1627 | "metadata": {}, 1628 | "output_type": "execute_result" 1629 | } 1630 | ], 1631 | "source": [ 1632 | "test_labels[:10]" 1633 | ] 1634 | }, 1635 | { 1636 | "cell_type": "code", 1637 | "execution_count": 57, 1638 | "metadata": {}, 1639 | "outputs": [ 1640 | { 1641 | "data": { 1642 | "text/plain": [ 1643 | "4.53113932085526" 1644 | ] 1645 | }, 1646 | "execution_count": 57, 1647 | "metadata": {}, 1648 | "output_type": "execute_result" 1649 | } 1650 | ], 1651 | "source": [ 1652 | "forest_mse = mean_squared_error(test_labels, y_test)\n", 1653 | "forest_rmse = np.sqrt(forest_mse)\n", 1654 | "forest_rmse" 1655 | ] 1656 | }, 1657 | { 1658 | "cell_type": "markdown", 1659 | "metadata": {}, 1660 | "source": [ 1661 | "- 测试集上误差约为**4.53**" 1662 | ] 1663 | }, 1664 | { 1665 | "cell_type": "markdown", 1666 | "metadata": {}, 1667 | "source": [ 1668 | "### 交叉验证" 1669 | ] 1670 | }, 1671 | { 1672 | "cell_type": "code", 1673 | "execution_count": 58, 1674 | "metadata": {}, 1675 | "outputs": [], 1676 | "source": [ 1677 | "from sklearn.model_selection import cross_val_score\n", 1678 | "\n", 1679 | "scores = cross_val_score(forest_reg, data_analysis_prepared, data_analysis_labels,\n", 1680 
| " scoring=\"neg_mean_squared_error\", cv=10)\n", 1681 | "forest_rmse_scores = np.sqrt(-scores)" 1682 | ] 1683 | }, 1684 | { 1685 | "cell_type": "code", 1686 | "execution_count": 59, 1687 | "metadata": {}, 1688 | "outputs": [ 1689 | { 1690 | "name": "stdout", 1691 | "output_type": "stream", 1692 | "text": [ 1693 | "Scores: [4.49936847 5.16831172 3.87212271 5.10449212 4.76825042 4.95531615\n", 1694 | " 4.47363957 5.06157526 4.29351527 4.56866847]\n", 1695 | "Mean: 4.676526015420327\n", 1696 | "Standard deviation: 0.3920907106938965\n" 1697 | ] 1698 | } 1699 | ], 1700 | "source": [ 1701 | "display_scores(forest_rmse_scores)" 1702 | ] 1703 | }, 1704 | { 1705 | "cell_type": "markdown", 1706 | "metadata": {}, 1707 | "source": [ 1708 | "---" 1709 | ] 1710 | }, 1711 | { 1712 | "cell_type": "markdown", 1713 | "metadata": {}, 1714 | "source": [ 1715 | "## KNN回归" 1716 | ] 1717 | }, 1718 | { 1719 | "cell_type": "code", 1720 | "execution_count": 60, 1721 | "metadata": {}, 1722 | "outputs": [ 1723 | { 1724 | "data": { 1725 | "text/plain": [ 1726 | "KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n", 1727 | " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", 1728 | " weights='uniform')" 1729 | ] 1730 | }, 1731 | "execution_count": 60, 1732 | "metadata": {}, 1733 | "output_type": "execute_result" 1734 | } 1735 | ], 1736 | "source": [ 1737 | "from sklearn.neighbors import KNeighborsRegressor\n", 1738 | "k = 5\n", 1739 | "knn_reg = KNeighborsRegressor(k)\n", 1740 | "knn_reg.fit(data_analysis_prepared, data_analysis_labels)" 1741 | ] 1742 | }, 1743 | { 1744 | "cell_type": "code", 1745 | "execution_count": 61, 1746 | "metadata": {}, 1747 | "outputs": [ 1748 | { 1749 | "data": { 1750 | "text/plain": [ 1751 | "4.181154008963998" 1752 | ] 1753 | }, 1754 | "execution_count": 61, 1755 | "metadata": {}, 1756 | "output_type": "execute_result" 1757 | } 1758 | ], 1759 | "source": [ 1760 | "y_pred_knn = knn_reg.predict(data_analysis_prepared)\n", 1761 | "knn_mse 
= mean_squared_error(data_analysis_labels, y_pred_knn)\n", 1762 | "knn_rmse = np.sqrt(knn_mse)\n", 1763 | "knn_rmse" 1764 | ] 1765 | }, 1766 | { 1767 | "cell_type": "markdown", 1768 | "metadata": {}, 1769 | "source": [ 1770 | "### 交叉验证" 1771 | ] 1772 | }, 1773 | { 1774 | "cell_type": "code", 1775 | "execution_count": 62, 1776 | "metadata": {}, 1777 | "outputs": [ 1778 | { 1779 | "name": "stdout", 1780 | "output_type": "stream", 1781 | "text": [ 1782 | "Scores: [4.91189951 5.22354338 4.69300803 5.17542269 5.59779155 6.05515193\n", 1783 | " 5.07427268 5.46580904 4.81125962 4.8222089 ]\n", 1784 | "Mean: 5.183036732034127\n", 1785 | "Standard deviation: 0.40101789362628404\n" 1786 | ] 1787 | } 1788 | ], 1789 | "source": [ 1790 | "from sklearn.model_selection import cross_val_score\n", 1791 | "\n", 1792 | "scores = cross_val_score(knn_reg, data_analysis_prepared, data_analysis_labels,\n", 1793 | " scoring=\"neg_mean_squared_error\", cv=10)\n", 1794 | "knn_rmse_scores = np.sqrt(-scores)\n", 1795 | "\n", 1796 | "\n", 1797 | "display_scores(knn_rmse_scores)" 1798 | ] 1799 | }, 1800 | { 1801 | "cell_type": "markdown", 1802 | "metadata": {}, 1803 | "source": [ 1804 | "### 测试集验证" 1805 | ] 1806 | }, 1807 | { 1808 | "cell_type": "code", 1809 | "execution_count": 63, 1810 | "metadata": {}, 1811 | "outputs": [ 1812 | { 1813 | "name": "stdout", 1814 | "output_type": "stream", 1815 | "text": [ 1816 | "[18.7 17. 16.5 16.5 18.3 14.2 16.2 11.3 18.2 20.3]\n", 1817 | "[20.5 13. 11.5 12.5 15. 12.5 22.5 15. 
22.5 15.5]\n" 1818 | ] 1819 | } 1820 | ], 1821 | "source": [ 1822 | "y_test = knn_reg.predict(data_analysis_test)\n", 1823 | "print(y_test[:10])\n", 1824 | "print(test_labels[:10])" 1825 | ] 1826 | }, 1827 | { 1828 | "cell_type": "code", 1829 | "execution_count": 64, 1830 | "metadata": {}, 1831 | "outputs": [ 1832 | { 1833 | "data": { 1834 | "text/plain": [ 1835 | "4.933569509808097" 1836 | ] 1837 | }, 1838 | "execution_count": 64, 1839 | "metadata": {}, 1840 | "output_type": "execute_result" 1841 | } 1842 | ], 1843 | "source": [ 1844 | "knn_mse = mean_squared_error(test_labels, y_test)\n", 1845 | "knn_rmse = np.sqrt(knn_mse)\n", 1846 | "knn_rmse" 1847 | ] 1848 | }, 1849 | { 1850 | "cell_type": "markdown", 1851 | "metadata": {}, 1852 | "source": [ 1853 | "---" 1854 | ] 1855 | }, 1856 | { 1857 | "cell_type": "markdown", 1858 | "metadata": {}, 1859 | "source": [ 1860 | "# Adaboost回归" 1861 | ] 1862 | }, 1863 | { 1864 | "cell_type": "code", 1865 | "execution_count": 65, 1866 | "metadata": {}, 1867 | "outputs": [ 1868 | { 1869 | "data": { 1870 | "text/plain": [ 1871 | "AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',\n", 1872 | " n_estimators=50, random_state=None)" 1873 | ] 1874 | }, 1875 | "execution_count": 65, 1876 | "metadata": {}, 1877 | "output_type": "execute_result" 1878 | } 1879 | ], 1880 | "source": [ 1881 | "from sklearn.ensemble import AdaBoostRegressor\n", 1882 | "Adaboost_reg = AdaBoostRegressor(n_estimators=50)#这里使用50个决策树\n", 1883 | "Adaboost_reg.fit(data_analysis_prepared, data_analysis_labels)" 1884 | ] 1885 | }, 1886 | { 1887 | "cell_type": "markdown", 1888 | "metadata": {}, 1889 | "source": [ 1890 | "### 交叉验证" 1891 | ] 1892 | }, 1893 | { 1894 | "cell_type": "code", 1895 | "execution_count": 66, 1896 | "metadata": {}, 1897 | "outputs": [], 1898 | "source": [ 1899 | "from sklearn.model_selection import cross_val_score\n", 1900 | "\n", 1901 | "scores = cross_val_score(Adaboost_reg, data_analysis_prepared, 
data_analysis_labels,\n", 1902 | " scoring=\"neg_mean_squared_error\", cv=10)\n", 1903 | "Adaboost_rmse_scores = np.sqrt(-scores)" 1904 | ] 1905 | }, 1906 | { 1907 | "cell_type": "code", 1908 | "execution_count": 67, 1909 | "metadata": {}, 1910 | "outputs": [ 1911 | { 1912 | "name": "stdout", 1913 | "output_type": "stream", 1914 | "text": [ 1915 | "Scores: [4.53692111 4.64122254 4.21608294 4.86740635 5.06359292 4.66208335\n", 1916 | " 4.47708546 4.9059454 4.66957261 4.18020659]\n", 1917 | "Mean: 4.622011927255484\n", 1918 | "Standard deviation: 0.2698669328833233\n" 1919 | ] 1920 | } 1921 | ], 1922 | "source": [ 1923 | "display_scores(Adaboost_rmse_scores)" 1924 | ] 1925 | }, 1926 | { 1927 | "cell_type": "markdown", 1928 | "metadata": {}, 1929 | "source": [ 1930 | "### 测试集验证" 1931 | ] 1932 | }, 1933 | { 1934 | "cell_type": "code", 1935 | "execution_count": 68, 1936 | "metadata": {}, 1937 | "outputs": [ 1938 | { 1939 | "name": "stdout", 1940 | "output_type": "stream", 1941 | "text": [ 1942 | "[18.77375566 17.37943925 15.03588517 15.14973262 19.29107981 13.7369338\n", 1943 | " 17.37943925 13.7369338 20.0625 18.39631336]\n", 1944 | "[20.5 13. 11.5 12.5 15. 12.5 22.5 15. 
22.5 15.5]\n" 1945 | ] 1946 | } 1947 | ], 1948 | "source": [ 1949 | "y_test = Adaboost_reg.predict(data_analysis_test)\n", 1950 | "print(y_test[:10])\n", 1951 | "print(test_labels[:10])" 1952 | ] 1953 | }, 1954 | { 1955 | "cell_type": "code", 1956 | "execution_count": 69, 1957 | "metadata": {}, 1958 | "outputs": [ 1959 | { 1960 | "data": { 1961 | "text/plain": [ 1962 | "4.503983133229124" 1963 | ] 1964 | }, 1965 | "execution_count": 69, 1966 | "metadata": {}, 1967 | "output_type": "execute_result" 1968 | } 1969 | ], 1970 | "source": [ 1971 | "Adaboost_mse = mean_squared_error(test_labels, y_test)\n", 1972 | "Adaboost_rmse = np.sqrt(Adaboost_mse)\n", 1973 | "Adaboost_rmse" 1974 | ] 1975 | }, 1976 | { 1977 | "cell_type": "markdown", 1978 | "metadata": {}, 1979 | "source": [ 1980 | "---" 1981 | ] 1982 | }, 1983 | { 1984 | "cell_type": "markdown", 1985 | "metadata": {}, 1986 | "source": [ 1987 | "## GBRT回归" 1988 | ] 1989 | }, 1990 | { 1991 | "cell_type": "code", 1992 | "execution_count": 70, 1993 | "metadata": { 1994 | "scrolled": true 1995 | }, 1996 | "outputs": [ 1997 | { 1998 | "data": { 1999 | "text/plain": [ 2000 | "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n", 2001 | " learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n", 2002 | " max_leaf_nodes=None, min_impurity_decrease=0.0,\n", 2003 | " min_impurity_split=None, min_samples_leaf=1,\n", 2004 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 2005 | " n_estimators=100, n_iter_no_change=None, presort='auto',\n", 2006 | " random_state=None, subsample=1.0, tol=0.0001,\n", 2007 | " validation_fraction=0.1, verbose=0, warm_start=False)" 2008 | ] 2009 | }, 2010 | "execution_count": 70, 2011 | "metadata": {}, 2012 | "output_type": "execute_result" 2013 | } 2014 | ], 2015 | "source": [ 2016 | "from sklearn.ensemble import GradientBoostingRegressor\n", 2017 | "grbt_reg = GradientBoostingRegressor(n_estimators=100)#这里使用100个决策树\n", 2018 | 
"grbt_reg.fit(data_analysis_prepared, data_analysis_labels)" 2019 | ] 2020 | }, 2021 | { 2022 | "cell_type": "markdown", 2023 | "metadata": {}, 2024 | "source": [ 2025 | "### 交叉验证" 2026 | ] 2027 | }, 2028 | { 2029 | "cell_type": "code", 2030 | "execution_count": 71, 2031 | "metadata": {}, 2032 | "outputs": [ 2033 | { 2034 | "name": "stdout", 2035 | "output_type": "stream", 2036 | "text": [ 2037 | "Scores: [4.87398002 4.73250604 3.68256834 4.29583401 4.61267134 4.84613495\n", 2038 | " 4.14931613 4.34059763 4.23123641 4.40989281]\n", 2039 | "Mean: 4.417473768953092\n", 2040 | "Standard deviation: 0.34596795982823186\n" 2041 | ] 2042 | } 2043 | ], 2044 | "source": [ 2045 | "from sklearn.model_selection import cross_val_score\n", 2046 | "\n", 2047 | "scores = cross_val_score(grbt_reg, data_analysis_prepared, data_analysis_labels,\n", 2048 | " scoring=\"neg_mean_squared_error\", cv=10)\n", 2049 | "grbt_rmse_scores = np.sqrt(-scores)\n", 2050 | "\n", 2051 | "display_scores(grbt_rmse_scores)" 2052 | ] 2053 | }, 2054 | { 2055 | "cell_type": "markdown", 2056 | "metadata": {}, 2057 | "source": [ 2058 | "### 测试集验证" 2059 | ] 2060 | }, 2061 | { 2062 | "cell_type": "code", 2063 | "execution_count": 72, 2064 | "metadata": {}, 2065 | "outputs": [ 2066 | { 2067 | "name": "stdout", 2068 | "output_type": "stream", 2069 | "text": [ 2070 | "[18.43702936 17.57478885 14.06200524 14.74538168 21.77043023 14.45351525\n", 2071 | " 18.47068341 14.04148627 19.616117 20.43899639]\n", 2072 | "[20.5 13. 11.5 12.5 15. 12.5 22.5 15. 
22.5 15.5]\n" 2073 | ] 2074 | } 2075 | ], 2076 | "source": [ 2077 | "y_test = grbt_reg.predict(data_analysis_test)\n", 2078 | "print(y_test[:10])\n", 2079 | "print(test_labels[:10])" 2080 | ] 2081 | }, 2082 | { 2083 | "cell_type": "code", 2084 | "execution_count": 73, 2085 | "metadata": {}, 2086 | "outputs": [ 2087 | { 2088 | "data": { 2089 | "text/plain": [ 2090 | "4.290941809182011" 2091 | ] 2092 | }, 2093 | "execution_count": 73, 2094 | "metadata": {}, 2095 | "output_type": "execute_result" 2096 | } 2097 | ], 2098 | "source": [ 2099 | "grbt_mse = mean_squared_error(test_labels, y_test)\n", 2100 | "grbt_rmse = np.sqrt(grbt_mse)\n", 2101 | "grbt_rmse" 2102 | ] 2103 | }, 2104 | { 2105 | "cell_type": "markdown", 2106 | "metadata": {}, 2107 | "source": [ 2108 | "---" 2109 | ] 2110 | }, 2111 | { 2112 | "cell_type": "markdown", 2113 | "metadata": {}, 2114 | "source": [ 2115 | "## Bagging回归" 2116 | ] 2117 | }, 2118 | { 2119 | "cell_type": "code", 2120 | "execution_count": 74, 2121 | "metadata": { 2122 | "scrolled": true 2123 | }, 2124 | "outputs": [ 2125 | { 2126 | "data": { 2127 | "text/plain": [ 2128 | "BaggingRegressor(base_estimator=None, bootstrap=True,\n", 2129 | " bootstrap_features=False, max_features=1.0, max_samples=1.0,\n", 2130 | " n_estimators=10, n_jobs=None, oob_score=False, random_state=None,\n", 2131 | " verbose=0, warm_start=False)" 2132 | ] 2133 | }, 2134 | "execution_count": 74, 2135 | "metadata": {}, 2136 | "output_type": "execute_result" 2137 | } 2138 | ], 2139 | "source": [ 2140 | "from sklearn.ensemble import BaggingRegressor\n", 2141 | "bagging_reg = BaggingRegressor()\n", 2142 | "bagging_reg.fit(data_analysis_prepared, data_analysis_labels)" 2143 | ] 2144 | }, 2145 | { 2146 | "cell_type": "markdown", 2147 | "metadata": {}, 2148 | "source": [ 2149 | "### 交叉验证" 2150 | ] 2151 | }, 2152 | { 2153 | "cell_type": "code", 2154 | "execution_count": 75, 2155 | "metadata": {}, 2156 | "outputs": [ 2157 | { 2158 | "name": "stdout", 2159 | "output_type": 
"stream", 2160 | "text": [ 2161 | "Scores: [4.46370865 5.11419751 4.10735786 5.0408061 5.26292997 4.96593935\n", 2162 | " 4.71601577 4.93185359 4.59819981 4.63789738]\n", 2163 | "Mean: 4.783890598234823\n", 2164 | "Standard deviation: 0.32866956869503\n" 2165 | ] 2166 | } 2167 | ], 2168 | "source": [ 2169 | "from sklearn.model_selection import cross_val_score\n", 2170 | "\n", 2171 | "scores = cross_val_score(bagging_reg, data_analysis_prepared, data_analysis_labels,\n", 2172 | " scoring=\"neg_mean_squared_error\", cv=10)\n", 2173 | "bagging_rmse_scores = np.sqrt(-scores)\n", 2174 | "\n", 2175 | "display_scores(bagging_rmse_scores)" 2176 | ] 2177 | }, 2178 | { 2179 | "cell_type": "markdown", 2180 | "metadata": {}, 2181 | "source": [ 2182 | "### 测试集验证" 2183 | ] 2184 | }, 2185 | { 2186 | "cell_type": "code", 2187 | "execution_count": 76, 2188 | "metadata": {}, 2189 | "outputs": [ 2190 | { 2191 | "name": "stdout", 2192 | "output_type": "stream", 2193 | "text": [ 2194 | "[17.53333333 16.95503081 15.19285714 11.65 19.43928571 14.36\n", 2195 | " 18.40166667 11.55 17.67583333 19.52738095]\n", 2196 | "[20.5 13. 11.5 12.5 15. 12.5 22.5 15. 
22.5 15.5]\n" 2197 | ] 2198 | } 2199 | ], 2200 | "source": [ 2201 | "y_test = bagging_reg.predict(data_analysis_test)\n", 2202 | "print(y_test[:10])\n", 2203 | "print(test_labels[:10])" 2204 | ] 2205 | }, 2206 | { 2207 | "cell_type": "code", 2208 | "execution_count": 77, 2209 | "metadata": {}, 2210 | "outputs": [ 2211 | { 2212 | "data": { 2213 | "text/plain": [ 2214 | "4.851544621491279" 2215 | ] 2216 | }, 2217 | "execution_count": 77, 2218 | "metadata": {}, 2219 | "output_type": "execute_result" 2220 | } 2221 | ], 2222 | "source": [ 2223 | "bagging_mse = mean_squared_error(test_labels, y_test)\n", 2224 | "bagging_rmse = np.sqrt(bagging_mse)\n", 2225 | "bagging_rmse" 2226 | ] 2227 | }, 2228 | { 2229 | "cell_type": "markdown", 2230 | "metadata": {}, 2231 | "source": [ 2232 | "---" 2233 | ] 2234 | }, 2235 | { 2236 | "cell_type": "markdown", 2237 | "metadata": {}, 2238 | "source": [ 2239 | "# 模型拟合效果评价" 2240 | ] 2241 | }, 2242 | { 2243 | "cell_type": "code", 2244 | "execution_count": 78, 2245 | "metadata": {}, 2246 | "outputs": [ 2247 | { 2248 | "name": "stdout", 2249 | "output_type": "stream", 2250 | "text": [ 2251 | "linear 在测试集上的误差表现为: 4.252707451377156\n", 2252 | "tree 在测试集上的误差表现为: 5.585045537872495\n", 2253 | "forest 在测试集上的误差表现为: 4.53113932085526\n", 2254 | "knn 在测试集上的误差表现为: 4.933569509808097\n", 2255 | "Adaboost 在测试集上的误差表现为: 4.503983133229124\n", 2256 | "grbt 在测试集上的误差表现为: 4.290941809182011\n", 2257 | "bagging 在测试集上的误差表现为: 4.851544621491279\n" 2258 | ] 2259 | } 2260 | ], 2261 | "source": [ 2262 | "model_list = [lin_rmse,tree_rmse,forest_rmse,knn_rmse,Adaboost_rmse,grbt_rmse,bagging_rmse]\n", 2263 | "model_name = ['linear','tree','forest','knn','Adaboost','grbt','bagging']\n", 2264 | "i = 0\n", 2265 | "for model in model_list:\n", 2266 | " print(model_name[i],'在测试集上的误差表现为:',model)\n", 2267 | " i+=1" 2268 | ] 2269 | }, 2270 | { 2271 | "cell_type": "markdown", 2272 | "metadata": {}, 2273 | "source": [ 2274 | "---" 2275 | ] 2276 | }, 2277 | { 2278 | "cell_type": 
"markdown", 2279 | "metadata": {}, 2280 | "source": [ 2281 | "# 网格搜索调参" 2282 | ] 2283 | }, 2284 | { 2285 | "cell_type": "markdown", 2286 | "metadata": {}, 2287 | "source": [ 2288 | "### 对随机森林进行参数探索调整" 2289 | ] 2290 | }, 2291 | { 2292 | "cell_type": "code", 2293 | "execution_count": 79, 2294 | "metadata": {}, 2295 | "outputs": [ 2296 | { 2297 | "name": "stderr", 2298 | "output_type": "stream", 2299 | "text": [ 2300 | "C:\\Users\\13626\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n", 2301 | " DeprecationWarning)\n" 2302 | ] 2303 | }, 2304 | { 2305 | "data": { 2306 | "text/plain": [ 2307 | "GridSearchCV(cv=5, error_score='raise-deprecating',\n", 2308 | " estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", 2309 | " max_features='auto', max_leaf_nodes=None,\n", 2310 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 2311 | " min_samples_leaf=1, min_samples_split=2,\n", 2312 | " min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n", 2313 | " oob_score=False, random_state=42, verbose=0, warm_start=False),\n", 2314 | " fit_params=None, iid='warn', n_jobs=None,\n", 2315 | " param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],\n", 2316 | " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", 2317 | " scoring='neg_mean_squared_error', verbose=0)" 2318 | ] 2319 | }, 2320 | "execution_count": 79, 2321 | "metadata": {}, 2322 | "output_type": "execute_result" 2323 | } 2324 | ], 2325 | "source": [ 2326 | "from sklearn.model_selection import GridSearchCV\n", 2327 | "\n", 2328 | "param_grid = [\n", 2329 | " # try 12 (3×4) combinations of hyperparameters\n", 2330 | " {'n_estimators': [3, 10, 30], 
'max_features': [2, 4, 6, 8]},\n", 2331 | " # then try 6 (2×3) combinations with bootstrap set as False\n", 2332 | " {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n", 2333 | " ]\n", 2334 | "\n", 2335 | "forest_reg = RandomForestRegressor(random_state=42)\n", 2336 | "\n", 2337 | "#grbt_reg = GradientBoostingRegressor()\n", 2338 | "\n", 2339 | "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n", 2340 | " scoring='neg_mean_squared_error', return_train_score=True)\n", 2341 | "grid_search.fit(data_analysis_prepared, data_analysis_labels)" 2342 | ] 2343 | }, 2344 | { 2345 | "cell_type": "code", 2346 | "execution_count": 80, 2347 | "metadata": {}, 2348 | "outputs": [ 2349 | { 2350 | "name": "stdout", 2351 | "output_type": "stream", 2352 | "text": [ 2353 | "5.230290790238236 {'max_features': 2, 'n_estimators': 3}\n", 2354 | "4.828374365895161 {'max_features': 2, 'n_estimators': 10}\n", 2355 | "4.755202551631041 {'max_features': 2, 'n_estimators': 30}\n", 2356 | "5.064133429590988 {'max_features': 4, 'n_estimators': 3}\n", 2357 | "4.770620452203674 {'max_features': 4, 'n_estimators': 10}\n", 2358 | "4.698809097897846 {'max_features': 4, 'n_estimators': 30}\n", 2359 | "5.09938179972705 {'max_features': 6, 'n_estimators': 3}\n", 2360 | "4.7206651591273845 {'max_features': 6, 'n_estimators': 10}\n", 2361 | "4.687999289996666 {'max_features': 6, 'n_estimators': 30}\n", 2362 | "5.072523426909564 {'max_features': 8, 'n_estimators': 3}\n", 2363 | "4.75724488427645 {'max_features': 8, 'n_estimators': 10}\n", 2364 | "4.708816709801628 {'max_features': 8, 'n_estimators': 30}\n", 2365 | "5.388487805697899 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}\n", 2366 | "5.0528879660893775 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}\n", 2367 | "5.316193794492318 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}\n", 2368 | "5.022162677508516 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}\n", 2369 | 
"5.281169036748664 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}\n", 2370 | "5.056574946174836 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}\n" 2371 | ] 2372 | } 2373 | ], 2374 | "source": [ 2375 | "cvres = grid_search.cv_results_\n", 2376 | "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n", 2377 | " print(np.sqrt(-mean_score), params)" 2378 | ] 2379 | }, 2380 | { 2381 | "cell_type": "code", 2382 | "execution_count": 81, 2383 | "metadata": {}, 2384 | "outputs": [ 2385 | { 2386 | "data": { 2387 | "text/plain": [ 2388 | "{'max_features': 6, 'n_estimators': 30}" 2389 | ] 2390 | }, 2391 | "execution_count": 81, 2392 | "metadata": {}, 2393 | "output_type": "execute_result" 2394 | } 2395 | ], 2396 | "source": [ 2397 | "grid_search.best_params_" 2398 | ] 2399 | }, 2400 | { 2401 | "cell_type": "markdown", 2402 | "metadata": {}, 2403 | "source": [ 2404 | "## 变量重要性" 2405 | ] 2406 | }, 2407 | { 2408 | "cell_type": "code", 2409 | "execution_count": 82, 2410 | "metadata": {}, 2411 | "outputs": [], 2412 | "source": [ 2413 | "feature_importances = grid_search.best_estimator_.feature_importances_\n", 2414 | "#feature_importances" 2415 | ] 2416 | }, 2417 | { 2418 | "cell_type": "code", 2419 | "execution_count": 83, 2420 | "metadata": {}, 2421 | "outputs": [], 2422 | "source": [ 2423 | "num_attribs = list(data_analysis_num)\n", 2424 | "cat_attribs = ['公司规模','学历要求','工作经验']" 2425 | ] 2426 | }, 2427 | { 2428 | "cell_type": "code", 2429 | "execution_count": 84, 2430 | "metadata": {}, 2431 | "outputs": [ 2432 | { 2433 | "data": { 2434 | "text/plain": [ 2435 | "[(0.06256050964151738, 'Sql'),\n", 2436 | " (0.05996157379002209, 'Hive'),\n", 2437 | " (0.05833431092085402, 'Python'),\n", 2438 | " (0.05536655983841162, 'Sas'),\n", 2439 | " (0.039090728357453634, 'Excel'),\n", 2440 | " (0.0372204358630469, 'Spss'),\n", 2441 | " (0.03173383704493009, 'Spark'),\n", 2442 | " (0.029334029896999316, '学历要求'),\n", 2443 | " 
(0.027283707992806678, '工作经验'),\n", 2444 | " (0.026729166721593346, 'Tableau'),\n", 2445 | " (0.025363715544763292, 'Hadoop'),\n", 2446 | " (0.018739070532311868, 'Ppt'),\n", 2447 | " (0.016549484575144015, '公司规模')]" 2448 | ] 2449 | }, 2450 | "execution_count": 84, 2451 | "metadata": {}, 2452 | "output_type": "execute_result" 2453 | } 2454 | ], 2455 | "source": [ 2456 | "# 变量重要性排序\n", 2457 | "attributes = num_attribs + cat_attribs\n", 2458 | "sorted(zip(feature_importances, attributes), reverse=True)" 2459 | ] 2460 | }, 2461 | { 2462 | "cell_type": "markdown", 2463 | "metadata": {}, 2464 | "source": [ 2465 | "- **公司规模**对薪资的影响相比之下比较小。" 2466 | ] 2467 | }, 2468 | { 2469 | "cell_type": "markdown", 2470 | "metadata": {}, 2471 | "source": [ 2472 | "---" 2473 | ] 2474 | }, 2475 | { 2476 | "cell_type": "markdown", 2477 | "metadata": {}, 2478 | "source": [ 2479 | "# 最终模型" 2480 | ] 2481 | }, 2482 | { 2483 | "cell_type": "code", 2484 | "execution_count": 85, 2485 | "metadata": {}, 2486 | "outputs": [ 2487 | { 2488 | "data": { 2489 | "text/plain": [ 2490 | "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", 2491 | " max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,\n", 2492 | " min_impurity_split=None, min_samples_leaf=1,\n", 2493 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 2494 | " n_estimators=30, n_jobs=None, oob_score=False, random_state=42,\n", 2495 | " verbose=0, warm_start=False)" 2496 | ] 2497 | }, 2498 | "execution_count": 85, 2499 | "metadata": {}, 2500 | "output_type": "execute_result" 2501 | } 2502 | ], 2503 | "source": [ 2504 | "final_model = grid_search.best_estimator_\n", 2505 | "final_model" 2506 | ] 2507 | }, 2508 | { 2509 | "cell_type": "code", 2510 | "execution_count": 86, 2511 | "metadata": {}, 2512 | "outputs": [], 2513 | "source": [ 2514 | "scores = cross_val_score(final_model, data_analysis_prepared, data_analysis_labels,\n", 2515 | " scoring=\"neg_mean_squared_error\", cv=10)\n", 2516 | 
"final_model_rmse_scores = np.sqrt(-scores)" 2517 | ] 2518 | }, 2519 | { 2520 | "cell_type": "code", 2521 | "execution_count": 87, 2522 | "metadata": {}, 2523 | "outputs": [ 2524 | { 2525 | "name": "stdout", 2526 | "output_type": "stream", 2527 | "text": [ 2528 | "Scores: [4.53008814 5.08441071 3.97171206 4.80954518 5.01055618 4.71745052\n", 2529 | " 4.66927057 4.74793007 4.18567529 4.34334473]\n", 2530 | "Mean: 4.606998345081946\n", 2531 | "Standard deviation: 0.3353022753836264\n" 2532 | ] 2533 | } 2534 | ], 2535 | "source": [ 2536 | "display_scores(final_model_rmse_scores)" 2537 | ] 2538 | }, 2539 | { 2540 | "cell_type": "markdown", 2541 | "metadata": {}, 2542 | "source": [ 2543 | "- 交叉验证误差为**4.56**" 2544 | ] 2545 | }, 2546 | { 2547 | "cell_type": "markdown", 2548 | "metadata": {}, 2549 | "source": [ 2550 | "---" 2551 | ] 2552 | }, 2553 | { 2554 | "cell_type": "markdown", 2555 | "metadata": {}, 2556 | "source": [ 2557 | "# 薪资预测" 2558 | ] 2559 | }, 2560 | { 2561 | "cell_type": "code", 2562 | "execution_count": 88, 2563 | "metadata": {}, 2564 | "outputs": [], 2565 | "source": [ 2566 | "final_predictions = final_model.predict(data_analysis_test)" 2567 | ] 2568 | }, 2569 | { 2570 | "cell_type": "code", 2571 | "execution_count": 89, 2572 | "metadata": {}, 2573 | "outputs": [], 2574 | "source": [ 2575 | "salary_test_series = Series(final_predictions,index=data_test.index)" 2576 | ] 2577 | }, 2578 | { 2579 | "cell_type": "code", 2580 | "execution_count": 90, 2581 | "metadata": {}, 2582 | "outputs": [], 2583 | "source": [ 2584 | "data_test_prediction = data_test.copy()\n", 2585 | "data_test_prediction.insert(7,'prediction',salary_test_series)" 2586 | ] 2587 | }, 2588 | { 2589 | "cell_type": "code", 2590 | "execution_count": 91, 2591 | "metadata": { 2592 | "scrolled": false 2593 | }, 2594 | "outputs": [ 2595 | { 2596 | "data": { 2597 | "text/html": [ 2598 | "
\n", 2599 | "\n", 2612 | "\n", 2613 | " \n", 2614 | " \n", 2615 | " \n", 2616 | " \n", 2617 | " \n", 2618 | " \n", 2619 | " \n", 2620 | " \n", 2621 | " \n", 2622 | " \n", 2623 | " \n", 2624 | " \n", 2625 | " \n", 2626 | " \n", 2627 | " \n", 2628 | " \n", 2629 | " \n", 2630 | " \n", 2631 | " \n", 2632 | " \n", 2633 | " \n", 2634 | " \n", 2635 | " \n", 2636 | " \n", 2637 | " \n", 2638 | " \n", 2639 | " \n", 2640 | " \n", 2641 | " \n", 2642 | " \n", 2643 | " \n", 2644 | " \n", 2645 | " \n", 2646 | " \n", 2647 | " \n", 2648 | " \n", 2649 | " \n", 2650 | " \n", 2651 | " \n", 2652 | " \n", 2653 | " \n", 2654 | " \n", 2655 | " \n", 2656 | " \n", 2657 | " \n", 2658 | " \n", 2659 | " \n", 2660 | " \n", 2661 | " \n", 2662 | " \n", 2663 | " \n", 2664 | " \n", 2665 | " \n", 2666 | " \n", 2667 | " \n", 2668 | " \n", 2669 | " \n", 2670 | " \n", 2671 | " \n", 2672 | " \n", 2673 | " \n", 2674 | " \n", 2675 | " \n", 2676 | " \n", 2677 | " \n", 2678 | " \n", 2679 | " \n", 2680 | " \n", 2681 | " \n", 2682 | " \n", 2683 | " \n", 2684 | " \n", 2685 | " \n", 2686 | " \n", 2687 | " \n", 2688 | " \n", 2689 | " \n", 2690 | " \n", 2691 | " \n", 2692 | " \n", 2693 | " \n", 2694 | " \n", 2695 | " \n", 2696 | " \n", 2697 | " \n", 2698 | " \n", 2699 | " \n", 2700 | " \n", 2701 | " \n", 2702 | " \n", 2703 | " \n", 2704 | " \n", 2705 | " \n", 2706 | " \n", 2707 | " \n", 2708 | " \n", 2709 | " \n", 2710 | " \n", 2711 | " \n", 2712 | " \n", 2713 | " \n", 2714 | " \n", 2715 | " \n", 2716 | " \n", 2717 | " \n", 2718 | " \n", 2719 | " \n", 2720 | " \n", 2721 | " \n", 2722 | " \n", 2723 | " \n", 2724 | " \n", 2725 | " \n", 2726 | " \n", 2727 | " \n", 2728 | " \n", 2729 | " \n", 2730 | " \n", 2731 | " \n", 2732 | " \n", 2733 | " \n", 2734 | " \n", 2735 | " \n", 2736 | " \n", 2737 | " \n", 2738 | " \n", 2739 | " \n", 2740 | " \n", 2741 | " \n", 2742 | " \n", 2743 | " \n", 2744 | " \n", 2745 | " \n", 2746 | " \n", 2747 | " \n", 2748 | " \n", 2749 | " \n", 2750 | " \n", 2751 | " \n", 2752 | " \n", 2753 | " 
\n", 2754 | " \n", 2755 | " \n", 2756 | " \n", 2757 | " \n", 2758 | " \n", 2759 | " \n", 2760 | " \n", 2761 | " \n", 2762 | " \n", 2763 | " \n", 2764 | " \n", 2765 | " \n", 2766 | " \n", 2767 | " \n", 2768 | " \n", 2769 | " \n", 2770 | " \n", 2771 | " \n", 2772 | " \n", 2773 | " \n", 2774 | " \n", 2775 | " \n", 2776 | " \n", 2777 | " \n", 2778 | " \n", 2779 | " \n", 2780 | " \n", 2781 | " \n", 2782 | " \n", 2783 | " \n", 2784 | " \n", 2785 | " \n", 2786 | " \n", 2787 | " \n", 2788 | " \n", 2789 | " \n", 2790 | " \n", 2791 | " \n", 2792 | " \n", 2793 | " \n", 2794 | " \n", 2795 | " \n", 2796 | " \n", 2797 | " \n", 2798 | " \n", 2799 | " \n", 2800 | " \n", 2801 | " \n", 2802 | " \n", 2803 | " \n", 2804 | " \n", 2805 | " \n", 2806 | " \n", 2807 | " \n", 2808 | " \n", 2809 | " \n", 2810 | " \n", 2811 | " \n", 2812 | " \n", 2813 | " \n", 2814 | " \n", 2815 | " \n", 2816 | " \n", 2817 | " \n", 2818 | " \n", 2819 | " \n", 2820 | " \n", 2821 | " \n", 2822 | " \n", 2823 | " \n", 2824 | " \n", 2825 | " \n", 2826 | " \n", 2827 | " \n", 2828 | " \n", 2829 | " \n", 2830 | " \n", 2831 | " \n", 2832 | " \n", 2833 | " \n", 2834 | " \n", 2835 | " \n", 2836 | " \n", 2837 | " \n", 2838 | " \n", 2839 | " \n", 2840 | " \n", 2841 | " \n", 2842 | " \n", 2843 | " \n", 2844 | " \n", 2845 | " \n", 2846 | " \n", 2847 | " \n", 2848 | " \n", 2849 | " \n", 2850 | " \n", 2851 | " \n", 2852 | " \n", 2853 | " \n", 2854 | " \n", 2855 | " \n", 2856 | " \n", 2857 | " \n", 2858 | " \n", 2859 | "
公司名称公司规模地区学历要求工作经验职位名称salaryprediction融资情况SqlPythonExcelSasSpssHiveHadoopPptTableauSpark
602旗计智能1000-9999人上海 浦东新区 张江本科3-5年数据分析主管/经理(风险方向)25.017.123016已上市1111000000
260速网电商100-499人上海 闵行区 莘庄本科1-3年数据分析10.011.195915不需要融资0000000000
512Oriente500-999人上海 黄浦区 人民广场本科3-5年数据分析师27.519.633333B轮1101001000
1027儒傲会软件定制20-99人上海 嘉定区 江桥本科5-10年系统数据分析师招募要求(儒傲会)17.519.751667天使轮1000000000
787华数康100-499人上海 长宁区 古北学历不限经验不限数据分析师22.511.050000B轮0001000000
85上海深界信息科技0-20人上海 徐汇区 漕河泾大专1-3年数据分析师12.513.820000不需要融资0101010000
368汉云信息100-499人上海 长宁区 天山路本科1-3年互联网数据分析师9.010.550000不需要融资1010000000
907饿了么1000-9999人上海 普陀区 金沙江路本科3-5年运力规划/数据分析22.517.009510D轮及以上0000000000
1050上海翔鸢信息科技20-99人上海 青浦区 赵巷大专经验不限王者荣耀数据分析师7.07.72500020-99人0010000000
917观安信息100-499人上海 普陀区 金沙江路本科5-10年大数据安全产品经理25.025.044444B轮0000000000
\n", 2860 | "
" 2861 | ], 2862 | "text/plain": [ 2863 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 \\\n", 2864 | "602 旗计智能 1000-9999人 上海 浦东新区 张江 本科 3-5年 数据分析主管/经理(风险方向) \n", 2865 | "260 速网电商 100-499人 上海 闵行区 莘庄 本科 1-3年 数据分析 \n", 2866 | "512 Oriente 500-999人 上海 黄浦区 人民广场 本科 3-5年 数据分析师 \n", 2867 | "1027 儒傲会软件定制 20-99人 上海 嘉定区 江桥 本科 5-10年 系统数据分析师招募要求(儒傲会) \n", 2868 | "787 华数康 100-499人 上海 长宁区 古北 学历不限 经验不限 数据分析师 \n", 2869 | "85 上海深界信息科技 0-20人 上海 徐汇区 漕河泾 大专 1-3年 数据分析师 \n", 2870 | "368 汉云信息 100-499人 上海 长宁区 天山路 本科 1-3年 互联网数据分析师 \n", 2871 | "907 饿了么 1000-9999人 上海 普陀区 金沙江路 本科 3-5年 运力规划/数据分析 \n", 2872 | "1050 上海翔鸢信息科技 20-99人 上海 青浦区 赵巷 大专 经验不限 王者荣耀数据分析师 \n", 2873 | "917 观安信息 100-499人 上海 普陀区 金沙江路 本科 5-10年 大数据安全产品经理 \n", 2874 | "\n", 2875 | " salary prediction 融资情况 Sql Python Excel Sas Spss Hive Hadoop \\\n", 2876 | "602 25.0 17.123016 已上市 1 1 1 1 0 0 0 \n", 2877 | "260 10.0 11.195915 不需要融资 0 0 0 0 0 0 0 \n", 2878 | "512 27.5 19.633333 B轮 1 1 0 1 0 0 1 \n", 2879 | "1027 17.5 19.751667 天使轮 1 0 0 0 0 0 0 \n", 2880 | "787 22.5 11.050000 B轮 0 0 0 1 0 0 0 \n", 2881 | "85 12.5 13.820000 不需要融资 0 1 0 1 0 1 0 \n", 2882 | "368 9.0 10.550000 不需要融资 1 0 1 0 0 0 0 \n", 2883 | "907 22.5 17.009510 D轮及以上 0 0 0 0 0 0 0 \n", 2884 | "1050 7.0 7.725000 20-99人 0 0 1 0 0 0 0 \n", 2885 | "917 25.0 25.044444 B轮 0 0 0 0 0 0 0 \n", 2886 | "\n", 2887 | " Ppt Tableau Spark \n", 2888 | "602 0 0 0 \n", 2889 | "260 0 0 0 \n", 2890 | "512 0 0 0 \n", 2891 | "1027 0 0 0 \n", 2892 | "787 0 0 0 \n", 2893 | "85 0 0 0 \n", 2894 | "368 0 0 0 \n", 2895 | "907 0 0 0 \n", 2896 | "1050 0 0 0 \n", 2897 | "917 0 0 0 " 2898 | ] 2899 | }, 2900 | "execution_count": 91, 2901 | "metadata": {}, 2902 | "output_type": "execute_result" 2903 | } 2904 | ], 2905 | "source": [ 2906 | "data_test_prediction.sample(10)" 2907 | ] 2908 | }, 2909 | { 2910 | "cell_type": "markdown", 2911 | "metadata": {}, 2912 | "source": [ 2913 | "- 预测结果与实际薪资相比误差在**可接受范围内**。" 2914 | ] 2915 | }, 2916 | { 2917 | "cell_type": "markdown", 2918 | "metadata": {}, 2919 | "source": [ 2920 | "# 
预测函数接口" 2921 | ] 2922 | }, 2923 | { 2924 | "cell_type": "code", 2925 | "execution_count": 92, 2926 | "metadata": {}, 2927 | "outputs": [ 2928 | { 2929 | "data": { 2930 | "text/html": [ 2931 | "
\n", 2932 | "\n", 2945 | "\n", 2946 | " \n", 2947 | " \n", 2948 | " \n", 2949 | " \n", 2950 | " \n", 2951 | " \n", 2952 | " \n", 2953 | " \n", 2954 | " \n", 2955 | " \n", 2956 | " \n", 2957 | " \n", 2958 | " \n", 2959 | " \n", 2960 | " \n", 2961 | " \n", 2962 | " \n", 2963 | " \n", 2964 | " \n", 2965 | " \n", 2966 | " \n", 2967 | " \n", 2968 | " \n", 2969 | " \n", 2970 | " \n", 2971 | " \n", 2972 | " \n", 2973 | " \n", 2974 | " \n", 2975 | " \n", 2976 | " \n", 2977 | " \n", 2978 | " \n", 2979 | " \n", 2980 | " \n", 2981 | " \n", 2982 | " \n", 2983 | " \n", 2984 | " \n", 2985 | " \n", 2986 | " \n", 2987 | " \n", 2988 | " \n", 2989 | " \n", 2990 | " \n", 2991 | " \n", 2992 | "
公司名称公司规模地区学历要求工作经验职位名称salary融资情况SqlPythonExcelSasSpssHiveHadoopPptTableauSpark
361携程旅行网10000人以上上海 长宁区 北新泾本科3-5年海外运营数据分析师20.5已上市1100000000
\n", 2993 | "
" 2994 | ], 2995 | "text/plain": [ 2996 | " 公司名称 公司规模 地区 学历要求 工作经验 职位名称 salary 融资情况 Sql \\\n", 2997 | "361 携程旅行网 10000人以上 上海 长宁区 北新泾 本科 3-5年 海外运营数据分析师 20.5 已上市 1 \n", 2998 | "\n", 2999 | " Python Excel Sas Spss Hive Hadoop Ppt Tableau Spark \n", 3000 | "361 1 0 0 0 0 0 0 0 0 " 3001 | ] 3002 | }, 3003 | "execution_count": 92, 3004 | "metadata": {}, 3005 | "output_type": "execute_result" 3006 | } 3007 | ], 3008 | "source": [ 3009 | "data_test.head(1)" 3010 | ] 3011 | }, 3012 | { 3013 | "cell_type": "code", 3014 | "execution_count": 93, 3015 | "metadata": {}, 3016 | "outputs": [], 3017 | "source": [ 3018 | "from pandas import DataFrame" 3019 | ] 3020 | }, 3021 | { 3022 | "cell_type": "markdown", 3023 | "metadata": {}, 3024 | "source": [ 3025 | "### 预测功能函数" 3026 | ] 3027 | }, 3028 | { 3029 | "cell_type": "code", 3030 | "execution_count": 94, 3031 | "metadata": {}, 3032 | "outputs": [], 3033 | "source": [ 3034 | "def prediction_function(scale,degree,experience,v_skills):\n", 3035 | " predict_X = {\n", 3036 | " '公司规模':[scale],\n", 3037 | " '学历要求':[degree],\n", 3038 | " '工作经验':[experience],\n", 3039 | " 'Sql':[v_skills[0]],\n", 3040 | " 'Python':[v_skills[1]],\n", 3041 | " 'Excel':[v_skills[2]],\n", 3042 | " 'Sas':[v_skills[3]],\n", 3043 | " 'Spss':[v_skills[4]],\n", 3044 | " 'Hive':[v_skills[5]],\n", 3045 | " 'Hadoop':[v_skills[6]],\n", 3046 | " 'Ppt':[v_skills[7]],\n", 3047 | " 'Tableau':[v_skills[8]],\n", 3048 | " 'Spark':[v_skills[9]],\n", 3049 | " }\n", 3050 | " predict_tmp = pd.DataFrame(predict_X)\n", 3051 | " X_predict = full_pipeline.transform(predict_tmp)\n", 3052 | " return X_predict" 3053 | ] 3054 | }, 3055 | { 3056 | "cell_type": "markdown", 3057 | "metadata": {}, 3058 | "source": [ 3059 | "### 技能转换函数" 3060 | ] 3061 | }, 3062 | { 3063 | "cell_type": "code", 3064 | "execution_count": 95, 3065 | "metadata": {}, 3066 | "outputs": [], 3067 | "source": [ 3068 | "def skills_switch(skill_list):\n", 3069 | " tmp_list = []\n", 3070 | " skills = 
['Sql','Python','Excel','Sas','Spss','Hive','Hadoop','Ppt','Tableau','Spark']\n", 3071 | " for skill in skills:\n", 3072 | " # 大小写转换\n", 3073 | " if skill in skill_list:\n", 3074 | " tmp_list.append(1)\n", 3075 | " else:\n", 3076 | " tmp_list.append(0)\n", 3077 | " return tmp_list" 3078 | ] 3079 | }, 3080 | { 3081 | "cell_type": "markdown", 3082 | "metadata": {}, 3083 | "source": [ 3084 | "### 预测主函数" 3085 | ] 3086 | }, 3087 | { 3088 | "cell_type": "code", 3089 | "execution_count": 96, 3090 | "metadata": {}, 3091 | "outputs": [], 3092 | "source": [ 3093 | "def predict(scale,degree,experience,v_skills):\n", 3094 | " X_predict = prediction_function(scale,degree,experience,v_skills)\n", 3095 | " Y_predict = final_model.predict(X_predict)\n", 3096 | " print('预测薪资为:',Y_predict[0],'k/month')" 3097 | ] 3098 | }, 3099 | { 3100 | "cell_type": "markdown", 3101 | "metadata": {}, 3102 | "source": [ 3103 | "---" 3104 | ] 3105 | }, 3106 | { 3107 | "cell_type": "markdown", 3108 | "metadata": {}, 3109 | "source": [ 3110 | "
\n",
3111 |     "企业规模:['10000人以上' '1000-9999人' '20-99人' '0-20人' '500-999人' '100-499人']\n",
3112 |     "学历:['本科' '大专' '学历不限' '硕士']\n",
3113 |     "工作经验:['3-5年' '1-3年' '经验不限' '5-10年' '1年以内' '应届生']\n",
3114 |     "Skills:[Sql,Python,Excel,Sas,Spss,Hive,Hadoop,Ppt,Tableau,Spark]\n",
3115 |     "
" 3116 | ] 3117 | }, 3118 | { 3119 | "cell_type": "markdown", 3120 | "metadata": {}, 3121 | "source": [ 3122 | "### 预测函数" 3123 | ] 3124 | }, 3125 | { 3126 | "cell_type": "code", 3127 | "execution_count": 97, 3128 | "metadata": {}, 3129 | "outputs": [ 3130 | { 3131 | "name": "stdout", 3132 | "output_type": "stream", 3133 | "text": [ 3134 | "预测薪资为: 12.14722222222222 k/month\n" 3135 | ] 3136 | } 3137 | ], 3138 | "source": [ 3139 | "#-----------设置变量\n", 3140 | "scale = '10000人以上'\n", 3141 | "degree = '本科'\n", 3142 | "experience = '1-3年'\n", 3143 | "# ------------------\n", 3144 | "# --------设置所掌握的技能(顺序无关)\n", 3145 | "mastered_skills = ['Sql','Python','Excel','Spss','Ppt']\n", 3146 | "v_skills = skills_switch(mastered_skills)\n", 3147 | "# -----------------------------------\n", 3148 | "predict(scale,degree,experience,v_skills)" 3149 | ] 3150 | }, 3151 | { 3152 | "cell_type": "code", 3153 | "execution_count": 98, 3154 | "metadata": {}, 3155 | "outputs": [ 3156 | { 3157 | "name": "stdout", 3158 | "output_type": "stream", 3159 | "text": [ 3160 | "10000人以上 | 本科 | 应届生 | Sql,Python,Excel,Spss,Ppt\n", 3161 | "预测薪资为: 13.066666666666666 k/month\n", 3162 | "------------------------------------------------------------\n", 3163 | "10000人以上 | 本科 | 1年以内 | Sql,Python,Excel,Spss,Ppt\n", 3164 | "预测薪资为: 13.308333333333334 k/month\n", 3165 | "------------------------------------------------------------\n", 3166 | "10000人以上 | 本科 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n", 3167 | "预测薪资为: 12.14722222222222 k/month\n", 3168 | "------------------------------------------------------------\n", 3169 | "10000人以上 | 本科 | 3-5年 | Sql,Python,Excel,Spss,Ppt\n", 3170 | "预测薪资为: 20.964444444444446 k/month\n", 3171 | "------------------------------------------------------------\n", 3172 | "10000人以上 | 本科 | 5-10年 | Sql,Python,Excel,Spss,Ppt\n", 3173 | "预测薪资为: 23.97222222222222 k/month\n", 3174 | "------------------------------------------------------------\n" 3175 | ] 3176 | } 3177 | ], 3178 | "source": [ 3179 | 
"experiences = ['应届生','1年以内', '1-3年','3-5年', '5-10年' ]\n", 3180 | "\n", 3181 | "for exp in experiences:\n", 3182 | " print(scale,'|',degree,'|',exp,'|',\",\".join(mastered_skills))\n", 3183 | " predict(scale,degree,exp,v_skills)\n", 3184 | " print('-'*60)" 3185 | ] 3186 | }, 3187 | { 3188 | "cell_type": "markdown", 3189 | "metadata": {}, 3190 | "source": [ 3191 | "---" 3192 | ] 3193 | }, 3194 | { 3195 | "cell_type": "code", 3196 | "execution_count": 101, 3197 | "metadata": {}, 3198 | "outputs": [ 3199 | { 3200 | "name": "stdout", 3201 | "output_type": "stream", 3202 | "text": [ 3203 | "500-999人 | 学历不限 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n", 3204 | "预测薪资为: 21.625 k/month\n", 3205 | "------------------------------------------------------------\n", 3206 | "500-999人 | 大专 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n", 3207 | "预测薪资为: 19.925 k/month\n", 3208 | "------------------------------------------------------------\n", 3209 | "500-999人 | 本科 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n", 3210 | "预测薪资为: 23.091666666666665 k/month\n", 3211 | "------------------------------------------------------------\n", 3212 | "500-999人 | 硕士 | 1-3年 | Sql,Python,Excel,Spss,Ppt\n", 3213 | "预测薪资为: 20.791666666666668 k/month\n", 3214 | "------------------------------------------------------------\n" 3215 | ] 3216 | } 3217 | ], 3218 | "source": [ 3219 | "experience = '1-3年'\n", 3220 | "scale = '500-999人'\n", 3221 | "degrees = ['学历不限','大专','本科','硕士']\n", 3222 | "\n", 3223 | "for degree in degrees:\n", 3224 | " print(scale,'|',degree,'|',experience,'|',\",\".join(mastered_skills))\n", 3225 | " predict(scale,degree,exp,v_skills)\n", 3226 | " print('-'*60)" 3227 | ] 3228 | }, 3229 | { 3230 | "cell_type": "code", 3231 | "execution_count": null, 3232 | "metadata": {}, 3233 | "outputs": [], 3234 | "source": [] 3235 | } 3236 | ], 3237 | "metadata": { 3238 | "kernelspec": { 3239 | "display_name": "Python 3", 3240 | "language": "python", 3241 | "name": "python3" 3242 | }, 3243 | "language_info": { 3244 | 
"codemirror_mode": { 3245 | "name": "ipython", 3246 | "version": 3 3247 | }, 3248 | "file_extension": ".py", 3249 | "mimetype": "text/x-python", 3250 | "name": "python", 3251 | "nbconvert_exporter": "python", 3252 | "pygments_lexer": "ipython3", 3253 | "version": "3.6.3" 3254 | } 3255 | }, 3256 | "nbformat": 4, 3257 | "nbformat_minor": 2 3258 | } 3259 | -------------------------------------------------------------------------------- /output_13_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/data-science-salary-forecast/53941e9cab2871eb9004b437f5bdde8a14886b83/output_13_0.png -------------------------------------------------------------------------------- /spider/shanghai_jobs_discovery.py: -------------------------------------------------------------------------------- 1 | # common imports 2 | import requests 3 | from lxml import etree 4 | import time 5 | import random 6 | import pymongo 7 | from retrying import retry 8 | 9 | 10 | # --------------------- 11 | 12 | # 页面获取函数 13 | def get_page(page, keyword): 14 | header = { 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 16 | 'Chrome/69.0.3497.12 Safari/537.36 ' 17 | } 18 | print('正在爬取第', page, '页') 19 | url = 'https://www.zhipin.com/c101020100/?query={k}&page={page}&ka=page-{page}'.format(page=page, k=keyword) 20 | response = requests.get(url, headers=header) 21 | return response.text 22 | 23 | 24 | # -------------- 25 | @retry(wait_fixed=8000) 26 | def job_detail(link): 27 | header = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 29 | 'Chrome/69.0.3497.12 Safari/537.36 ' 30 | } 31 | response = requests.get(link, headers=header) 32 | data = etree.HTML(response.text) 33 | 34 | # ---检验是否出现验证码 35 | tips = data.xpath('/html/head/title/text()') 36 | tips_title = 'BOSS直聘验证码' 37 | if tips[0] == tips_title: 38 | 
print('检查是否弹出验证码') 39 | # 弹出验证码则引发IOError来进行循环 40 | raise IOError 41 | # ---------------------- 42 | job_desc = data.xpath('//*[@id="main"]/div[3]/div/div[2]/div[3]/div[@class="job-sec"][1]/div/text()') 43 | 44 | jd = "".join(job_desc).strip() 45 | return jd 46 | 47 | 48 | def parse_page(html, keyword, page): 49 | # 观察数据结构可得 50 | data = etree.HTML(html) 51 | if page == 1: 52 | items = data.xpath('//*[@id="main"]/div/div[3]/ul/li') 53 | else: 54 | items = data.xpath('//*[@id="main"]/div/div[2]/ul/li') 55 | for item in items: 56 | district = item.xpath('./div/div[1]/p/text()[1]')[0] 57 | job_links = item.xpath('./div/div[1]/h3/a/@href')[0] 58 | job_title = item.xpath('./div/div[1]/h3/a/div[1]/text()')[0] 59 | job_salary = item.xpath('./div/div[1]/h3/a/span/text()')[0] 60 | job_company = item.xpath('./div/div[2]/div/h3/a/text()')[0] 61 | job_experience = item.xpath('./div/div[1]/p/text()[2]')[0] 62 | job_degree = item.xpath('./div/div[1]/p/text()[3]')[0] 63 | fin_status = item.xpath('./div/div[2]/div/p/text()[2]')[0] 64 | try: 65 | company_scale = item.xpath('./div/div[2]/div/p/text()[3]')[0] 66 | except Exception: 67 | company_scale = item.xpath('./div/div[2]/div/p/text()[2]')[0] 68 | job_link = host + job_links 69 | # print(job_link) 70 | # 获取职位描述 71 | detail = job_detail(job_link) 72 | # --------------- 73 | job = { 74 | 'Keyword': keyword, 75 | '地区': district, 76 | '职位名称': job_title, 77 | '职位薪资': job_salary, 78 | '公司名称': job_company, 79 | '工作经验': job_experience, 80 | '学历要求': job_degree, 81 | '公司规模': company_scale, 82 | '融资情况': fin_status, 83 | '职位描述': detail, 84 | } 85 | print(job) 86 | save_to_mongo(job) 87 | time.sleep(random.randint(6, 9)) 88 | # --------------------------------------- 89 | 90 | 91 | # 连接到MongoDB 92 | MONGO_URL = 'localhost' 93 | MONGO_DB = 'Graduation_project' 94 | MONGO_COLLECTION = 'shanghai_discovery' 95 | client = pymongo.MongoClient(MONGO_URL, port=27017) 96 | db = client[MONGO_DB] 97 | 98 | 99 | def save_to_mongo(data): 100 | # 
保存到MongoDB中 101 | try: 102 | if db[MONGO_COLLECTION].insert(data): 103 | print('存储到 MongoDB 成功') 104 | except Exception: 105 | print('存储到 MongoDB 失败') 106 | 107 | 108 | if __name__ == '__main__': 109 | MAX_PAGE = 10 110 | host = 'https://www.zhipin.com' 111 | keywords = ['数据分析', '数据挖掘', '商业分析', '机器学习'] 112 | for keyword in keywords: 113 | for i in range(1, MAX_PAGE + 1): 114 | html = get_page(i, keyword) 115 | # ------------ 解析数据 --------------- 116 | parse_page(html, keyword, i) 117 | print('-' * 100) 118 | # ----------------- 119 | timewait = random.randint(15, 18) 120 | time.sleep(timewait) 121 | print('等待', timewait, '秒') 122 | --------------------------------------------------------------------------------