├── .gitignore
├── 实战篇
│   ├── 3.电力窃漏电用户自动识别
│   │   ├── README.md
│   │   ├── data
│   │   │   ├── model.xls
│   │   │   ├── missing_data.xls
│   │   │   ├── ═╪╒╣╦╝┐╝╤∙▒╛╩¤╛▌.xls
│   │   │   └── missing_data_processed.xls
│   │   └── MODEL1.ipynb
│   ├── 1.数据探索
│   │   ├── img
│   │   │   ├── img_1.png
│   │   │   ├── overview.png
│   │   │   ├── programmer_2.png
│   │   │   └── programmer_3.png
│   │   ├── output_23_1.png
│   │   ├── output_25_0.png
│   │   ├── output_27_0.png
│   │   ├── output_29_0.png
│   │   ├── output_41_0.png
│   │   ├── output_9_1.png
│   │   ├── data
│   │   │   ├── catering_sale.xls
│   │   │   ├── catering_sale_all.xls
│   │   │   └── catering_dish_profit.xls
│   │   ├── 数据探索_part02.ipynb
│   │   └── README.md
│   ├── 2.数据预处理
│   │   ├── tmp
│   │   │   └── sales.xls
│   │   ├── data
│   │   │   ├── leleccum.mat
│   │   │   ├── catering_sale.xls
│   │   │   ├── electricity_data.xls
│   │   │   ├── normalization_data.xls
│   │   │   ├── discretization_data.xls
│   │   │   └── principal_component.xls
│   │   ├── 数据预处理介绍
│   │   │   ├── output_37_1.png
│   │   │   ├── output_37_2.png
│   │   │   └── output_37_3.png
│   │   └── 数据预处理_part02.ipynb
│   └── 4.地震后建筑修复建议预测
│       ├── img
│       │   ├── output_13_2.png
│       │   ├── output_17_2.png
│       │   ├── output_20_1.png
│       │   ├── output_25_2.png
│       │   ├── output_26_2.png
│       │   ├── output_5_2.png
│       │   └── output_73_2.png
│       └── README.md
├── img
│   ├── 1.jpg
│   ├── 2.jpg
│   ├── 3.jpg
│   ├── 4.jpg
│   ├── 5.jpg
│   ├── 6.jpg
│   ├── 7.jpg
│   ├── 8.jpg
│   ├── 9.jpg
│   ├── 10.jpg
│   ├── 11.jpg
│   ├── 12.jpg
│   ├── 13.jpg
│   ├── 数据科学算法.png
│   ├── modeling_total.png
│   └── feature_engineering_total.png
├── 高级特征工程
│   ├── img
│   │   ├── pe.png
│   │   ├── NMF.png
│   │   ├── sele.png
│   │   ├── fusion.png
│   │   ├── kfold.jpg
│   │   ├── model1.png
│   │   ├── model12.png
│   │   ├── model2.png
│   │   ├── NMF_note.png
│   │   ├── leaveone.jpg
│   │   ├── notebook.png
│   │   ├── bagging_code.png
│   │   ├── dropconnect.png
│   │   ├── interaction1.png
│   │   ├── interaction2.png
│   │   ├── kfold_code.jpg
│   │   ├── model_best.png
│   │   ├── model_weight.png
│   │   ├── reminder_set.png
│   │   ├── weight_based.png
│   │   ├── label_encoding.jpg
│   │   ├── mean_encoding.jpg
│   │   ├── modeling_total.png
│   │   ├── residual_error.png
│   │   ├── residual_pred.png
│   │   ├── stacking_data.png
│   │   ├── stacking_data2.png
│   │   ├── stacking_past.png
│   │   ├── interaction_tree.png
│   │   ├── residual_new_pred.png
│   │   ├── tree_interaction.png
│   │   ├── interge_interaction.png
│   │   ├── statistic_ctr_data.png
│   │   ├── statistic_ctr_data2.png
│   │   ├── statistic_ctr_data_code.png
│   │   └── feature_engineering_total.png
│   ├── Tips and tricks.ipynb
│   ├── Emsembling.ipynb
│   ├── Hyperparameter tuning.ipynb
│   ├── Advanced Feature Engineering II.ipynb
│   └── Advanced Feature Engineering I.ipynb
├── 1.环境安装.md
├── README.md
├── 5.Pandas学习笔记.md
├── 8.SKlearn模型评估方法.md
├── 2.KMeans算法与交通事故理赔审核预测.md
├── 9.Kaggle杂记.md
├── 4.NumPy学习笔记.md
├── 6.数据预处理笔记.md
├── 7.机器学习部分.md
└── 3.Matplotlib学习笔记.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 |
--------------------------------------------------------------------------------
/实战篇/3.电力窃漏电用户自动识别/README.md:
--------------------------------------------------------------------------------
1 | # Automatic identification of users who steal or leak electricity
2 | This is a case study from the book "Python数据分析与挖掘实战" (Python Data Analysis and Mining in Action)
3 |
--------------------------------------------------------------------------------
/img/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/1.jpg
--------------------------------------------------------------------------------
/img/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/2.jpg
--------------------------------------------------------------------------------
/img/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/3.jpg
--------------------------------------------------------------------------------
/img/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/4.jpg
--------------------------------------------------------------------------------
/img/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/5.jpg
--------------------------------------------------------------------------------
/img/6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/6.jpg
--------------------------------------------------------------------------------
/img/7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/7.jpg
--------------------------------------------------------------------------------
/img/8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/8.jpg
--------------------------------------------------------------------------------
/img/9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/9.jpg
--------------------------------------------------------------------------------
/img/10.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/10.jpg
--------------------------------------------------------------------------------
/img/11.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/11.jpg
--------------------------------------------------------------------------------
/img/12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/12.jpg
--------------------------------------------------------------------------------
/img/13.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/13.jpg
--------------------------------------------------------------------------------
/img/数据科学算法.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/数据科学算法.png
--------------------------------------------------------------------------------
/高级特征工程/img/pe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/pe.png
--------------------------------------------------------------------------------
/高级特征工程/img/NMF.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/NMF.png
--------------------------------------------------------------------------------
/高级特征工程/img/sele.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/sele.png
--------------------------------------------------------------------------------
/img/modeling_total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/modeling_total.png
--------------------------------------------------------------------------------
/高级特征工程/img/fusion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/fusion.png
--------------------------------------------------------------------------------
/高级特征工程/img/kfold.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/kfold.jpg
--------------------------------------------------------------------------------
/高级特征工程/img/model1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/model1.png
--------------------------------------------------------------------------------
/高级特征工程/img/model12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/model12.png
--------------------------------------------------------------------------------
/高级特征工程/img/model2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/model2.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/img/img_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/img/img_1.png
--------------------------------------------------------------------------------
/高级特征工程/img/NMF_note.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/NMF_note.png
--------------------------------------------------------------------------------
/高级特征工程/img/leaveone.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/leaveone.jpg
--------------------------------------------------------------------------------
/高级特征工程/img/notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/notebook.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/img/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/img/overview.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/output_23_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/output_23_1.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/output_25_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/output_25_0.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/output_27_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/output_27_0.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/output_29_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/output_29_0.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/output_41_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/output_41_0.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/output_9_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/output_9_1.png
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/tmp/sales.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/tmp/sales.xls
--------------------------------------------------------------------------------
/高级特征工程/img/bagging_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/bagging_code.png
--------------------------------------------------------------------------------
/高级特征工程/img/dropconnect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/dropconnect.png
--------------------------------------------------------------------------------
/高级特征工程/img/interaction1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/interaction1.png
--------------------------------------------------------------------------------
/高级特征工程/img/interaction2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/interaction2.png
--------------------------------------------------------------------------------
/高级特征工程/img/kfold_code.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/kfold_code.jpg
--------------------------------------------------------------------------------
/高级特征工程/img/model_best.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/model_best.png
--------------------------------------------------------------------------------
/高级特征工程/img/model_weight.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/model_weight.png
--------------------------------------------------------------------------------
/高级特征工程/img/reminder_set.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/reminder_set.png
--------------------------------------------------------------------------------
/高级特征工程/img/weight_based.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/weight_based.png
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/data/leleccum.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/data/leleccum.mat
--------------------------------------------------------------------------------
/高级特征工程/img/label_encoding.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/label_encoding.jpg
--------------------------------------------------------------------------------
/高级特征工程/img/mean_encoding.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/mean_encoding.jpg
--------------------------------------------------------------------------------
/高级特征工程/img/modeling_total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/modeling_total.png
--------------------------------------------------------------------------------
/高级特征工程/img/residual_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/residual_error.png
--------------------------------------------------------------------------------
/高级特征工程/img/residual_pred.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/residual_pred.png
--------------------------------------------------------------------------------
/高级特征工程/img/stacking_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/stacking_data.png
--------------------------------------------------------------------------------
/高级特征工程/img/stacking_data2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/stacking_data2.png
--------------------------------------------------------------------------------
/高级特征工程/img/stacking_past.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/stacking_past.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/img/programmer_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/img/programmer_2.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/img/programmer_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/img/programmer_3.png
--------------------------------------------------------------------------------
/实战篇/3.电力窃漏电用户自动识别/data/model.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/3.电力窃漏电用户自动识别/data/model.xls
--------------------------------------------------------------------------------
/高级特征工程/img/interaction_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/interaction_tree.png
--------------------------------------------------------------------------------
/高级特征工程/img/residual_new_pred.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/residual_new_pred.png
--------------------------------------------------------------------------------
/高级特征工程/img/tree_interaction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/tree_interaction.png
--------------------------------------------------------------------------------
/img/feature_engineering_total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/img/feature_engineering_total.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/data/catering_sale.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/data/catering_sale.xls
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/data/catering_sale.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/data/catering_sale.xls
--------------------------------------------------------------------------------
/高级特征工程/img/interge_interaction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/interge_interaction.png
--------------------------------------------------------------------------------
/高级特征工程/img/statistic_ctr_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/statistic_ctr_data.png
--------------------------------------------------------------------------------
/高级特征工程/img/statistic_ctr_data2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/statistic_ctr_data2.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/data/catering_sale_all.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/data/catering_sale_all.xls
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/data/electricity_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/data/electricity_data.xls
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/数据预处理介绍/output_37_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/数据预处理介绍/output_37_1.png
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/数据预处理介绍/output_37_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/数据预处理介绍/output_37_2.png
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/数据预处理介绍/output_37_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/数据预处理介绍/output_37_3.png
--------------------------------------------------------------------------------
/实战篇/4.地震后建筑修复建议预测/img/output_13_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/4.地震后建筑修复建议预测/img/output_13_2.png
--------------------------------------------------------------------------------
/实战篇/4.地震后建筑修复建议预测/img/output_17_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/4.地震后建筑修复建议预测/img/output_17_2.png
--------------------------------------------------------------------------------
/实战篇/4.地震后建筑修复建议预测/img/output_20_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/4.地震后建筑修复建议预测/img/output_20_1.png
--------------------------------------------------------------------------------
/实战篇/4.地震后建筑修复建议预测/img/output_25_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/4.地震后建筑修复建议预测/img/output_25_2.png
--------------------------------------------------------------------------------
/实战篇/4.地震后建筑修复建议预测/img/output_26_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/4.地震后建筑修复建议预测/img/output_26_2.png
--------------------------------------------------------------------------------
/实战篇/4.地震后建筑修复建议预测/img/output_5_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/4.地震后建筑修复建议预测/img/output_5_2.png
--------------------------------------------------------------------------------
/实战篇/4.地震后建筑修复建议预测/img/output_73_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/4.地震后建筑修复建议预测/img/output_73_2.png
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/data/normalization_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/data/normalization_data.xls
--------------------------------------------------------------------------------
/实战篇/3.电力窃漏电用户自动识别/data/missing_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/3.电力窃漏电用户自动识别/data/missing_data.xls
--------------------------------------------------------------------------------
/高级特征工程/img/statistic_ctr_data_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/statistic_ctr_data_code.png
--------------------------------------------------------------------------------
/实战篇/1.数据探索/data/catering_dish_profit.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/1.数据探索/data/catering_dish_profit.xls
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/data/discretization_data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/data/discretization_data.xls
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/data/principal_component.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/2.数据预处理/data/principal_component.xls
--------------------------------------------------------------------------------
/高级特征工程/img/feature_engineering_total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/高级特征工程/img/feature_engineering_total.png
--------------------------------------------------------------------------------
/实战篇/3.电力窃漏电用户自动识别/data/═╪╒╣╦╝┐╝╤∙▒╛╩¤╛▌.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/3.电力窃漏电用户自动识别/data/═╪╒╣╦╝┐╝╤∙▒╛╩¤╛▌.xls
--------------------------------------------------------------------------------
/实战篇/3.电力窃漏电用户自动识别/data/missing_data_processed.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wmpscc/DataMiningNotesAndPractice/HEAD/实战篇/3.电力窃漏电用户自动识别/data/missing_data_processed.xls
--------------------------------------------------------------------------------
/实战篇/2.数据预处理/数据预处理_part02.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "**Python 主要数据预处理函数**\n",
8 | "\n",
9 | "|函数名|函数功能|所属库|\n",
10 | "|:-:|:-:|:-:|\n",
11 | "|interpolate|一维、高维数据插值|Scipy|\n",
12 | "|unique|去除数据中的重复元素,得到单值元素列表,它是对象的方法名|Pandas/Numpy|\n",
13 | "|isnull|判断是否空值|Pandas|\n",
14 | "|notnull|判断是否非空值|Pandas|\n",
15 | "|PCA|对指标变量矩阵进行主成分分析|Scikit-Learn|\n",
16 | "|random|生成随机矩阵|Numpy|"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {
23 | "collapsed": true
24 | },
25 | "outputs": [],
26 | "source": []
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": []
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {
41 | "collapsed": true
42 | },
43 | "outputs": [],
44 | "source": []
45 | }
46 | ],
47 | "metadata": {
48 | "kernelspec": {
49 | "display_name": "Python 3",
50 | "language": "python",
51 | "name": "python3"
52 | },
53 | "language_info": {
54 | "codemirror_mode": {
55 | "name": "ipython",
56 | "version": 3
57 | },
58 | "file_extension": ".py",
59 | "mimetype": "text/x-python",
60 | "name": "python",
61 | "nbconvert_exporter": "python",
62 | "pygments_lexer": "ipython3",
63 | "version": "3.5.4"
64 | }
65 | },
66 | "nbformat": 4,
67 | "nbformat_minor": 2
68 | }
69 |
--------------------------------------------------------------------------------
/1.环境安装.md:
--------------------------------------------------------------------------------
1 | This article walks you through setting up the environment needed for scientific computing with Python.
2 |
3 | ### Python
4 | As an aside: if you have not learned Python yet but want to follow along, feel free to read my other article, [Python与C语言的异同](https://mp.weixin.qq.com/s?__biz=MzUzOTczMTQyOA==&mid=2247483659&idx=1&sn=881b83f05b883d6b4814ae26dbddf7a9&chksm=fac2b580cdb53c960c806e1c28b3034840059c9d5726078d23dc38942e2dccb8e7e74fb2516a#rd)
5 | ### 0.About Anaconda
6 | Anaconda is a Python distribution for scientific computing. It supports Linux, Mac and Windows, provides package and environment management, and makes it easy to keep several Python versions side by side, switch between them, and install third-party packages. Anaconda uses the tool/command conda to manage packages and environments, and it already ships with Python and the related tooling.
7 |
8 | ### 1.Downloading and installing Anaconda
9 | Note: this walkthrough uses the 64-bit Windows 10 installer; if your system is 32-bit, download the 32-Bit version instead.
10 | - Go to the Anaconda website and download the installer: `https://www.anaconda.com/download/`
11 | 
12 | - If your download is very slow, you can also get it from the Tsinghua mirror site.
13 | 
14 |
15 | - Once downloaded, install it like any other piece of software, but it is recommended to tick this option. Also, if Python has never been installed on this machine before, tick the option below it as well.
16 | 
17 |
18 | - When the installation finishes, click it in the Start menu
19 | 
20 | It is slow to open, so wait a moment after clicking.
21 | ### 2.Installing the necessary libraries
22 | - Next, install a few necessary libraries
23 | 
24 | For convenience later on, it is recommended to install all three.
25 | - Anaconda's default package channels are hosted abroad. If you cannot stand the painfully slow speed, you can point the default channels at a domestic mirror. The commands below switch them to the Tsinghua mirror.
26 | ```
27 | conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
28 | conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
29 | conda config --set show_channel_urls yes
30 | ```
31 | Open CMD and enter the commands above one at a time.
32 | ### 3.Updating
33 | - The steps above install the libraries commonly needed for scientific computing in Python, but some of them may not be the latest versions, so it is recommended to update everything first.
34 | Enter the following command in CMD
35 | ```
36 | conda update --all
37 | ```
38 | 
39 |
40 | ### 4.Testing the installation with Jupyter Notebook
41 | - Open Jupyter Notebook from the Start menu
42 | 
43 | - Clicking it should open a web page; if it does not, that is fine, we can open it manually
44 | 
45 | Copy this link into your browser to open it.
46 | - Create a new Python 3 file
47 | 
48 | - Click the newly created file (the first one is called `Untitled1` by default)
49 | Enter the following code and click RUN. If no error is reported, the installation succeeded.
50 | 
51 |
52 | ### 5.A quick tour of Jupyter notebook
53 | - A few buttons
54 | 
55 | - Insert and Run. The names say what they do; try them yourself.
56 | 
57 | 
58 | - There are many keyboard shortcuts; here are the most commonly used ones
59 | - Shift-Enter : run the current cell and select the next one
60 | - Ctrl-Enter : run the current cell
61 | - Alt-Enter : run the current cell and insert a new cell below it
62 | - Shift-Tab : show help; for some functions, classes and methods the signature is shown, and appending ? and running again shows more detailed help
63 | - Tab : code completion
64 | - ESC : enter command mode
65 | - A : in command mode, insert a cell above
66 | - B : in command mode, insert a cell below
67 |
68 | ### 6.Summary
69 | Installation is as simple as that. In the next few articles I will share some hands-on work on small competitions.
70 |
71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DataMiningNotesAndPractice
2 | Notes on the tricks I have accumulated in data mining, from getting started to hands-on practice.
3 | Much of the code was copied straight out of projects, so it is a bit messy; apologies.
4 | ## Contents
5 | - [配置科学计算环境](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/1.%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85.md)
6 | - [KMeans算法与交通事故理赔审核预测](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/2.KMeans%E7%AE%97%E6%B3%95%E4%B8%8E%E4%BA%A4%E9%80%9A%E4%BA%8B%E6%95%85%E7%90%86%E8%B5%94%E5%AE%A1%E6%A0%B8%E9%A2%84%E6%B5%8B.md)
7 | - [Matplotlib学习笔记](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/3.Matplotlib%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0.md)
8 | - [NumPy学习笔记](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/4.NumPy%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0.md)
9 | - [Pandas学习笔记](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/5.Pandas%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0.md)
10 | - [数据预处理笔记](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/6.%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86%E7%AC%94%E8%AE%B0.md)
11 | - [机器学习部分](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/7.%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E9%83%A8%E5%88%86.md)
12 | - [SKlearn模型评估方法](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/8.SKlearn%E6%A8%A1%E5%9E%8B%E8%AF%84%E4%BC%B0%E6%96%B9%E6%B3%95.md)
13 | - [Kaggle杂记](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/9.Kaggle%E6%9D%82%E8%AE%B0.md)
14 | ## 实战篇 (hands-on practice)
15 | - 1.[数据探索](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E5%AE%9E%E6%88%98%E7%AF%87/1.%E6%95%B0%E6%8D%AE%E6%8E%A2%E7%B4%A2/README.md)
16 | - 2.[数据预处理](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E5%AE%9E%E6%88%98%E7%AF%87/2.%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86/README.md)
17 | - 3.[电力窃漏电用户自动识别](https://github.com/wmpscc/DataMiningNotesAndPractice/tree/master/%E5%AE%9E%E6%88%98%E7%AF%87/3.%E7%94%B5%E5%8A%9B%E7%AA%83%E6%BC%8F%E7%94%B5%E7%94%A8%E6%88%B7%E8%87%AA%E5%8A%A8%E8%AF%86%E5%88%AB/README.md)
18 | - 4.[地震后建筑修复建议预测](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E5%AE%9E%E6%88%98%E7%AF%87/4.%E5%9C%B0%E9%9C%87%E5%90%8E%E5%BB%BA%E7%AD%91%E4%BF%AE%E5%A4%8D%E5%BB%BA%E8%AE%AE%E9%A2%84%E6%B5%8B/README.md)
19 | - 5.[Kaggle Titanic](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E5%AE%9E%E6%88%98%E7%AF%87/5.Titanic/Kaggle%20Titanic%20Best%20Score.ipynb)
20 |
21 | ## 高级特征工程 (advanced feature engineering)
22 | This part contains my notes for the Coursera course [How to Win a Data Science Competition: Learn from Top Kagglers](https://www.coursera.org/learn/competitive-data-science/home/welcome). After downloading, open the notebooks with jupyter notebook.
23 | - [Tips and tricks](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E9%AB%98%E7%BA%A7%E7%89%B9%E5%BE%81%E5%B7%A5%E7%A8%8B/Tips%20and%20tricks.ipynb)
24 | - [Advanced Feature Engineering I](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E9%AB%98%E7%BA%A7%E7%89%B9%E5%BE%81%E5%B7%A5%E7%A8%8B/Advanced%20Feature%20Engineering%20I.ipynb)
25 | - [Hyperparameter tuning](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E9%AB%98%E7%BA%A7%E7%89%B9%E5%BE%81%E5%B7%A5%E7%A8%8B/Hyperparameter%20tuning.ipynb)
26 | - [Advanced Feature Engineering II](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E9%AB%98%E7%BA%A7%E7%89%B9%E5%BE%81%E5%B7%A5%E7%A8%8B/Advanced%20Feature%20Engineering%20II.ipynb)
27 | - [Emsembling](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E9%AB%98%E7%BA%A7%E7%89%B9%E5%BE%81%E5%B7%A5%E7%A8%8B/Emsembling.ipynb)
28 | - feature engineering
29 | 
30 | - modeling
31 | 
32 |
--------------------------------------------------------------------------------
/实战篇/1.数据探索/数据探索_part02.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Python 主要数据探索函数"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### Pandas统计函数"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "**Pandas主要统计特征函数**\n",
22 | "\n",
23 | "|方法名|函数功能|\n",
24 |     "|:-:|:-:|\n",
25 | "|sum()|计算数据样本的总和(按列计算)|\n",
26 | "|mean()|计算数据样本的算数平均数|\n",
27 | "|var()|计算数据样本的方差|\n",
28 | "|std()|计算数据样本的标准差|\n",
29 | "|corr()|计算数据样本的Spearman(Pearson)相关系数矩阵|\n",
30 | "|cov()|计算数据样本的协方差矩阵|\n",
31 | "|skew()|样本值的偏度(三阶矩)|\n",
32 | "|kurt()|样本值的峰度(四阶矩)|\n",
33 | "|describe()|给出样本的基本描述(基本统计量如均值、标注差等)|"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "**Pandas累积统计特征函数**\n",
41 | "\n",
42 | "|方法名|函数功能|\n",
43 | "|:-:|:-:|\n",
44 | "|cumsum(`n`)|依次给出前n个数的和|\n",
45 | "|cumprod(`n`)|依次给出前n个数的积|\n",
46 | "|cummax(`n`)|依次给出前n个数的最大值|\n",
47 | "|cummin(`n`)|依次给出前n个数的最小值|"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "**Pandas滚动统计特征函数**\n",
55 | "\n",
56 | "|方法名|函数功能|\n",
57 | "|:-:|:-:|\n",
58 | "|rolling_sum()|计算数据样本的总和(按列计算)|\n",
59 | "|rolling_mean()|数据样本的算数平均数|\n",
60 | "|rolling_var()|计算数据样本的方差|\n",
61 | "|rolling_std()|计算数据样本的标准差|\n",
62 | "|rolling_corr()|计算数据样本的Spearman(Pearson)相关系数矩阵|\n",
63 | "|rolling_cov()|计算数据样本的协方差矩阵|\n",
64 | "|rolling_skew()|样本值的偏度(三阶矩)|\n",
65 | "|rolling_kurt()|样本的峰度(四阶矩)|\n",
66 | "\n",
67 | "调用方法:pd.rolling_mean(D, k),意思是每k列计算依次均值,滚动计算。\n"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "### Pandas绘图函数\n",
75 | "Pandas 基于 Matplotlib并对某些命令进行了简化,因此作图通常是 Matplotlib 和 Pandas 相互结合着使用。"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "**Pandas主要统计作图函数**\n",
83 | "\n",
84 | "|函数名|函数功能|\n",
85 | "|:-:|:-:|\n",
86 | "|plot()|绘制线性二维图,折线图|\n",
87 | "|pie()|绘制饼形图|\n",
88 | "|hist()|绘制二维条形直方图,可显示数据的分配情况|\n",
89 | "|boxplot()|绘制样本数据的箱型图|\n",
90 | "|plot(logy = True)|绘制y轴的对数图形|\n",
91 | "|plot(yerr = error)|绘制误差条形图|"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "## 总结\n",
99 | "- 1.数据质量分析要求我们拿到数据后先检测是否存在缺失值和异常值;\n",
100 |     "- 2.数据特征分析要求我们在数据挖掘建模前,通过频率分布分析、对比分析、帕累托分析、周期性分析、相关性分析等方法,对采集的样本数据的特征规律进行分析,以了解数据的规律和趋势,为数据挖掘的后续环节提供支持。"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": true
108 | },
109 | "outputs": [],
110 | "source": []
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {
116 | "collapsed": true
117 | },
118 | "outputs": [],
119 | "source": []
120 | }
121 | ],
122 | "metadata": {
123 | "kernelspec": {
124 | "display_name": "Python 3",
125 | "language": "python",
126 | "name": "python3"
127 | },
128 | "language_info": {
129 | "codemirror_mode": {
130 | "name": "ipython",
131 | "version": 3
132 | },
133 | "file_extension": ".py",
134 | "mimetype": "text/x-python",
135 | "name": "python",
136 | "nbconvert_exporter": "python",
137 | "pygments_lexer": "ipython3",
138 | "version": "3.5.4"
139 | }
140 | },
141 | "nbformat": 4,
142 | "nbformat_minor": 2
143 | }
144 |
--------------------------------------------------------------------------------
/5.Pandas学习笔记.md:
--------------------------------------------------------------------------------
1 | ## Pandas methods
2 | ### pd.read_csv(csv_path) Read a csv file
3 | Reads a csv file; in the sections below, `~` stands for the DataFrame it returns
4 | ### ~head() Get the first five rows
5 | Handy for a quick look at the data.
6 | ### ~info() Get a quick description of the data
7 | Shows the total number of rows, each attribute's type, and the number of non-null values.
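A minimal usage sketch (assuming, as in the other snippets in these notes, that a DataFrame named `housing` has already been loaded with `pd.read_csv`):

``` Python
housing.head()   # first five rows, for a quick look
housing.info()   # row count, dtype of every column, and number of non-null values per column
```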
8 | ### ~value_counts() Count how many times each value occurs
9 |
10 | ``` Python
11 | housing["ocean_proximity"].value_counts()
12 |
13 | # Output
14 | <1H OCEAN 9136
15 | INLAND 6551
16 | NEAR OCEAN 2658
17 | NEAR BAY 2290
18 | ISLAND 5
19 | Name: ocean_proximity, dtype: int64
20 | ```
21 | ### pd.set_option() Set a specified option
22 | [More details](http://python.usyiyi.cn/documents/Pandas_0j2/generated/pandas.set_option.html)
23 |
24 | Set the maximum number of rows to display
25 | ``` Python
26 | pd.set_option('max_rows', 7)
27 | ```
28 |
29 |
30 | ### ~describe() Show a brief numeric summary of the data
31 | For example: count, mean, standard deviation, min and max, and the 25%/50%/75% quantiles
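For example (same assumed `housing` DataFrame as above):

``` Python
housing.describe()   # count, mean, std, min, 25%/50%/75% quantiles and max for every numeric column
```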
32 |
33 | ### ~hist() Plot every attribute as a histogram
34 | The hist() method relies on Matplotlib, which in turn relies on a user-specified graphics backend to draw on your screen. In a Jupyter notebook you can use "%matplotlib inline" to tell Jupyter to let Matplotlib use Jupyter's own backend.
35 |
36 | In Jupyter, calling show() is optional, because a figure is drawn automatically whenever there is graphical output.
37 | ``` Python
38 | %matplotlib inline # only in a Jupyter notebook
39 | import matplotlib.pyplot as plt
40 | housing.hist(bins=50, figsize=(20,15))
41 | save_fig("attribute_histogram_plots")
42 | plt.show()
43 | ```
44 |
45 | ### ~loc[] Purely label-based indexer
46 |
47 | ``` Python
48 | strat_train_set = housing.loc[train_index]
49 | strat_test_set = housing.loc[test_index]
50 | ```
51 |
52 | ### ~where() Replace values depending on a condition evaluated on the values themselves
53 | ``` Python
54 | housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
55 | ```
56 | - cond: where True the original value is kept; where False the value is replaced by the second argument, other.
57 | - other: the replacement value
58 | - inplace: whether to perform the operation on the data in place
59 |
60 | ### pandas.DataFrame() Pandas table
61 | Has labeled axes, and arithmetic operations align on both the row and the column labels.
62 | ``` Python
63 | compare_props = pd.DataFrame({
64 | "Overall": income_cat_proportions(housing),
65 | "Stratified": income_cat_proportions(strat_test_set),
66 | "Random": income_cat_proportions(test_set),
67 | }).sort_index()
68 | compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
69 | compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
70 | ```
71 |
72 | ### ~drop()
73 | Returns a new object with the requested axis labels removed.
74 | ``` Python
75 | for set_ in (strat_train_set, strat_test_set):
76 | set_.drop("income_cat", axis=1, inplace=True)
77 | ```
78 | - labels: index or column labels
79 | - axis: whether to drop from the index (0) or the columns (1)
80 | - inplace: if True, operate on the original data
81 |
82 | ### ~corr() Compute correlation coefficients
83 | - method: one of {‘pearson’, ‘kendall’, ‘spearman’}
84 | - pearson : standard correlation coefficient
85 | - kendall : Kendall Tau correlation coefficient
86 | - spearman : Spearman rank correlation
87 | - min_periods: Minimum number of observations required per pair of columns to have a valid result. Currently only available for pearson and spearman correlation
88 |
89 | ``` Python
90 | # Compute the standard correlation coefficients
91 | corr_matrix = housing.corr()
92 | corr_matrix["median_house_value"].sort_values(ascending=False)
93 |
94 | # Output:
95 | # median_house_value 1.000000
96 | # median_income 0.687160
97 | # total_rooms 0.135097
98 | # housing_median_age 0.114110
99 | # households 0.064506
100 | # total_bedrooms 0.047689
101 | # population -0.026920
102 | # longitude -0.047432
103 | # latitude -0.142724
104 | # Name: median_house_value, dtype: float64
105 |
106 | ```
107 | ### scatter_matrix() Compare correlations visually
108 | ``` Python
109 | from pandas.plotting import scatter_matrix
110 |
111 | attributes = ["median_house_value", "median_income", "total_rooms",
112 | "housing_median_age"]
113 | scatter_matrix(housing[attributes], figsize=(12, 8))
114 | save_fig("scatter_matrix_plot")
115 | ```
116 |
117 | ### ~dropna() Return the data with rows containing missing values dropped
118 | Return object with labels on given axis omitted where alternately any or all of the data are missing
119 | ``` Python
120 | sample_incomplete_rows.dropna(subset=["total_bedrooms"])
121 | ```
122 |
123 | ### ~fillna() Fill missing values using a specified method
124 | ``` Python
125 | # Fill with the median
126 | median = housing["total_bedrooms"].median()
127 | sample_incomplete_rows["total_bedrooms"].fillna(median, inplace=True)
128 | ```
129 | ### ~factorize() Convert the data into numeric codes
130 | ``` Python
131 | housing_cat = housing['ocean_proximity']
132 | housing_cat.head(10)
133 | # Output
134 | # 17606 <1H OCEAN
135 | # 18632 <1H OCEAN
136 | # 14650 NEAR OCEAN
137 | # 3230 INLAND
138 | # 3555 <1H OCEAN
139 | # 19480 INLAND
140 | # 8879 <1H OCEAN
141 | # 13685 INLAND
142 | # 4937 <1H OCEAN
143 | # 4861 <1H OCEAN
144 | # Name: ocean_proximity, dtype: object
145 |
146 | housing_cat_encoded, housing_categories = housing_cat.factorize()
147 | housing_cat_encoded[:10]
148 | # Output
149 | # array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)
150 | ```
--------------------------------------------------------------------------------
/8.SKlearn模型评估方法.md:
--------------------------------------------------------------------------------
1 | # SKlearn model evaluation methods
2 | ### Accuracy
3 | #### 1.accuracy_score
4 | ``` Python
5 | # Accuracy
6 | import numpy as np
7 | from sklearn.metrics import accuracy_score
8 | y_pred = [0, 2, 1, 3,9,9,8,5,8]
9 | y_true = [0, 1, 2, 3,2,6,3,5,9]
10 |
11 | accuracy_score(y_true, y_pred)
12 | Out[127]: 0.33333333333333331
13 |
14 | accuracy_score(y_true, y_pred, normalize=False) # with normalize=False, the number of correctly classified samples is returned instead of the fraction
15 | Out[128]: 3
16 | ```
17 | #### 2.metrics
18 | - Macro-averaging is usually more reasonable than micro-averaging, but micro-averaging is not useless either; which one to use depends on how the samples are distributed in the data set
19 | - Macro-averaging first computes the metric for each class separately and then takes the arithmetic mean over all classes.
20 | - Micro-averaging builds a global confusion matrix over every instance in the data set regardless of class, and then computes the metric from it.
21 | ``` Python
22 | from sklearn import metrics
23 | metrics.precision_score(y_true, y_pred, average='micro') # micro-averaged precision
24 | Out[130]: 0.33333333333333331
25 |
26 | metrics.precision_score(y_true, y_pred, average='macro') # macro-averaged precision
27 | Out[131]: 0.375
28 |
29 | metrics.precision_score(y_true, y_pred, labels=[0, 1, 2, 3], average='macro') # precision over a specified set of class labels
30 | Out[133]: 0.5
31 | ```
32 | - The average parameter accepts five values: (None, ‘micro’, ‘macro’, ‘weighted’, ‘samples’)
33 | ### Recall
34 | ``` Python
35 | metrics.recall_score(y_true, y_pred, average='micro')
36 | Out[134]: 0.33333333333333331
37 |
38 | metrics.recall_score(y_true, y_pred, average='macro')
39 | Out[135]: 0.3125
40 | ```
41 | ### F1
42 | ``` Python
43 | metrics.f1_score(y_true, y_pred, average='weighted')
44 | Out[136]: 0.37037037037037035
45 | ```
46 | ### F2
47 | Computed directly from its formula
48 | ``` Python
49 | from sklearn.metrics import precision_score, recall_score
50 | def calc_f2(label, predict):
51 | p = precision_score(label, predict)
52 | r = recall_score(label, predict)
53 | f2_score = 5*p*r / (4*p + r)
54 | return f2_score
55 | ```
56 | ### Confusion matrix
57 | ``` Python
58 |
59 | from sklearn.metrics import confusion_matrix
60 | confusion_matrix(y_true, y_pred)
61 |
62 | Out[137]:
63 | array([[1, 0, 0, ..., 0, 0, 0],
64 | [0, 0, 1, ..., 0, 0, 0],
65 | [0, 1, 0, ..., 0, 0, 1],
66 | ...,
67 | [0, 0, 0, ..., 0, 0, 1],
68 | [0, 0, 0, ..., 0, 0, 0],
69 | [0, 0, 0, ..., 0, 1, 0]])
70 | ```
71 | ### Classification report
72 | Includes: precision/recall/f1-score/averages/support for each class
73 | ``` Python
74 | # Classification report: precision/recall/f1-score/averages/support
75 | from sklearn.metrics import classification_report
76 | y_true = [0, 1, 2, 2, 0]
77 | y_pred = [0, 0, 2, 2, 0]
78 | target_names = ['class 0', 'class 1', 'class 2']
79 | print(classification_report(y_true, y_pred, target_names=target_names))
80 | ```
81 | Output
82 | ```
83 | precision recall f1-score support
84 |
85 | class 0 0.67 1.00 0.80 2
86 | class 1 0.00 0.00 0.00 1
87 | class 2 1.00 1.00 1.00 2
88 |
89 | avg / total 0.67 0.80 0.72 5
90 | ```
91 |
92 | ### kappa score
93 | - The kappa score is a number between -1 and 1. A score above 0.8 indicates good agreement; 0 or lower means the result is no better than random labeling
94 | ``` Python
95 | from sklearn.metrics import cohen_kappa_score
96 | y_true = [2, 0, 2, 2, 0, 1]
97 | y_pred = [0, 0, 2, 2, 0, 2]
98 | cohen_kappa_score(y_true, y_pred)
99 | ```
100 | ### ROC
101 | #### 1.Compute the ROC AUC score
102 | ``` Python
103 | import numpy as np
104 | from sklearn.metrics import roc_auc_score, roc_curve
105 | y_true = np.array([0, 0, 1, 1])
106 | y_scores = np.array([0.1, 0.4, 0.35, 0.8])
107 | roc_auc_score(y_true, y_scores)
108 | ```
109 | #### 2.ROC curve
110 | ``` Python
111 | y = np.array([1, 1, 2, 2])
112 | scores = np.array([0.1, 0.4, 0.35, 0.8])
113 | fpr, tpr, thresholds = roc_curve(y, scores, pos_label=2)
114 | ```
115 | ### Hamming loss
116 | ``` Python
117 | from sklearn.metrics import hamming_loss
118 | y_pred = [1, 2, 3, 4]
119 | y_true = [2, 2, 3, 4]
120 | hamming_loss(y_true, y_pred)
121 | 0.25
122 | ```
123 | ### Jaccard similarity score
124 | ``` Python
125 | import numpy as np
126 | from sklearn.metrics import jaccard_similarity_score
127 | y_pred = [0, 2, 1, 3,4]
128 | y_true = [0, 1, 2, 3,4]
129 | jaccard_similarity_score(y_true, y_pred)
130 | 0.5
131 | jaccard_similarity_score(y_true, y_pred, normalize=False)
132 | 2
133 | ```
134 | ### Explained variance score
135 | ``` Python
136 | from sklearn.metrics import explained_variance_score
137 | y_true = [3, -0.5, 2, 7]
138 | y_pred = [2.5, 0.0, 2, 8]
139 | explained_variance_score(y_true, y_pred)
140 | ```
141 | ### Mean absolute error
142 | ``` Python
143 | from sklearn.metrics import mean_absolute_error
144 | y_true = [3, -0.5, 2, 7]
145 | y_pred = [2.5, 0.0, 2, 8]
146 | mean_absolute_error(y_true, y_pred)
147 | ```
148 | ### Mean squared error
149 | ``` Python
150 | from sklearn.metrics import mean_squared_error
151 | y_true = [3, -0.5, 2, 7]
152 | y_pred = [2.5, 0.0, 2, 8]
153 | mean_squared_error(y_true, y_pred)
154 | ```
155 | ### Median absolute error
156 | ``` Python
157 | from sklearn.metrics import median_absolute_error
158 | y_true = [3, -0.5, 2, 7]
159 | y_pred = [2.5, 0.0, 2, 8]
160 | median_absolute_error(y_true, y_pred)
161 | ```
162 | ### R² score (coefficient of determination)
163 | ``` Python
164 | from sklearn.metrics import r2_score
165 | y_true = [3, -0.5, 2, 7]
166 | y_pred = [2.5, 0.0, 2, 8]
167 | r2_score(y_true, y_pred)
168 | ```
169 | # References
170 | - [python + sklearn ︱分类效果评估——acc、recall、F1、ROC、回归、距离](https://blog.csdn.net/sinat_26917383/article/details/75199996)
171 | - [sklearn中的模型评估](http://d0evi1.com/sklearn/model_evaluation/)
--------------------------------------------------------------------------------
/2.KMeans算法与交通事故理赔审核预测.md:
--------------------------------------------------------------------------------
1 |
2 | > The previous article covered [setting up the data mining environment](https://mp.weixin.qq.com/s?__biz=MzUzOTczMTQyOA==&mid=2247483674&idx=1&sn=97a4e277bd69caf303aa8c10a594bed1&chksm=fac2b591cdb53c87cc213c7083068bf72ed40b6ca32e92a0a9fae22520f8174bff137c6c4a76#rd). This time let's start with a small hands-on project: we will learn the KMeans algorithm and take on a competition on sofasofa (traffic-accident claim review prediction). Let's begin
3 |
4 | # K-means
5 | ### Introduction
6 | K-Means is a partition-based clustering method and one of the ten classic data mining algorithms. Partition-based methods divide the vector space spanned by the sample set into several regions, each with a center; by mapping every sample to its corresponding center, all samples are assigned to a cluster.
7 | 
8 | Suppose we have a sample set D={Xj}. The goal of the KMeans algorithm is to partition the data into K clusters S={S1,S2,...Sk} such that the within-cluster sum of squared errors of the partition is minimized.
9 | Objective function:
10 | where
11 | Ci is the center of the i-th subset of the partition.
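Written out explicitly (a reconstruction of the formula shown in the original image, based on the description above):

```latex
J(S_1,\dots,S_K) = \sum_{i=1}^{K} \sum_{X_j \in S_i} \lVert X_j - C_i \rVert^2 ,
\qquad C_i = \frac{1}{|S_i|} \sum_{X_j \in S_i} X_j
```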
12 |
13 | ### Solution procedure
14 | Minimizing this objective is an NP-hard problem, so there is no guarantee of finding the global optimum. The classic K-Means algorithm uses an iterative optimization strategy that generally consists of the following four steps
15 | - 1.Initialize the cluster centers
16 | - 2.Assign each sample xj to the nearest cluster, i.e. to the center Ci closest to xj among all centers Cp (p != i)
17 | - 3.Update the cluster centers based on the result of step 2.
18 | - 4.If the maximum number of iterations is reached, or the change between two iterations is smaller than a preset threshold, stop; otherwise repeat from step 2.
19 |
20 | 
21 | ### Improving the algorithm
22 | The classic K-means algorithm initializes the cluster centers by random sampling, which cannot guarantee the desired clustering result; one option is to train several models and keep the one that performs best. But is there a better way? The K-means++ algorithm proposed by David Arthur generates good initial cluster centers effectively.
23 | It first initializes one cluster center C1 at random; then it iteratively picks the point X* with the highest selection probability and adds it to the set of centers, repeating until k centers have been chosen.
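A minimal NumPy sketch of this seeding idea (illustrative only; the function name and details are mine, and in practice the `init="k-means++"` option of `sklearn.cluster.KMeans` used below does this for you):

``` Python
import numpy as np

def kmeans_pp_init(X, k, seed=0):
    # k-means++ style seeding: first center uniform, later centers sampled proportionally to squared distance
    rng = np.random.default_rng(seed)
    centers = [X[rng.integers(len(X))]]
    for _ in range(k - 1):
        # squared distance from every point to its nearest already-chosen center
        d2 = np.min(((X[:, None, :] - np.array(centers)[None, :, :]) ** 2).sum(axis=2), axis=1)
        # sample the next center with probability proportional to that squared distance
        centers.append(X[rng.choice(len(X), p=d2 / d2.sum())])
    return np.array(centers)
```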
24 |
25 | # Traffic-accident claim review prediction
26 | > SofaSofa hosts practice competitions aimed at data mining newcomers, and every competition provides the code of a few benchmark models for reference. Beginners who want to get up to speed quickly can learn a lot from the site.
27 |
28 | ### The task
29 | Link to the competition: http://sofasofa.io/competition.php?id=2
30 | - Task type: binary classification
31 | - Background: after a traffic collision (accident), a claims adjuster visits the scene and collects information, which largely determines whether the car owner gets compensated by the insurance company. The training set contains 36 pieces of (already encoded) information collected on site for each party, together with whether that party was eventually compensated. Our task is to predict, from these 36 fields, the probability that the party was not compensated.
32 | - Data description:
33 | 
34 |
35 | - Evaluation metric: Precision-Recall AUC
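(Not from the original article, but useful to know: the competition metric can be computed locally with scikit-learn, e.g.)

``` Python
from sklearn.metrics import average_precision_score
y_valid = [0, 0, 1, 1]            # labels of a local hold-out split (toy values)
p_valid = [0.1, 0.4, 0.35, 0.8]   # predicted probabilities of the positive class
pr_auc = average_precision_score(y_valid, p_valid)   # Precision-Recall AUC (average precision)
```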
36 |
37 | ### Code
38 | Download the data set from the official site and unzip it locally, then open jupyter notebook and get started.
39 | First import the necessary packages
40 | ``` Python
41 | import pandas as pd
42 | import numpy as np
43 | import os
44 | import matplotlib.pyplot as plt
45 | %matplotlib inline
46 | ```
47 | Read in the data sets
48 | ``` Python
49 | homePath = "data"
50 | trainPath = os.path.join(homePath, "train.csv")
51 | testPath = os.path.join(homePath, "test.csv")
52 | submitPath = os.path.join(homePath, "sample_submit.csv")
53 | trainData = pd.read_csv(trainPath)
54 | testData = pd.read_csv(testPath)
55 | submitData = pd.read_csv(submitPath)
56 | ```
57 | According to the data description, the CaseId column is just a meaningless identifier, so we drop it here.
58 | - The ~drop() function: `axis` chooses the axis, 0 for rows and 1 for columns; `inplace` controls whether to modify the original data directly
59 | ``` Python
60 | # Drop the meaningless column
61 | trainData.drop("CaseId", axis=1, inplace=True)
62 | testData.drop("CaseId", axis=1, inplace=True)
63 | ```
64 | Take a quick look at the data
65 | - ~head(): shows the first 5 rows by default; you can ask for more, e.g. .head(50) shows the first 50 rows
66 | ``` Python
67 | trainData.head()
68 | ```
69 | 
70 | Show a brief summary of the data: how many non-null values each column has, and each column's data type.
71 | ``` Python
72 | trainData.info()
73 | ```
74 | 
75 | Get a quick feel for the data with plots
76 | - ~hist(): plots histograms; the `figsize` parameter sets the size of the output figure.
77 | - For more about plotting, see my earlier article, [一文教会你使用Matplotlib绘图](https://mp.weixin.qq.com/s?__biz=MzUzOTczMTQyOA==&mid=2247483654&idx=1&sn=39c1b07182e8dec43a3512626213a5e2&chksm=fac2b58dcdb53c9b6e6f392b4c493ba5eb98e0cf25bd8fe0b1aa9bbbc2c37644c9674a2e9d98#rd)
78 | ``` Python
79 | trainData.hist(figsize=(20, 20))
80 | ```
81 | 
82 | To see how the features relate to each other, compute the correlation matrix; you can then sort it with respect to a particular feature.
83 | ``` Python
84 | corr_matrix = trainData.corr()
85 | corr_matrix["Evaluation"].sort_values(ascending=False) # ascending=False sorts in descending order
86 | ```
87 | 
88 | Separate the labels from the training set
89 | ``` Python
90 | y = trainData['Evaluation']
91 | trainData.drop("Evaluation", axis=1, inplace=True)
92 | ```
93 | Train a model with K-Means
94 | - KMeans(): `n_clusters` is the number of clusters to predict; `init` is the center initialization method, which defaults to `k-means++` rather than the classic random-sampling initialization (you can set it to `random` if you want random initialization); `n_jobs` sets the number of CPU cores to use, with -1 meaning all of them.
95 | ``` Python
96 | from sklearn.cluster import KMeans
97 | est = KMeans(n_clusters=2, init="k-means++", n_jobs=-1)
98 | est.fit(trainData, y)
99 | y_pred = est.predict(testData)
100 | ```
101 | Save the predictions
102 | ``` Python
103 | submitData['Evaluation'] = y_pred
104 | submitData.to_csv("submit_data.csv", index=False)
105 | ```
106 | You can now find this file in the working directory; submit it on the competition site to see your actual score.
107 |
108 | # Benchmark model: random forest
109 | The results you get with K-means may not be that great. On the official site the organizers provide two benchmark models, of which the random forest performs best. The code is below; feel free to try it yourself.
110 | ``` Python
111 | # -*- coding: utf-8 -*-
112 | import pandas as pd
113 | from sklearn.ensemble import RandomForestClassifier
114 |
115 | # Read the data
116 | train = pd.read_csv("data/train.csv")
117 | test = pd.read_csv("data/test.csv")
118 | submit = pd.read_csv("data/sample_submit.csv")
119 |
120 | # Drop the id column
121 | train.drop('CaseId', axis=1, inplace=True)
122 | test.drop('CaseId', axis=1, inplace=True)
123 |
124 | # Pull out the training labels y
125 | y_train = train.pop('Evaluation')
126 |
127 | # Build the random forest model
128 | clf = RandomForestClassifier(n_estimators=100, random_state=0)
129 | clf.fit(train, y_train)
130 | y_pred = clf.predict_proba(test)[:, 1]
131 |
132 | # Write the predictions to my_RF_prediction.csv
133 | submit['Evaluation'] = y_pred
134 | submit.to_csv('my_RF_prediction.csv', index=False)
135 | ```
136 | # Summary
137 | K-means is one of the ten classic data mining algorithms, but in practice it is quite hard to get satisfying results with it alone. Later on we will cover ensemble learning, which turns weak learners into strong ones.
138 |
139 |
140 | > I will keep adding more data mining material to this project; if you are interested, a star is appreciated: https://github.com/wmpscc/DataMiningNotesAndPractice
141 |
142 |
--------------------------------------------------------------------------------
/9.Kaggle杂记.md:
--------------------------------------------------------------------------------
1 | # Feature processing and generation
2 | ## Numeric features
3 | ### scaling
4 | - Tree-based models do not depend on scaling; non-tree-based models are exactly the opposite
5 | - When two attributes differ by orders of magnitude, originally tiny distances become huge, which strongly affects KNN and linear models.
6 | - Gradient descent behaves badly without proper scaling; for this reason neural networks are similar to linear models as far as feature preprocessing is concerned.
7 | - Standardization does not change the shape of the distribution
8 | - After a MinMaxScaling or StandardScaling transform, features have roughly the same influence on non-tree models.
9 |
10 | ### Outliers
11 | - Outliers can occur both in the feature values X and in the target y, and they affect the model
12 | - We can clip feature values between a chosen lower and upper bound, for example between the 1st and 99th percentiles.
13 |
14 | ### rank
15 | Example:
16 | ```
17 | rank([-100, 0, 1e5]) => [0,1,2]
18 | rank([1000, 1, 10]) => [2,0,1]
19 | ```
20 | - This transformation can work better than MinMaxScaler, because ranking moves outliers closer to the other objects
21 | - If we have no time to handle outliers manually, linear models, KNN and neural networks can benefit from this transformation
22 | - Note that it also has to be applied to the test set; you can concatenate train and test and rank them together.
23 | - Available as scipy.stats.rankdata
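For example (note that `scipy.stats.rankdata` returns 1-based ranks, while the example above is written 0-based):

``` Python
from scipy.stats import rankdata
rankdata([-100, 0, 1e5])   # array([1., 2., 3.])
rankdata([1000, 1, 10])    # array([3., 1., 2.])
```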
24 |
25 | ### Transformations
26 | - Log transform: `np.log(1 + x)`
27 | - Raising to the power < 1: `np.sqrt(x + 2/3)`
28 | - Both transformations are useful because they pull large values closer to the feature's mean and make values near zero easier to distinguish.
29 | - Sometimes it helps to train a model on the concatenation of data frames produced by different preprocessings, or to train different models on differently preprocessed data and mix them.
30 | - This helps non-tree-based models, such as linear models, KNN and especially neural networks.
31 |
32 | ### Feature generation
33 | Feature generation means using knowledge about the features and the task to create new features, which makes the model simpler and more effective. Put simply, it is creating new features from prior knowledge, logical reasoning and intuition.
34 | - For house prices, once you know the area and the price you can create a 'price per square meter' feature
35 | - On the Forest Cover Type Prediction dataset, you can build several distance-to-water features for the current point
36 | - You can also extract the fractional part of a price, which separates different spending mentalities; it can even be used to spot machine-generated anomalies, e.g. a fractional part like 0.212895.. with a long tail of digits
37 |
38 | ## Categorical and ordinal features
39 | - The difference
40 | Categorical: no meaningful order, e.g. male, female
41 | Ordinal: the order carries meaning, e.g. primary school, middle school, university, which form an increasing sequence
42 |
43 | - Mapping labels to numbers
44 | - Alphabetical (sorted): [S,C,Q] -> [2,1,3] `sklearn.preprocessing.LabelEncoder`
45 | - Order of appearance: [S,C,Q] -> [1,2,3]
46 | `Pandas.factorize`
47 |
48 | - Replace each label with its frequency; this carries information about the value distribution and can be used with both tree models and linear models
49 | [S,C,Q] -> [0.5,0.3,0.2]
50 | ``` Python
51 | encoding = titanic.groupby('Embarked').size()
52 | encoding = encoding / len(titanic)
53 | titanic['enc'] = titanic.Embarked.map(encoding)
54 | ```
55 | - Label and frequency encoding are commonly used for tree models
56 | - One-hot encoding is commonly used for non-tree models
57 | - Interactions (concatenations) of categorical features can help linear models and KNN, as in the table and the small sketch below
58 |
59 | |pclass|sex|pclass_sex|
60 | |:-:|:-:|:-:|
61 | |3|male|3male|
62 | |1|female|1female|
63 | |3|female|3female|
64 | |1|female|1female|
65 |
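A small pandas sketch of this kind of concatenation feature (DataFrame and column names are illustrative):

``` Python
import pandas as pd
df = pd.DataFrame({"pclass": [3, 1, 3, 1], "sex": ["male", "female", "female", "female"]})
df["pclass_sex"] = df["pclass"].astype(str) + df["sex"]   # e.g. "3male", "1female"
```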
66 |
67 | ## Datetime and coordinates
68 | ### Datetime
69 | - Periodicity
70 | Day number within the week, month, season or year; second, minute, hour
71 | - Time since a row-independent/dependent event
72 | - Row-independent moment: e.g. since 00:00:00 UTC, 1 January 1970
73 | - Row-dependent important moment: days until the next holiday, or days since the last holiday
74 |
75 | |Data|week day| daynumber_since_year_2014|is_holiday|days_till_holidays|***sales***|
76 | |:-:|:-:|:-:|:-:|:-:|:-:|
77 | |01.01.14|5|0|True|0|1213|
78 | |02.01.14|6|1|False|3|938|
79 | |03.01.14|0|2|False|2|2448|
80 | |04.01.14|1|3|False|1|1744|
81 | |05.01.14|2|4|True|0|1732|
82 | |06.01.14|3|5|False|9|1022|
83 |
84 | - Difference between dates
85 | datetime_feature_1 - datetime_feature_2
86 |
87 | |user_id|registration_date|***last_purchase_date***|***last_call_date***|date_diff|churn|
88 | |:-:|:-:|:-:|:-:|:-:|:-:|
89 | |14|10.02.2016|21.04.2016|26.04.2016|5|0|
90 | |15|10.02.2016|03.06.2016|01.06.2016|-2|1|
91 | |16|11.02.2016|11.01.2017|11.01.2017|1|1|
92 | |20|12.02.2016|06.11.2016|08.02.2017|94|0|
93 |
94 | ### Coordinates
95 | - Interesting places from train/test data or additional data
96 |
使用各种距离构造特征
97 | - Centers of clusters
98 |
利用到聚类中心的距离
99 | - Aggregated statistic
100 |
计算周边对象的汇总统计信息
101 | - trick
102 |
利用决策树根据经纬度信息,将地区分为两个部分
103 |
104 | ## 处理缺失值
105 | - 根据具体情况决定填充缺失值
106 | - 通常用`-999`、`-1`、mean、median替换缺失值
107 | - Missing values already can be replaced with something by organizers.
108 | - 二值特征"isnull"也是有用的
109 | - 通常来说,在feature generation之前应该避免去填充nan
110 | - Xgboost可以处理NaN
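
一个最小示意(假设df的Age列含缺失值,列名仅作示例):
``` Python
df['Age_isnull'] = df['Age'].isnull().astype(int)  # 二值特征"isnull"
df['Age_filled'] = df['Age'].fillna(-999)          # 也可以改用均值或中位数填充
```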
111 |
112 |
113 | # 从文本和图像中提取特征
114 | ### Bag of words
115 | 为了统一尺度,我们对"Bag of words"的计数做归一化
116 | - Term frequency
117 | ``` Python
118 | tf = 1/x.sum(axis=1)[:, None]
119 | x = x*tf
120 | ```
121 | 为了提取重点,降低常出现词的频率,有了这个:
122 | - Inverse Document Frequency
123 | ``` Python
124 | idf = np.log(x.shape[0] / (x > 0).sum(0))
125 | x = x*idf
126 | ```
127 | 这可以在`sklearn.feature_extraction.text.TfidfVectorizer`找到。
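
一个最小示例(基于sklearn的TfidfVectorizer):
``` Python
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["the cat sat on the mat", "the dog sat on the log"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)  # 稀疏的TF-IDF矩阵
print(X.shape)
```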
128 |
129 |
130 | ### N-grams
131 | 可以将N-gram应用在句子上,以利用局部上下文信息。
132 | > sklearn.feature_extraction.text.CountVectorizer:
133 | > Ngram_range, analyzer
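
一个最小示例(基于sklearn的CountVectorizer):
``` Python
from sklearn.feature_extraction.text import CountVectorizer

texts = ["the cat sat", "the dog sat"]
vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='word')
X = vectorizer.fit_transform(texts)  # 同时包含unigram和bigram的计数
```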
134 |
135 |
136 | ## Pipeline of applying BOW(Conclusion)
137 | ### 1.Preprocessing:
138 | **Lowercase**: Very和very的区别
139 | **stemming**:democracy, democratic, and democratization -> democr
140 | **lemmatization**:democracy, democratic, and democratization -> democracy
141 | **stopwords**:一般在网上可以搜到这样的表
142 | > sklearn.feature_extraction.text.CountVectorizer:
143 | > max_df
144 |
145 | ### 2.Ngrams can help to use local context
146 |
147 | ### 3.Postprocessing:TFiDF
148 |
149 |
150 | ## BOW and w2v comparison
151 | - 1.Bag of words
152 | - a. Very large vectors
153 | - b. Meaning of each value in vector is known
154 | - 2.Word2vec
155 |     - a. Relatively small vectors
156 |     - b. Values in vector can be interpreted only in some cases
157 | - c. The words with similar meaning often have similar embedding
158 |
159 | ## Feature extraction from text and images
160 | - 1.**Texts**
161 | - a.Preprocessing
Lowercase, stemming, lemmatization, stopwords
162 | - b.Bag of words
1.Huge vectors
2.Ngrams can help to use local context
3.TFiDF can be of use as postprocessing
163 | - c.Word2vec
1.Relatively small vectors
2.Pretrained models
164 |
165 |
166 | - 2.**Image**
167 | - a.Features can be extracted from different layers
168 | - b.Careful choosing of pretrained network can help
169 | - c.Finetuing allows to refine pretrained models
170 | - d.Data augmentation can improve the model.
171 |
172 |
173 | # 探索数据分析
174 | ## Building intuition about the data
175 | - **Get domain knowledge**
176 | - It helps to deeper understand the problem
177 | - **Check if the data is intuitive**
178 | - And agrees with domain knowledge
179 | - **Understand how the data was generated**
180 | - As it is crucial to set up a proper validation(很可能训练集数据分布与测试集不同,导致验证集错误)
181 |
182 | ## Exploring anonymized data
183 | - **Try to decode the features**
184 | - Guess the true meaning of the feature
185 | - **Guess the feature types**
186 | - Each type needs its own preprocessing
187 |
188 |
189 | # Mean encodings
190 | ## Using target to generate features
191 | - **example**
192 |
193 | ||feature|feature_label|feature_mean|target|
194 | |:-:|:-:|:-:|:-:|:-:|
195 | |0|Moscow|1|0.4|0|
196 | |1|Moscow|1|0.4|1|
197 | |2|Moscow|1|0.4|1|
198 | |3|Moscow|1|0.4|0|
199 | |4|Moscow|1|0.4|0|
200 | |5|Tver|2|0.8|1|
201 | |6|Tver|2|0.8|1|
202 | |7|Tver|2|0.8|1|
203 | |8|Tver|2|0.8|0|
204 | |9|Klin|0|0.0|0|
205 | |10|Klin|0|0.0|0|
206 |
207 | - `feature_mean`=mean(target)
208 | > 这个操作可以让本来无序的类别标签变得有序。
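
一个最小示意(假设df含类别列feature和二分类目标列target,仅作示例;实际使用时应在交叉验证内计算以避免目标泄露):
``` Python
means = df.groupby('feature')['target'].mean()
df['feature_mean'] = df['feature'].map(means)
```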
209 |
--------------------------------------------------------------------------------
/4.NumPy学习笔记.md:
--------------------------------------------------------------------------------
1 | 本文是我在学习过程中收集的numpy方法,并做了记录。
2 |
3 | ## NumPy 方法
4 | ### np.ceil(x) 向上取整
5 | - x 输入的数据
6 | - 返回不小于各元素的最小整数(以浮点数表示)
7 | ``` Python
8 | housing["income_cat"] = np.ceil(housing["median_income"] / 1.5) # 每个元素先除以1.5,再向上取整
9 | ```
10 |
11 | ### permutation(x) 随机生成一个排列,或返回一个打乱顺序的range
12 | 如果x是一个多维数组,则只会沿着它的第一个索引进行混洗。
13 | ``` Python
14 | import numpy as np
15 |
16 | shuffle_index = np.random.permutation(60000)
17 | X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
18 | ```
19 | ### numpy.argmax() 返回沿轴的最大值的索引
20 | 返回沿轴的最大值的索引。
21 | ``` Python
22 | # some_digit_scores 内容
23 | # array([[-311402.62954431, -363517.28355739, -446449.5306454 ,
24 | # -183226.61023518, -414337.15339485, 161855.74572176,
25 | # -452576.39616343, -471957.14962573, -518542.33997148,
26 | # -536774.63961222]])
27 | np.argmax(some_digit_scores)
28 | # Out
29 | # 5
30 | ```
31 | - a : array_like; 输入数组
32 | - axis : int, optional; 默认情况下,索引是放在平面数组中,否则沿着指定的轴。
33 | - out : array, optional; 如果提供,结果将被插入到这个数组中。它应该是适当的形状和dtype。
34 |
35 |
36 | ### np.linalg.inv() 计算矩阵的逆
37 | ``` Python
38 | X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance
39 | theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
40 | ```
41 | - a : (..., M, M) array_like;被求逆的矩阵
42 |
43 | ### numpy.dot(a, b, out=None) 计算两个数组的点积
44 | ``` Python
45 | >>> np.dot(3, 4)
46 | 12
47 |
48 | # Neither argument is complex-conjugated:
49 | >>> np.dot([2j, 3j], [2j, 3j])
50 | (-13+0j)
51 |
52 | # For 2-D arrays it is the matrix product:
53 | >>> a = [[1, 0], [0, 1]]
54 | >>> b = [[4, 1], [2, 2]]
55 | >>> np.dot(a, b)
56 | array([[4, 1],
57 | [2, 2]])
58 |
59 | >>> a = np.arange(3*4*5*6).reshape((3,4,5,6))
60 | >>> b = np.arange(3*4*5*6)[::-1].reshape((5,4,6,3))
61 | >>> np.dot(a, b)[2,3,2,1,2,2]
62 | 499128
63 | >>> sum(a[2,3,2,:] * b[1,2,:,2])
64 | 499128
65 | ```
66 | - a : array_like;First argument.
67 | - b : array_like;Second argument.
68 |
69 | ### numpy.ndarray.T() 计算矩阵的转置
70 | 与`self.transpose()`相同,如果`self.ndim < 2`则返回它自身。
71 | ``` Python
72 | >>> x = np.array([[1.,2.],[3.,4.]])
73 | >>> x
74 | array([[ 1., 2.],
75 | [ 3., 4.]])
76 | >>> x.T
77 | array([[ 1., 3.],
78 | [ 2., 4.]])
79 | >>> x = np.array([1.,2.,3.,4.])
80 | >>> x
81 | array([ 1., 2., 3., 4.])
82 | >>> x.T
83 | array([ 1., 2., 3., 4.])
84 | ```
85 |
86 | ### numpy.random.seed() 生成器种子
87 | 该方法由` RandomState`初始化,它可以被重新设置。
88 | ``` Python
89 | np.random.seed(42)
90 | theta = np.random.randn(2,1) # random initialization
91 | ```
92 | - seed : int or array_like, optional;必须为32位无符号整数。
93 |
94 |
95 | ### numpy.random.randn() 从标准正态分布返回样本
96 | ``` Python
97 | >>> theta = np.random.randn(2,1)
98 | array([[ 4.21509616],
99 | [ 2.77011339]])
100 | ```
101 | ###### 参数
102 | - d0, d1, ..., dn : int, optional;返回的数组维度,应该都是正值。如果没有给出,将返回一个Python float值。
103 |
104 | ###### 返回值
105 | - Z : ndarray or float;一个经过标准正态分布抽样的,`(d0, d1, ..., dn)`维度的浮点数组。
106 |
107 | ### numpy.array() 创建一个数组
108 | ``` Python
109 | theta_path_bgd = np.array(theta_path_bgd)
110 | theta_path_sgd = np.array(theta_path_sgd)
111 | theta_path_mgd = np.array(theta_path_mgd)
112 | ```
113 | - object : array_like
114 | - dtype : data-type, optional
115 |
116 | ### numpy.random.rand() 生成给定shape的[0,1)均匀分布随机值
117 | ``` Python
118 | m = 100
119 | X = 6 * np.random.rand(m, 1) - 3
120 | y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)
121 | ```
122 | ``` Python
123 | >>> np.random.rand(3,2)
124 | array([[ 0.14022471, 0.96360618], #random
125 | [ 0.37601032, 0.25528411], #random
126 | [ 0.49313049, 0.94909878]]) #random
127 | ```
128 | - d0, d1, ..., dn : int, optional;返回的数组维度,必须是正值。如果为空,则返回一个Python float值。
129 |
130 | ### numpy.linspace() 在指定区间返回间隔均匀的样本[start, stop]
131 | ``` Python
132 | X_new=np.linspace(-3, 3, 100).reshape(100, 1)
133 | X_new_poly = poly_features.transform(X_new)
134 | y_new = lin_reg.predict(X_new_poly)
135 | plt.plot(X, y, "b.")
136 | plt.plot(X_new, y_new, "r-", linewidth=2, label="Predictions")
137 | plt.xlabel("$x_1$", fontsize=18)
138 | plt.ylabel("$y$", rotation=0, fontsize=18)
139 | plt.legend(loc="upper left", fontsize=14)
140 | plt.axis([-3, 3, 0, 10])
141 | save_fig("quadratic_predictions_plot")
142 | plt.show()
143 | ```
144 | - start : scalar;序列的起始值
145 | - stop : scalar;序列的结束值
146 | - num : int, optional;要生成的样本数量,默认为50个。
147 | - endpoint : bool, optional;若为True则包括结束值,否则不包括结束值,即[start, stop)区间。默认为True。
148 | - dtype : dtype, optional;输出数组的类型,若未给出则从输入数据推断类型。
149 |
150 | ### meshgrid() 从坐标向量返回坐标矩阵
151 | ``` Python
152 | >>> nx, ny = (3, 2)
153 | >>> x = np.linspace(0, 1, nx)
154 | >>> y = np.linspace(0, 1, ny)
155 | >>> xv, yv = np.meshgrid(x, y)
156 | >>> xv
157 | array([[ 0. , 0.5, 1. ],
158 | [ 0. , 0.5, 1. ]])
159 | >>> yv
160 | array([[ 0., 0., 0.],
161 | [ 1., 1., 1.]])
162 | >>> xv, yv = np.meshgrid(x, y, sparse=True) # make sparse output arrays
163 | >>> xv
164 | array([[ 0. , 0.5, 1. ]])
165 | >>> yv
166 | array([[ 0.],
167 | [ 1.]])
168 | ```
169 | - x1, x2,..., xn : array_like;代表网格坐标的一维数组。
170 | - indexing : {‘xy’, ‘ij’}, optional;输出的笛卡儿('xy',默认)或矩阵('ij')索引。
171 | - sparse : bool, optional;如果为True则返回稀疏矩阵以减少内存,默认为False。
172 | ### norm() 矩阵或向量范数
173 | ``` Python
174 | t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5
175 |
176 | # ignoring bias term
177 | t1s = np.linspace(t1a, t1b, 500)
178 | t2s = np.linspace(t2a, t2b, 500)
179 | t1, t2 = np.meshgrid(t1s, t2s)
180 | T = np.c_[t1.ravel(), t2.ravel()]
181 | Xr = np.array([[-1, 1], [-0.3, -1], [1, 0.1]])
182 | yr = 2 * Xr[:, :1] + 0.5 * Xr[:, 1:]
183 |
184 | J = (1/len(Xr) * np.sum((T.dot(Xr.T) - yr.T)**2, axis=1)).reshape(t1.shape)
185 |
186 | N1 = np.linalg.norm(T, ord=1, axis=1).reshape(t1.shape)
187 | N2 = np.linalg.norm(T, ord=2, axis=1).reshape(t1.shape)
188 |
189 | t_min_idx = np.unravel_index(np.argmin(J), J.shape)
190 | t1_min, t2_min = t1[t_min_idx], t2[t_min_idx]
191 |
192 | t_init = np.array([[0.25], [-1]])
193 | ```
194 | - x : array_like;输入的数组,如果`axis`是None,则`x`必须是1-D或2-D。
195 | - ord : {non-zero int, inf, -inf, ‘fro’, ‘nuc’}, optional;范数的顺序,inf表示numpy的inf对象。
196 | - axis : {int, 2-tuple of ints, None}, optional
197 | - keepdims : bool, optional
198 |
199 | 以下范数可以被计算:
200 | | ord | norm for matrices | norm for vectors|
201 | |--|--|--|
202 | |None | Frobenius norm | 2-norm|
203 | |‘fro’ | Frobenius norm | –|
204 | |‘nuc’ | nuclear norm | –|
205 | |inf | max(sum(abs(x), axis=1)) | max(abs(x))|
206 | |-inf | min(sum(abs(x), axis=1)) | min(abs(x))|
207 | |0 | – | sum(x != 0)|
208 | |1 | max(sum(abs(x), axis=0)) | as below|
209 | |-1 | min(sum(abs(x), axis=0)) | as below|
210 | |2 | 2-norm (largest sing. value) | as below|
211 | |-2 | smallest singular value | as below|
212 | |other | – | sum(abs(x)**ord)**(1./ord)|
213 |
214 | 对于`ord <= 0`的值,严格来说它并不是数学意义上的范数,但在数值计算中仍然有用。
215 | ### unravel_index() 将平面索引或平面索引数组转换为坐标数组的元组
216 | ``` Python
217 | >>> np.unravel_index([22, 41, 37], (7,6))
218 | (array([3, 6, 6]), array([4, 5, 1]))
219 | >>> np.unravel_index([31, 41, 13], (7,6), order='F')
220 | (array([3, 6, 6]), array([4, 5, 1]))
221 |
222 | >>> np.unravel_index(1621, (6,7,8,9))
223 | (3, 1, 4, 1)
224 | ```
225 | - indices : array_like;一个整数数组,其元素是索引到维数组dims的平坦版本中。
226 | - dims : tuple of ints;用于分解索引的数组的形状。
227 | - order : {‘C’, ‘F’}, optional;决定`indices`应该按row-major (C-style) or column-major (Fortran-style) 顺序。
228 |
229 | ### mean() 计算沿指定轴的算术平均值
230 | ``` Python
231 | >>> a = np.array([[1, 2], [3, 4]])
232 | >>> np.mean(a)
233 | 2.5
234 | >>> np.mean(a, axis=0)
235 | array([ 2., 3.])
236 | >>> np.mean(a, axis=1)
237 | array([ 1.5, 3.5])
238 |
239 |
240 | >>> a = np.zeros((2, 512*512), dtype=np.float32)
241 | >>> a[0, :] = 1.0
242 | >>> a[1, :] = 0.1
243 | >>> np.mean(a)
244 | 0.54999924
245 | ```
246 | - a : array_like;包含要求平均值的数组,如果不是数组,则尝试进行转换。
247 | - axis : None or int or tuple of ints, optional;计算平均值的轴,默认计算扁平数组。
248 | - dtype : data-type, optional;用于计算平均值的类型。
249 | - out : ndarray, optional
--------------------------------------------------------------------------------
/高级特征工程/Tips and tricks.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tips and tricks"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Practical guide\n",
15 | "经验之谈,这将节省你大量时间。"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "### Before you enter a competition\n",
23 | "当我们想参加比赛时,先确定你的目标和试着估计你的参与可以得到什么。\n",
24 | "- 你可能想了解有关这个问题的更多信息\n",
25 | "- 你可能希望熟悉新的软件工具包\n",
26 | "- 你可能想要尝试去获取奖牌\n",
27 | "\n",
28 | "**这些目标中的每一个都将影响您选择参加的竞赛。**
\n",
29 | "如果您想了解更多有趣的问题,您可能希望在论坛上进行广泛讨论。例如,如果您对数据科学,医学应用感兴趣, 您可以尝试在2017年数据科学碗中预测肺癌。
\n",
30 | "如果您想熟悉新的软件工具,您可能希望比赛有必要的教程。 例如,如果您想学习神经网络库。您可以选择任何具有自然保护特征,有监督学习的图像比赛。
\n",
31 | "如果你想尝试得到奖牌,您可能想要检查排名靠前的参与者有多少次提交。如果人们有上百次提交,这可能是问题设置有缺陷或验证困难的明显信号。如果排在前列的提交很少,这通常意味着应该存在一种非平凡的方法,或者只有少数人发现了它。除此之外,你还需要关注排在前列团队的规模,如果大多是单人参赛,那么你若能聚集一支优秀的团队,胜算会大一些。"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "### After you enter a competition:"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "- **1.组织你的想法**\n",
46 | "\n",
47 | "一旦熟悉了数据,开始写下你以后想要尝试的方法。什么东西可以在这里工作?你可能采取什么方法。
\n",
48 | "完成后,去论坛阅读帖子和话题高相关度的内容。强烈建议你参与论坛上的讨论,您可以在论坛上获得大量信息并结识新朋友。"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "- **2.挑选一些重要的、有前途的想法**\n",
56 | "\n",
57 | "在初始管道准备好之后,你提出了一些想法,你可能想要开始改进你的解决方案
\n",
58 | "您可能希望将想法排序,将最重要和最有希望的首先实施。或者可以将它们归类到各自主题中,例如特征生成、验证、度量优化等。
"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "- **3.尝试去理解为什么它会或不会工作**\n",
66 | "\n",
67 | "现在选择一个想法并实施它,尝试在过程中获得一些洞见。特别是,试着理解为什么某些东西有效或无效。从某种意义上讲,分析工作的能力和在你尝试自己的想法时得出的结论将使您走上正确的轨道,去揭示隐藏的数据模式和泄露。"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "### After you enter a competition:Everything is a hyperparameter"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "**我喜欢按照这些原则对所有参数进行排序:**\n",
82 | "- 1.重要性
将参数从重要到不重要排序,这些可能取决于数据结构、目标、度量等等\n",
83 | "- 2.可行性
例如学习率这类参数改起来很容易,但把它调好可能要花非常长的时间\n",
84 | "- 3.理解
评价参数,我知道它在做什么或不知道。这里重要的是要了解每个参数在整个管道中的变化。\n",
85 | "\n",
86 | "注意:改变一个参数可能影响整个管道"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "### Data loading\n",
94 | "一开始就对资源做一些优化,将节省大量时间和计算资源。"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "- **从经典的处理方法开始然后将csv/txt文件转为hdf5/npy文件以更快的加载**\n",
102 | "\n",
103 | "我通常从基本数据预处理开始,如标签,编码,标签恢复,使用额外的数据。然后,我将结果数据转储为HDF5或npy格式。\n",
104 | "HDF5适合存储Pandas的DataFrame,npy适合存储NumPy数组。运行试验通常需要大量重启内核,这将导致重新加载所有数据,加载csv文件可能需要几分钟,而从HDF5或npy格式加载数据只需几秒钟即可完成。\n",
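"\n",
"一个最小示意(假设原始数据为train.csv,文件名仅作示例;to_hdf需要安装PyTables):\n",
"``` Python\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv('train.csv')          # 首次加载可能需要几分钟\n",
"df.to_hdf('train.h5', key='df')        # 转储为HDF5\n",
"df = pd.read_hdf('train.h5', key='df') # 之后重启内核时加载只需几秒\n",
"```"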
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "- **将64位阵列转为32位,节省两倍内存**\n",
112 | "\n",
113 | "默认情况下,Pandas以64位阵列存储数据,这在大多数情况下是不必要的。将所有内容向下转换为32位将节省两倍的内存"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "- **大数据集可以被分块处理**\n",
121 | "\n",
122 | "Pandas支持分块(chunk)读取数据。因此,大多数数据集可以在没有大量内存的情况下处理。例如,您可以简单地对训练集做一次拆分来验证你的模型,而不是使用完整数据做交叉验证。"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {},
128 | "source": [
129 | "### Performance evaluation\n",
130 | "- **并不总是需要进行广泛的验证**\n",
131 | "\n",
132 | "- **从最快的模型开始-LightGBM**\n",
133 | "\n",
134 | "我通常从LightGBM开始,找到一些相当不错的参数,并评估我的特征的性能。
不要从SVM、随机森林、神经网络开始,你会浪费太多时间等待它们的训练。只有当我对特征工程感到满意时,才会转向去调整模型、采样和stacking。\n",
135 | "\n"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "### Fast and dirty always better\n",
143 | "在某些方面,我将我的方法描述为“fast and dirty”,总是更好。"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "- **不要过于关注代码质量**\n",
151 | "\n",
152 | "关注真正重要的事——数据。做数据探索,尝试不同的特征。谷歌特定领域的知识。您的代码是次要的。创建不必要的类和个人框架可能只会让事情更难以改变,这会导致浪费你的时间,所以要保持简单合理。"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "- **把事情简单化:只保留重要的东西**\n",
160 | "\n",
161 | "不要跟踪每个小变化"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "- **如果你对计算资源感到不爽-那就去租一台服务器**\n",
169 | "\n",
170 | " 最后,如果您对给定的计算资源感到非常不舒服,不要挣扎数周,只需租一台更大的服务器。 "
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "### Initial pipeline"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "- **从简单甚至原始的解决方案开始**\n",
185 | "\n",
186 | "- **用完整的管道调试**\n",
187 | "\n",
188 | "这种解决方案的主要目的不是建立一个好的模型,而是从数据的最开始到我们将提交文件写成确定的格式,建立完整的调试管道。我建议你从构建初始管道开始,通常你可以在kernel中找到组织者提供的baseline解决方案。我建议你仔细阅读并自己写。\n",
189 | "\n",
190 | "- **“从简单到复杂”**\n",
191 | "\n",
192 | "另外,我建议你在其他方面也遵循从简单到复杂的方法。例如在初始阶段我更喜欢随机森林而不是GBDT,至少随机森林运行得非常快,几乎不需要调整超参数。"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "### Best Practices from Software Development\n",
200 | "- **使用好的变量名**\n",
201 | "\n",
202 | "无论你多么聪明,如果你的变量名起的不好,你肯定会对它感到困惑,这是迟早会发生的。\n",
203 | "\n",
204 | "- **让你的研究可重复**\n",
205 | " - 固定所有随机种子\n",
206 | " - 准确记下功能的生成方式\n",
207 | " - 将代码存储在版本控制系统下,例如git。
很多时候,你需要回退你的模型到两星期前做模型集成。\n",
208 | "\n",
209 | "- **复用代码**\n",
210 | "\n",
211 | "在训练和测试阶段使用相同的代码非常重要。例如,为了保证它们以一致的方式训练,应该使用相同的代码准备和转换特征。这地方一般很难察觉到,所以最好小心点。我建议将可重用代码移动到单独的函数中,甚至是单独的模块。"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "### Read papers\n",
219 | "- 可以从中获取与ML相关的思路和技巧\n",
220 | " - 例如,如何去优化AUC\n",
221 | "- 便于熟悉相关领域问题\n",
222 | " - 尤其对特征生成有用"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "### My pipeline\n",
230 | "- **Read forums and examine kernels first**\n",
231 | " - There are always discussions happening!\n",
232 | " \n",
233 | "- **Start with EDA and a baseline**\n",
234 | " - To make sure the data is loaded correctly\n",
235 | " - To check if validation is stable\n",
236 | "\n",
237 | "- **I add features in bulks**\n",
238 | " - At start I create all the features I can make up\n",
239 | " - I evaluate many features at once(not \"add one and evaluate\")\n",
240 | "\n",
241 | "- **Hyperparameters optimization**\n",
242 | " - First find the parameters to overfit train dataset\n",
243 | " - And then try to trim model"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "### Code organization:keeping it clean\n",
251 | "- **Very important to have reproducible results!**\n",
252 | " - Keep important code clean\n",
253 | "\n",
254 | "- **Long execution history leads to mistakes**\n",
255 | "\n",
256 | "- **Your notebooks can become a total mess**\n",
257 | "``` Python\n",
258 | "s = qq.sum(1)\n",
259 | "ss = s[:,3]/qq.var()\n",
260 | "sss = ss[0]\n",
261 | "```\n",
262 | "注意代码质量\n",
263 | "\n",
264 | "- **One notebook per submission(and use git)**\n",
265 | "\n",
266 | "- **Before creating a submission restart the kernel** \n",
267 | " - Use \"Restart and run all\" button\n",
268 | " "
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {
275 | "collapsed": true
276 | },
277 | "outputs": [],
278 | "source": []
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {
284 | "collapsed": true
285 | },
286 | "outputs": [],
287 | "source": []
288 | }
289 | ],
290 | "metadata": {
291 | "kernelspec": {
292 | "display_name": "Python 3",
293 | "language": "python",
294 | "name": "python3"
295 | },
296 | "language_info": {
297 | "codemirror_mode": {
298 | "name": "ipython",
299 | "version": 3
300 | },
301 | "file_extension": ".py",
302 | "mimetype": "text/x-python",
303 | "name": "python",
304 | "nbconvert_exporter": "python",
305 | "pygments_lexer": "ipython3",
306 | "version": "3.5.4"
307 | }
308 | },
309 | "nbformat": 4,
310 | "nbformat_minor": 2
311 | }
312 |
--------------------------------------------------------------------------------
/实战篇/3.电力窃漏电用户自动识别/MODEL1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import os\n",
12 | "import numpy as np\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "import pandas as pd\n",
15 | "import seaborn as sns\n",
16 | "%matplotlib inline"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 3,
22 | "metadata": {
23 | "collapsed": true
24 | },
25 | "outputs": [],
26 | "source": [
27 | "data = pd.read_excel('data/model.xls', header=None)"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 8,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "name": "stderr",
37 | "output_type": "stream",
38 | "text": [
39 | "/home/heolis/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel_launcher.py:20: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(input_dim=3, units=10)`\n",
40 | "/home/heolis/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel_launcher.py:22: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(input_dim=10, units=1)`\n",
41 | "/home/heolis/anaconda3/envs/tensorflow/lib/python3.5/site-packages/ipykernel_launcher.py:26: UserWarning: The `nb_epoch` argument in `fit` has been renamed `epochs`.\n"
42 | ]
43 | },
44 | {
45 | "ename": "ValueError",
46 | "evalue": "('Some keys in session_kwargs are not supported at this time: %s', dict_keys(['class_mode']))",
47 | "output_type": "error",
48 | "traceback": [
49 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
50 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
51 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'binary_crossentropy'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimizer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'adam'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclass_mode\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"binary\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#编译模型,使用adam方法求解\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnb_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#训练模型,循环1000次\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_weights\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnetfile\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#保存模型\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
52 | "\u001b[0;32m~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[1;32m 1006\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1007\u001b[0m \u001b[0mins\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0msample_weights\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1008\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_train_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1009\u001b[0m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1010\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
53 | "\u001b[0;32m~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36m_make_train_function\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 506\u001b[0m \u001b[0mupdates\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mupdates\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'train_function'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m **self._function_kwargs)\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_test_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
54 | "\u001b[0;32m~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py\u001b[0m in \u001b[0;36mfunction\u001b[0;34m(inputs, outputs, updates, **kwargs)\u001b[0m\n\u001b[1;32m 2693\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Invalid argument \"%s\" passed to K.function with TensorFlow backend'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2694\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2695\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mFunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mupdates\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mupdates\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2697\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
55 | "\u001b[0;32m~/anaconda3/envs/tensorflow/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, inputs, outputs, updates, name, **session_kwargs)\u001b[0m\n\u001b[1;32m 2540\u001b[0m raise ValueError('Some keys in session_kwargs are not '\n\u001b[1;32m 2541\u001b[0m \u001b[0;34m'supported at this '\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2542\u001b[0;31m 'time: %s', session_kwargs.keys())\n\u001b[0m\u001b[1;32m 2543\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_callable_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2544\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_feed_arrays\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
56 | "\u001b[0;31mValueError\u001b[0m: ('Some keys in session_kwargs are not supported at this time: %s', dict_keys(['class_mode']))"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "import pandas as pd\n",
62 |     "import numpy as np\n",
63 | "\n",
64 | "datafile = 'data/model.xls'\n",
65 | "data = pd.read_excel(datafile)\n",
66 | "data = data.as_matrix()\n",
67 |     "np.random.shuffle(data) # random.shuffle作用于二维numpy数组会产生重复行,应使用np.random.shuffle\n",
68 | "\n",
69 | "p = 0.8 #设置训练数据比例\n",
70 | "train = data[:int(len(data)*p),:]\n",
71 | "test = data[int(len(data)*p):,:]\n",
72 | "\n",
73 | "#构建LM神经网络模型\n",
74 | "from keras.models import Sequential #导入神经网络初始化函数\n",
75 | "from keras.layers.core import Dense, Activation #导入神经网络层函数、激活函数\n",
76 | "\n",
77 | "netfile = 'tmp/net.model' #构建的神经网络模型存储路径\n",
78 | "\n",
79 | "net = Sequential() #建立神经网络\n",
80 |     "net.add(Dense(input_dim=3, units=10)) #添加输入层(3节点)到隐藏层(10节点)的连接\n",
81 | "net.add(Activation('relu')) #隐藏层使用relu激活函数\n",
82 |     "net.add(Dense(input_dim=10, units=1)) #添加隐藏层(10节点)到输出层(1节点)的连接\n",
83 | "net.add(Activation('sigmoid')) #输出层使用sigmoid激活函数\n",
84 |     "net.compile(loss = 'binary_crossentropy', optimizer = 'adam') #编译模型,使用adam方法求解;Keras 2不再支持class_mode参数\n",
85 | "\n",
86 |     "net.fit(train[:,:3], train[:,3], epochs=1000, batch_size=1) #训练模型,循环1000次\n",
87 | "net.save_weights(netfile) #保存模型\n",
88 | "\n",
89 | "predict_result = net.predict_classes(train[:,:3]).reshape(len(train)) #预测结果变形\n",
90 | "'''这里要提醒的是,keras用predict给出预测概率,predict_classes才是给出预测类别,而且两者的预测结果都是n x 1维数组,而不是通常的 1 x n'''\n",
91 | "\n",
92 | "from cm_plot import * #导入自行编写的混淆矩阵可视化函数\n",
93 | "cm_plot(train[:,3], predict_result).show() #显示混淆矩阵可视化结果\n",
94 | "\n",
95 | "from sklearn.metrics import roc_curve #导入ROC曲线函数\n",
96 | "\n",
97 | "predict_result = net.predict(test[:,:3]).reshape(len(test))\n",
98 | "fpr, tpr, thresholds = roc_curve(test[:,3], predict_result, pos_label=1)\n",
99 | "plt.plot(fpr, tpr, linewidth=2, label = 'ROC of LM') #作出ROC曲线\n",
100 | "plt.xlabel('False Positive Rate') #坐标轴标签\n",
101 | "plt.ylabel('True Positive Rate') #坐标轴标签\n",
102 | "plt.ylim(0,1.05) #边界范围\n",
103 | "plt.xlim(0,1.05) #边界范围\n",
104 | "plt.legend(loc=4) #图例\n",
105 | "plt.show() #显示作图结果"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {
112 | "collapsed": true
113 | },
114 | "outputs": [],
115 | "source": []
116 | }
117 | ],
118 | "metadata": {
119 | "kernelspec": {
120 | "display_name": "Python 3",
121 | "language": "python",
122 | "name": "python3"
123 | },
124 | "language_info": {
125 | "codemirror_mode": {
126 | "name": "ipython",
127 | "version": 3
128 | },
129 | "file_extension": ".py",
130 | "mimetype": "text/x-python",
131 | "name": "python",
132 | "nbconvert_exporter": "python",
133 | "pygments_lexer": "ipython3",
134 | "version": "3.5.4"
135 | }
136 | },
137 | "nbformat": 4,
138 | "nbformat_minor": 2
139 | }
140 |
--------------------------------------------------------------------------------
/高级特征工程/Emsembling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Examined ensemble methods\n",
8 | "- Averaging (or blending)\n",
9 | "- Weighted averaging\n",
10 | "- Conditional averaging\n",
11 | "- Bagging\n",
12 | "- Boosting\n",
13 | "- Stacking\n",
14 | "- StackNet"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Averaging ensemble methods"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "举个例子,假设我们有一个名为age的变量,就像年龄一样,我们试着预测它。我们有两个模型:\n",
29 | "- 低于50,模型效果更好\n",
30 | "\n",
31 | "- 高于50,模型效果更好\n",
32 | "\n",
33 | "\n",
34 | "那么如果我们试图结合它们将会发生什么呢?\n",
35 | "\n",
36 | "**Averaging(or blending)**
\n",
37 | "- **(model1 + model2) / 2**\n",
38 | "\n",
39 | "\n",
40 | "$R^2$上升到0.95,较之前有所改善。但该模型并没有比单模型做的好的地方更好,尽管如此,它平均表现更好。也许可能会有更好的组合呢?来试试加权平均\n",
41 | "\n",
42 | "**Weighted averaging**
\n",
43 | "- **(model1 x 0.7 + model 2 x 0.3)**\n",
44 | "\n",
45 | "看起来没有之前的好\n",
46 | "\n",
47 | "**Conditional averaging**
\n",
48 | "- **各取好的部分**\n",
49 | "\n",
50 | "理想情况下,我们希望得到类似的结果"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "## Bagging"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "### Why Bagging\n",
65 | "建模中有两个主要误差来源\n",
66 | "- 1.由于偏差而存在误差(underfitting)\n",
67 | "- 2.由于方差而存在误差(overfitting)\n",
68 | "\n",
69 | "通过对略微不同的模型取平均,确保预测不会有非常高的方差。这通常使模型更具泛化能力。"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "### Parameters that control bagging?\n",
77 | "- Changing the seed\n",
78 | "- Row(Sub) sampling or Bootstrapping\n",
79 | "- Shuffling\n",
80 | "- Column(Sub) sampling\n",
81 | "- Model-specific parameters\n",
82 | "- Number of models (or bags)\n",
83 | "- (Optionally) parallelism"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "### Examples of bagging\n",
91 | ""
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "## Boosting\n",
99 | "Boosting是一种对多个模型加权平均的集成形式,其中每个模型都是顺序构建的,并会考虑之前模型的表现。"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "### Weight based boosting\n",
107 | "\n",
108 | "假设我们有一个表格数据集,有四个特征。 我们称它们为x0,x1,x2和x3,我们希望使用这些功能来预测目标变量y。\n",
109 | "我们将预测值称为pred,这些预测有一定的误差。我们可以计算这些绝对误差,`|y - pred|`。我们可以基于此生成一个新列或向量,在这里我们创建一个权重列,使用1加上绝对误差。当然有不同的方法来计算这个权重,现在我们只是以此为例。\n",
110 | "\n",
111 | "所以接下来要做的,就是用这些特征去拟合新的模型,但每次都带上这个权重列。这就是按顺序添加模型的方法,下面给出一个示意。\n",
112 | "\n",
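"一个最小示意(假设X、y为numpy数组,权重的计算方式仅作示例):\n",
"``` Python\n",
"import numpy as np\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"model0 = DecisionTreeRegressor(max_depth=2).fit(X, y)\n",
"pred0 = model0.predict(X)\n",
"weights = 1 + np.abs(y - pred0)  # 权重 = 1 + 绝对误差\n",
"model1 = DecisionTreeRegressor(max_depth=2).fit(X, y, sample_weight=weights)\n",
"```\n",
"\n",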
113 | "#### Weight based boosting parameters\n",
114 | "- Learning rate (or shrinkage or eta)\n",
115 | " - 每个模型只相信一点点:`predictionN = pred0*eta + pred1*eta + ... + predN*eta`\n",
116 | "- Number of estimators\n",
117 | " - estimators扩大一倍,eta减小一倍\n",
118 | "- Input model - can be anything that accepts weights\n",
119 | "- Sub boosting type:\n",
120 | " - AdaBoost-Good implementation in sklearn(python)\n",
121 | " - LogitBoost-Good implementation in Weka(Java)"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "### Residual based boosting [&]\n",
129 | "我们使用同样的数据集做相同的事。预测出pred后\n",
130 | "\n",
131 | "接下来会计算误差\n",
132 | "\n",
133 | "将error作为新的y得到新的预测new_pred\n",
134 | "\n",
135 | "以Rownum=1为例:
\n",
136 | "最终预测=0.75 + 0.20 = 0.95更接近于1\n",
137 | "\n",
138 | "这种方法很有效,可以很好的减小误差。\n",
139 | "\n",
140 | "#### Residual based boosting parameters\n",
141 | "- Learning rate (or shrinkage or eta)\n",
142 | " - `predictionN = pred0 + pred1*eta + ... + predN*eta`\n",
143 | " - 前面的例子,如果eta为0.1,则Prediction=0.75 + 0.2*(0.1) = 0.77\n",
144 | "- Number of estimators\n",
145 | "- Row (sub)sampling\n",
146 | "- Column (sub)sampling\n",
147 | "- Input model - better be trees.\n",
148 | "- Sub boosting type:\n",
149 | " - Full gradient based\n",
150 | " - Dart\n",
151 | "\n",
152 | "#### Residual based favourite implementations\n",
153 | "- Xgboost\n",
154 | "- Lightgbm\n",
155 | "- H2O's GBM\n",
156 | "- Catboost\n",
157 | "- Sklearn's GBM"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "## Stacking"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "### Methodology\n",
172 | "- Wolpert in 1992 introduced stacking. It involves:\n",
173 | " - 1. **Splitting** the train set into two disjoint sets.\n",
174 | " - 2. **Train** several base learners on the first part.\n",
175 | " - 3. **Make predictions** with the base learners on the second (validation) part."
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "### 具体步骤\n",
183 | "假设有A,B,C三个数据集,其中A,B的目标变量y已知。\n",
184 | "\n",
185 | "然后\n",
186 | "- 算法0拟合A,预测B和C,然后保存pred0到B1,C1\n",
187 | "- 算法1拟合A,预测B和C,然后保存pred1到B1,C1\n",
188 | "- 算法2拟合A,预测B和C,然后保存pred2到B1,C1\n",
189 | "\n",
190 | "- 算法3拟合B1,预测C1,得到最终结果preds3"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "### Stacking example"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "``` Python\n",
205 | "from sklearn.ensemble import RandomForestRegressor\n",
206 | "from sklearn.linear_model import LinearRegression\n",
207 | "import numpy as np\n",
208 | "from sklearn.model_selection import train_test_split\n",
209 | "train = '' # your training set\n",
210 | "y = '' # your target variable\n",
"test = '' # your test set\n",
211 | "# split train data in 2 parts, training and validation.\n",
212 | "training, valid, ytraining, yvalid = train_test_split(train, y, test_size=0.5)\n",
213 | "# specify models\n",
214 | "model1 = RandomForestRegressor()\n",
215 | "model2 = LinearRegression()\n",
216 | "#fit models\n",
217 | "model1.fit(training, ytraining)\n",
218 | "model2.fit(training, ytraining)\n",
219 | "# make predictions for validation\n",
220 | "preds1 = model1.predict(valid)\n",
221 | "preds2 = model2.predict(valid)\n",
222 | "# make predictions for test data\n",
223 | "test_preds1 = model1.predict(test)\n",
224 | "test_preds2 = model2.predict(test)\n",
225 | "# Form a new dataset for valid and test via stacking the predictions\n",
226 | "stacked_predictions = np.column_stack((preds1, preds2))\n",
227 | "stacked_test_predictions = np.column_stack((test_preds1, test_preds2))\n",
228 | "# specify meta model\n",
229 | "meta_model = LinearRegression()\n",
230 | "meta_model.fit(stacked_predictions, yvalid)\n",
231 | "# make predictions on the stacked predictions of the test data\n",
232 | "final_predictions = meta_model.predict(stacked_test_predictions)\n",
233 | "```"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "### Stacking(past) example\n",
241 | "\n",
242 | "\n",
243 | "可以看到,它与我们使用`Conditional averaging`的结果非常近似。只是在50附近做得不够好,这是有道理的,因为元模型没有见到目标变量,无法准确识别出50这个分界点,所以它只能尝试根据基模型的输出来推断。"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "### Things to be mindful of\n",
251 | "- With time sensitive data - respect time \n",
252 | " - 如果你的数据带有时间元素,你需要指定你的stacking,以便尊重时间。\n",
253 | "- Diversity as important as performance\n",
254 | " - 单一模型表现很重要,但模型的多样性也非常重要。当模型是坏的或弱的情况,你不需太担心,stacking实际上可以从每个预测中提取到精华,得到好的结果。因此,你真正需要关注的是,我正在制作的模型能给我带来哪些信息,即使它通常很弱。\n",
255 | "- Diversity may come from:\n",
256 | " - Different algorithms\n",
257 | " - Different input features\n",
258 | "- Performance plateauing after N models\n",
259 | "- Meta model is normally modest"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "## StackNet\n",
267 | "https://github.com/kaz-Anova/StackNet"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "## Ensembling Tips and Tricks"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "### $1^{st}$ level tips\n",
282 | "- Diversity based on algorithms:\n",
283 | " - 2-3 gradient boosted trees (lightgbm, xgboost, H2O, catboost)\n",
284 | " - 2-3 Neural nets (keras, pytorch)\n",
285 | " - 1-2 ExtraTrees/RandomForest (sklearn)\n",
286 | " - 1-2 linear models as in logistic/ridge regression, linear svm (sklearn)\n",
287 | " - 1-2 knn models (sklearn)\n",
288 | " - 1 Factorization machine (libfm)\n",
289 | " - 1 svm with nonlinear kernel(like RBF) if size/memory allows (sklearn)\n",
290 | "- Diversity based on input data:\n",
291 | " - Categorical features: One hot, label encoding, target encoding, likelihood encoding, frequency or counts\n",
292 | " - Numerical features: outliers, binning, derivatives, percentiles, scaling\n",
293 | " - Interactions: col1*/+-col2, groupby, unsupervised"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "### $2^{nd}$ level tips\n",
301 | "- Simpler (or shallower) Algorithms:\n",
302 | " - gradient boosted trees with small depth(like 2 or 3)\n",
303 | " - Linear models with high regularization\n",
304 | " - Extra Trees (just don't make them too big)\n",
305 | " - Shallow networks (as in 1 hidden layer, with not that many hidden neurons)\n",
306 | " - knn with BrayCurtis Distance\n",
307 | " - Brute forcing a search for best linear weights based on cv\n",
308 | " \n",
309 | "- Feature engineering:\n",
310 | " - pairwise differences between meta features\n",
311 | " - row-wise statistics like averages or stds\n",
312 | " - Standard feature selection techniques\n",
313 | "- For every 7.5 models in previous level we add 1 in meta (经验)\n",
314 | "- Be mindful of target leakage"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "## Additional materials\n",
322 | "- [MLWave.com的Kaggle 集成指南(方法概述)](https://mlwave.com/kaggle-ensembling-guide/)\n",
323 | "- [StackNet - 一个计算,可扩展和分析的元建模框架(KazAnova)](https://github.com/kaz-Anova/StackNet)\n",
324 | "- [Heamy - 一套用于竞争数据科学(包括整合)的有用工具](https://github.com/rushter/heamy)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {
331 | "collapsed": true
332 | },
333 | "outputs": [],
334 | "source": []
335 | }
336 | ],
337 | "metadata": {
338 | "kernelspec": {
339 | "display_name": "Python 3",
340 | "language": "python",
341 | "name": "python3"
342 | },
343 | "language_info": {
344 | "codemirror_mode": {
345 | "name": "ipython",
346 | "version": 3
347 | },
348 | "file_extension": ".py",
349 | "mimetype": "text/x-python",
350 | "name": "python",
351 | "nbconvert_exporter": "python",
352 | "pygments_lexer": "ipython3",
353 | "version": "3.5.4"
354 | }
355 | },
356 | "nbformat": 4,
357 | "nbformat_minor": 2
358 | }
359 |
--------------------------------------------------------------------------------
/高级特征工程/Hyperparameter tuning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "以下是Coursera上的[How to Win a Data Science Competition: Learn from Top Kagglers](https://www.coursera.org/learn/competitive-data-science/home/week/3)课程笔记。"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Hyperparameter Optimization\n",
15 | "- List most important hyperparameters in major models; describe their impact\n",
16 | "- Understand the hyperparameter tuning process in general\n",
17 | "- Arrange hyperparameters by their importance"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Hyperparameter tuning I\n",
25 | "**Plan for the lecture**\n",
26 | "- Hyperparameter tuning in general\n",
27 | " - General pipeline\n",
28 | " - Manual and automatic tuning\n",
29 | " - What should we understand about hyperparameters?\n",
30 | "- Models,libraries and hyperparameter optimization\n",
31 | " - Tree-based models\n",
32 | " - Neural networks\n",
33 | " - Linear models\n",
34 | " \n",
35 | "**Plan for the lecture:models**\n",
36 | "- Tree-based models\n",
37 | " - GBDT: XGBoost, LightGBM, CatBoost\n",
38 | " - RandomForest/ExtraTrees\n",
39 | "- Neural nets\n",
40 | " - Pytorch, Tensorflow, Keras...\n",
41 | "- Linear models\n",
42 | " - SVM, logistic regression\n",
43 | " - Vowpal Wabbit, FTRL\n",
44 | "- *Factorization Machines(out of scope)*\n",
45 | " - libFM, libFFM"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "### How do we tune hyperparameters\n",
53 | "- 1.Select the most influential parameters\n",
54 | "    - a.There are tons of parameters and we can't tune all of them\n",
55 | "- 2.Understand,how exactly they influence the training\n",
56 | "- 3.Tune them\n",
57 | " - a.Manually(change and examine)\n",
58 | " - b.Automatically(hyperopt, etc)"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "- 1.\n",
66 | "无论如何,我们从来没有时间调整所有的参数,所以我们需要提出一个很好的子集来调整。假设我们是xgboost新手,不知道哪些参数是需要调的,可以在Github或Kaggle Kernels搜索到前人通常设置的参数。\n",
67 | "- 2.\n",
68 | "理解改变其中一个参数会发生什么。\n",
69 | "- 3.\n",
70 | "大多数人手动完成调参工作。也可以使用超参数优化工具,但手动执行通常会更快。\n"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "### Hyperparameter optimization software自动调参工具\n",
78 | "运行调参工具可能需要很长时间,因此最好的策略是在夜间运行它。\n",
79 | "- A lot of libraries to try:\n",
80 | " - Hyperopt\n",
81 | " - Scikit-optimize\n",
82 | " - Spearmint\n",
83 | " - GPyOpt\n",
84 | " - RoBO\n",
85 | " - SMAC3"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "从广义上讲,不同的参数会导致三种不同的结果\n",
93 | "- 1.Underfitting(bad)\n",
94 | "- 2.Good fit and generalization(good)\n",
95 | "- 3.Overfitting(bad)\n",
96 | "\n",
97 | "因此我们需要把想要调整的参数分为两组。第一组是约束模型的参数,第二组与第一组效果相反。\n",
98 | "- **A parameter in red**\n",
99 | " - Increasing it impedes fitting\n",
100 | " - Increase it to reduce overfitting\n",
101 | " - Decrease to allow model fit easier\n",
102 | "- **A parameter in green**\n",
103 | "    - Increasing it leads to a better fit(overfit) on train set\n",
104 | " - Increase it, if model underfits\n",
105 | " - Decrease if overfits\n",
106 | "\n",
107 | "*上面提到的颜色只是视频中的标记*"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {
113 | "collapsed": true
114 | },
115 | "source": [
116 | "## Hyperparameter tuning II\n",
117 | "一些基于树模型的超参数优化\n",
118 | "- Tree-based models\n",
119 | "\n",
120 | "|Model|Where|\n",
121 | "|:-:|:-:|\n",
122 | "|GBDT|XGBoost-[dmlc/xgboost](https://github.com/dmlc/xgboost)
LightGBM-[Microsoft/LightGBM](https://github.com/Microsoft/LightGBM)
CatBoost-[catboost/catboost](https://github.com/catboost/catboost)|\n",
123 | "|RandomForest/ExtraTrees|*scikit-learn*|\n",
124 | "|Others|RGF-[baidu/fast_rgf](https://github.com/baidu/fast_rgf)|"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "### GBDT\n",
132 | "\n",
133 | "|XGBoost|LightGBM|\n",
134 | "|:-:|:-:|\n",
135 | "|max_depth|max_depth/num_leaves|\n",
136 | "|subsample|bagging_fraction|\n",
137 | "|colsample_bytree,
colsample_bylevel|frature_fraction|\n",
138 | "|*`min_child_weight,`
`lambda,alpha`*|*`min_data_in_leaf,`
`lambda_l1,lambda_l2`*|\n",
139 | "|eta
num_round|learning_rate
num_iterations|\n",
140 | "|Others:
seed|Others:
*_seed|\n"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "- **max_depth**:
树越深,越能拟合数据集,但这可能会导致过拟合。根据任务的不同,最佳的最大深度可能差异很大,有时是2,有时是27。建议max_depth大约从7开始,逐渐增加到不过拟合的最大深度。需要注意的是,深度增加,训练时间也会更长。\n",
148 | " - **num_leaves**:
在LightGBM中,可以控制叶的数量,而不是最大深度。因为树可以很深,但如果叶子数量少就不会导致过拟合。\n",
149 | "- **subsample、bagging_fraction**:
这个参数可以控制每次喂给模型的数据量,取值在0,1之间。每次喂给它一小部分数据,可以让它不那么过拟合,并且可以得到更好的泛化效果,但是模型的训练会更慢。这有点像正则化的作用。\n",
150 | "- **colsample_bytree、colsample_bylevel**:
这个参数可以控制subsample中的分裂点。如果模型过拟合,可以尝试降低这些值。\n",
151 | "- **min_child_weight,lambda,alpha**:
正则化参数。\n",
152 | " - **min_child_weight**:
经验中,这是最重要的参数。增加它可以让模型更保守,减少它会让模型有更少约束。根据不同的任务,我发现最佳值为0,5,15,300,所以不要犹豫,尝试各种值,这取决于数据。\n",
153 | "- **eta、num_round**:eta本质上是一种学习权重,就像梯度下降一样。num_round是我们想要执行的学习步数,换句话说,是我们想要建多少棵树。每次迭代都会构建一个新树,以学习率eta添加到模型中。\n",
154 | " - 当我们找到合适的轮数时,可以做一个通常会提高分数的技巧。我们将num_round乘以α,将eta除以α,模型通常会变得更好。可能应用的参数也需要调整,但通常可以保留原样。\n",
155 | " \n",
156 | " **Other**\n",
157 | " - **seed**:
一般情况下随机种子对模型影响不大。但如果随机种子对你的结果影响非常大,建议你多次提交取平均,或者针对这种随机性调整你的验证方案。下面给出一个参数设置的示意。\n",
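"\n",
"一个最小示意(假设使用xgboost的sklearn接口,参数取值仅作起点示例,并非最优配置):\n",
"``` Python\n",
"from xgboost import XGBClassifier\n",
"\n",
"model = XGBClassifier(\n",
"    max_depth=7,            # 建议从7左右开始\n",
"    subsample=0.8,\n",
"    colsample_bytree=0.8,\n",
"    min_child_weight=5,     # 重要的正则化参数\n",
"    learning_rate=0.1,      # 即eta\n",
"    n_estimators=300,       # 即num_round\n",
"    random_state=0)         # 即seed\n",
"```"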
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "### sklearn.RandomForest/ExtraTrees"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "- **n_estimators**:
RandomForest构建每棵树时独立于其他树,这意味着拥有大量树的模型不会导致过拟合,这与Gradient Boosting相反。我们通常首先将n_estimators设置为非常小的数字,例如10,看看训练要花多少时间,如果不太长,就把它设为一个比较大的值,例如300。\n",
172 | "- **max_depth**:
控制树的深度,与XGBoost不同,它可以被设置为None,这对应于无限深度。当数据集中的特征具有重复值和重要交互时,它实际上非常有用。在其他情况下,无约束深度的模型会立即过拟合。建议随机森林的深度从7左右开始。通常随机森林的最佳深度高于Gradient Boosting,所以不要犹豫,尝试10、20或更高的值。\n",
173 | "- **max_features**:
与XGBoost中的参数相同。\n",
174 | "- **min_samples_leaf**:
是一个类似正则化的参数,作用与XGBoost的min_child_weight和LightGBM的min_data_in_leaf相同。\n",
175 | "\n",
176 | "**Other**\n",
177 | "- **criterion**:
根据我的经验,Gini更常见,但有时Entropy更好。\n",
178 | "- **random_state**:
随机种子参数\n",
179 | "- **n_jobs**:设置使用的核心数。默认情况下sklearn的RandomForest由于某种原因仅使用一个核心。"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "## Hyperparameter tuning III\n",
187 | "- Neural nets\n",
188 | " - Pytorch, Tensorflow, Keras...\n",
189 | "- Linear models\n",
190 | " - SVM, logistic regression\n",
191 | " - Vowpal Wabbit, FTRL"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "### Neural Nets\n",
199 | "这里讨论的是dense neural nets,即只含有全连接层的网络\n",
200 | "\n",
201 | "自适应算法已高亮+斜体显示\n",
202 | "- Number of neurons per layer\n",
203 | "- Number of layers\n",
204 | "- Optimizers\n",
205 | " - *`SGD + momentum`*\n",
206 | " - Adam/Adadelta/Adagrade/..\n",
207 | "    - In practice lead to more overfitting\n",
208 | "- Batch size\n",
209 | "- Learning rate\n",
210 | "- Regularization\n",
211 | " - L2/L1 for weights\n",
212 | " - Dropout/Dropconnect\n",
213 | " - Static Dropconect"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "- 建议从简单的开始,比如1层或2层,调试代码,确保训练时loss下降\n",
221 | "- 然后尝试找到一个能够过拟合的配置,之后在网络中调整一些东西\n",
222 | "- 神经网络的关键部分之一是优化方法\n",
223 | " - 自适应优化方法的确可以让你更快的拟合数据,但根据我的经验,这也会导致严重的过拟合。普通的SGD收敛速度较慢,但是训练好的模型通常会有更好的泛化效果。Adaptive methods are useful,but in the settings others in classification and regression.\n",
224 | "- Batch Size:事实证明批量过大会导致更多的过拟合。凭经验,batch_size为500就可以认为很大。建议选择32或64左右的值,如果网络仍然过拟合,请尝试减少batch_size,反之增加它。batch_size也不应该太小,否则梯度可能会有太多噪声。在调整batch_size后,必要时也应该重新调整网络的其他超参数。\n",
225 | "- 学习率:学习率不能太高也不能太低。因此,最佳学习率取决于其他参数。通常从一个大的学习率开始,比如0.1,然后逐步去减小它。有一条经验法则,如果你将batch_size增加alpha倍,你也可以提高学习率alpha倍。\n",
226 | "- 早期,人们大多使用L2和L1正则化。如今大多数人都使用dropout正则化。对我来说,就是在数层之后立即将dropout作为第一层。\n",
227 | "- static dropconnect:通常我们有一个密集连接的输入层,比如128个单位。我们将改为一个非常巨大的隐藏层,比如4096个单位,对于一般的比赛来说,这是一个巨大的网络,它会严重过拟合。现在为了规范它,我们将对这一层随机dropout 99%,这是非常强的正则化,实践证明这是可以的。\n",
228 | "\n"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "### Linear models\n",
236 | "- **Scikit-learn**\n",
237 | " - SVC/SVR\n",
238 | " - Sklearn wraps `libLinear` and `libSVM`\n",
239 | " - Compile yourself for multicore support\n",
240 | " - LogisticRegression/LinearRegression + *regularizers*\n",
241 | " - SGDClassifier/SGDRegressor\n",
242 | " \n",
243 | " \n",
244 | "- **Vowpal Wabbit**\n",
245 | " - FTRL"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "- SVM几乎不需要调参,这是最大的益处\n",
253 | "- 最新版的`libLinear`和`libSVM`支持多核处理,但Sklearn封装的版本不支持。所以我们需要自己动手编译这些库以使用此选项。\n",
254 | "- 几乎没有人使用`kernel SVC`,所以这里只讨论SVM\n",
255 | "- 对于不适合在内存中操作的数据,我们可以使用`Vowpal Wabbit`,它以在线的方式实现线性模型的学习。它只能直接从硬盘驱动器中逐行读取数据,永远不会将整个数据集加载到内存中。因此,允许学习非常庞大的数据集。\n",
256 | "- 线性模型的在线学习方法(FTRL)在前段时间特别受欢迎,他是`Vowpal Wabbit`中的实现。"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "### Linear models\n",
264 | "- Regularization parameter(C,alpha,lambda,..)\n",
265 | " - Start with very small value and increase it.\n",
266 | "    - SVC starts to work slower as C increases\n",
267 | "- Regularization type\n",
268 | " - L1/L2/L1+L2 --try each\n",
269 | " - L1 can be used for feature selection"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "- C:对于SVM,我通常会从一个非常小的值开始,比如$10^{-6}$,每次乘以10。从小的值开始,是因为参数C越大,训练时间越长。\n",
277 | "- 选择L1还是L2?答案是两者都尝试,在我看来,它们非常相似。并且L1还有一个好处,可以给出稀疏权重,用于特征选择。"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {},
283 | "source": [
284 | "## Tips\n",
285 | "- **Don't spend too much time tuning hyperparameters**\n",
286 | " - Only if you don't have any more ideas or you have spare computational resources\n",
287 | "\n",
288 | "- **Be patient**\n",
289 | " - It can take thousands of rounds for GBDT or neural nets to fit.\n",
290 | " \n",
291 | "- **Average everything**\n",
292 | " - Over random seed\n",
293 | " - Or over small deviations from optimal parameters\n",
294 | "    - e.g. average max_depth=4,5,6 for an optimal 5"
295 | ]
296 | },
297 | {
298 | "cell_type": "markdown",
299 | "metadata": {},
300 | "source": [
301 | "## 相关链接\n",
302 | "- [调整估计器的超参数(sklearn)](http://scikit-learn.org/stable/modules/grid_search.html)\n",
303 | "- [Python中梯度提升(GBM)中参数调整的完整指南](https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/)"
304 | ]
305 | }
306 | ],
307 | "metadata": {
308 | "kernelspec": {
309 | "display_name": "Python 3",
310 | "language": "python",
311 | "name": "python3"
312 | },
313 | "language_info": {
314 | "codemirror_mode": {
315 | "name": "ipython",
316 | "version": 3
317 | },
318 | "file_extension": ".py",
319 | "mimetype": "text/x-python",
320 | "name": "python",
321 | "nbconvert_exporter": "python",
322 | "pygments_lexer": "ipython3",
323 | "version": "3.5.4"
324 | }
325 | },
326 | "nbformat": 4,
327 | "nbformat_minor": 2
328 | }
329 |
--------------------------------------------------------------------------------
/高级特征工程/Advanced Feature Engineering II.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "以下是Coursera上的[How to Win a Data Science Competition: Learn from Top Kagglers](https://www.coursera.org/learn/competitive-data-science/home/week/3)课程笔记。"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Statistics and distance based features\n",
15 | "该部分专注于此高级特征工程:计算由另一个分组的一个特征的各种统计数据和从给定点的邻域分析得到的特征。\n",
16 | "> groupby and nearest neighbor methods"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "#### 例子:这里有一些CTR任务的数据\n",
24 | "\n",
25 | "\n",
26 | "> 我们可以推断,页面上价格最低的广告会吸引大部分注意力,页面上的其他广告则不那么有吸引力。计算与此相关的特征非常容易:我们可以为每个用户和每个页面,加入广告价格的最低值和最高值作为特征;价格最低的广告所在的位置也可以作为特征。\n",
27 | "\n",
28 | "\n",
29 | "\n",
30 | "代码实现\n",
31 | "\n",
32 | "\n",
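"一个最小示意(假设数据框df含有user_id、page_id、ad_price三列,列名仅作示例):\n",
"``` Python\n",
"gb = df.groupby(['user_id', 'page_id'])['ad_price']\n",
"df['min_price'] = gb.transform('min')  # 该用户在该页面上看到的最低广告价格\n",
"df['max_price'] = gb.transform('max')  # 最高广告价格\n",
"```\n",
"\n",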
33 | "- More feature\n",
34 | " - How many pages user visited\n",
35 | " - Standard deviation of prices\n",
36 | " - Most visited page\n",
37 | " - Many, many more"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "如果没有特征可以像这样使用groupby呢?可以使用最近邻点"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "### Neighbors\n",
52 | "- Explicit group is not needed\n",
53 | "- More flexible\n",
54 | "- Much harder to implement"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "**Examples**\n",
62 | "- Number of houses in 500m, 1000m,..\n",
63 | "- Average price per square meter in 500m, 1000m,..\n",
64 | "- Number of schools/supermarkets/parking lots in 500m, 1000m,..\n",
65 | "- Distance to closest subway station"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "讲师在`Springleaf`比赛中使用了它。"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "#### KNN features in springleaf\n",
80 | "- Mean encode all the variables\n",
81 | "- For every point, find 2000 nearest neighbors using Bray-Curtis metric\n",
82 | "$$\\frac{\\sum{|u_i - v_i|}}{\\sum{|u_i + v_i|}}$$\n",
83 | "- Calculate various features from those 2000 neighbors"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "**Evaluate** \n",
91 | "- Mean target of nearest 5, 10, 15, 500, 2000 neighbors\n",
92 | "- Mean distance to 10 closest neighbors\n",
93 | "- Mean distance to 10 closest neighbors with target 1\n",
94 | "- Mean distance to 10 closest neighbors with target 0"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "# Matrix factorizations for feature extraction"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "- **Example of feature fusion**\n",
109 | ""
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "#### Notes about Matrix Fatorization\n",
117 | "- Can be apply only for some columns\n",
118 | "- Can provide additional diversity\n",
119 | " - Good for ensembles\n",
120 | "- It is a **lossy** transformation. Its efficiency depends on:\n",
121 | " - Particular task\n",
122 | " - Number of latent factors\n",
123 | " - Usually 5-100"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "#### Implementation\n",
131 | "- Several MF methods you can find in sklearn\n",
132 | "- SVD and PCA\n",
133 | "    - Standard tools for Matrix Factorization\n",
134 | "- TruncatedSVD\n",
135 | "    - Works with sparse matrices\n",
136 | "- Non-negative Matrix Factorization(NMF)\n",
137 | "    - Ensures that all latent factors are non-negative\n",
138 | "    - Good for counts-like data\n",
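"\n",
"一个最小示意(基于sklearn,假设X为非负的计数类特征矩阵,仅作示例):\n",
"``` Python\n",
"from sklearn.decomposition import TruncatedSVD, NMF\n",
"\n",
"svd = TruncatedSVD(n_components=10)\n",
"X_svd = svd.fit_transform(X)  # 适用于稀疏矩阵\n",
"\n",
"nmf = NMF(n_components=10)\n",
"X_nmf = nmf.fit_transform(X)  # 潜在因子全部非负,适合计数类数据\n",
"```"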
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "#### NMF for tree-based methods\n",
146 | "> `non-negative matrix factorization`简称NMF,它以一种使数据更适合决策树的方式转换数据。\n",
147 | "\n",
148 | " 可以看出,NMF变换数据形成平行于轴的线。"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "#### 因子分解\n",
156 | "分解矩阵时,可以沿用适用于线性模型的数据变换技巧。\n",
157 | ""
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "### Conclusion\n",
165 | "- Matrix Factorization is a very general approach for dimensionality reduction and feature extraction\n",
166 | "- It can be applied for transforming categorical features into real-valued\n",
167 | "- Many of the tricks suitable for linear models can be useful for MF"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {
173 | "collapsed": true
174 | },
175 | "source": [
176 | "## Feature interactions\n",
177 | "特征值的所有组合"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {
183 | "collapsed": true
184 | },
185 | "source": [
186 | "- **Example:banner selection**\n",
187 | "\n",
188 | "假设我们正在构建一个预测模型,在网站上显示的最佳广告横幅。\n",
189 | "\n",
190 | "|...|category_ad|category_site|...|is_clicked|\n",
191 | "|:-:|:-:|:-:|:-:|:-:|\n",
192 | "|...|auto_part|game_news|...|0|\n",
193 | "|...|music_tickets|music_news|..|1|\n",
194 | "|...|mobile_phones|auto_blog|...|0|\n",
195 | "将广告横幅本身的类别和横幅将显示的网站类别,进行组合将构成一个非常强的特征。\n",
196 | "\n",
197 | "|...|ad_site|...|is_clicked|\n",
198 | "|:-:|:-:|:-:|:-:|\n",
199 | "|...|auto_part \\| game_news|...|0|\n",
200 | "|...|music_tickets \\| music_news|...|1|\n",
201 | "|...|mobile_phones \\| auto_blog|...|0|\n",
202 | "\n",
203 | "构建这两个特征的组合特征`ad_site`"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {
209 | "collapsed": true
210 | },
211 | "source": [
212 | "从技术角度来看,有两种方法可以构建这种交互;图示之后附有一个代码示意。"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {
218 | "collapsed": true
219 | },
220 | "source": [
221 | "- **Example of interactions**\n",
222 | "\n",
223 | "**方法1**\n",
224 | "\n",
225 | "**方法2**\n",
226 | "\n"
227 | ]
228 | },
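{
"cell_type": "markdown",
"metadata": {},
"source": [
"两种方法的代码示意(数据沿用上表,列名为演示用):\n",
"``` Python\n",
"import pandas as pd\n",
"\n",
"df = pd.DataFrame({'category_ad':   ['auto_part', 'music_tickets', 'mobile_phones'],\n",
"                   'category_site': ['game_news', 'music_news', 'auto_blog']})\n",
"\n",
"# 方法1:先把两个类别拼接成一个新类别,再按普通类别特征处理(label/mean encoding 等)\n",
"df['ad_site'] = df['category_ad'] + '_' + df['category_site']\n",
"\n",
"# 方法2:分别 one-hot,再对两组哑变量逐对相乘得到交互列\n",
"ohe_ad = pd.get_dummies(df['category_ad'], prefix='ad', dtype=int)\n",
"ohe_site = pd.get_dummies(df['category_site'], prefix='site', dtype=int)\n",
"inter = pd.DataFrame({a + '_x_' + s: ohe_ad[a] * ohe_site[s]\n",
"                      for a in ohe_ad.columns for s in ohe_site.columns})\n",
"```"
]
},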
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "- **相似的想法也可用于数值变量**\n",
234 | "\n",
235 | "事实上,这不限于乘法操作,还可以是其他的\n",
236 | "- Multiplication\n",
237 | "- Sum\n",
238 | "- Diff\n",
239 | "- Division\n",
240 | "- .."
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "### **Practical Notes**\n",
248 | "\n",
249 | "- We have a lot of possible interactions: N*N for N features.\n",
250 | " - a. Even more if we use several types of interactions\n",
251 | "- Need to reduce their number\n",
252 | " - a. Dimensionality reduction\n",
253 | " - b. Feature selection"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "通过这种方法生成了大量的特征,可以使用特征选择或降维的方法减少特征。以下用特征选择举例说明\n",
261 | ""
262 | ]
263 | },
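{
"cell_type": "markdown",
"metadata": {},
"source": [
"一个用随机森林特征重要性做特征选择的示意(数据为随机生成,保留数量等均为演示取值):\n",
"``` Python\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"rng = np.random.RandomState(42)\n",
"X_inter = pd.DataFrame(rng.rand(200, 40),\n",
"                       columns=['f{}'.format(i) for i in range(40)])\n",
"y = (X_inter['f0'] * X_inter['f1'] > 0.25).astype(int)\n",
"\n",
"rf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"rf.fit(X_inter, y)\n",
"\n",
"# 按特征重要性排序,只保留最重要的前10个交互特征\n",
"importances = pd.Series(rf.feature_importances_, index=X_inter.columns)\n",
"top_cols = importances.sort_values(ascending=False).head(10).index\n",
"X_selected = X_inter[top_cols]\n",
"```"
]
},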
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "### Interactions' order\n",
269 | "- We looked at 2nd order interactions.\n",
270 | "- Such approach can be generalized for higher orders.\n",
271 | "- It is hard to do generation and selection automatically.\n",
272 | "- Manual building of high-order interactions is some kind of art."
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "### Extract features from DT\n",
280 | "\n",
281 | "> 看一下决策树:让我们将每个叶子映射成二进制特征,样本所落入叶子的索引也可以用作新的类别特征的取值。如果我们使用的不是单棵树而是树的集成(例如随机森林),那么可以对其中每棵树都做这种操作。这是一种提取高阶交互的强大方法。"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "- **How to use it**\n",
289 | "\n",
290 | "In sklearn:\n",
291 | "``` Python\n",
292 | "tree_model.apply()\n",
293 | "```\n",
294 | "In xgboost:\n",
295 | "``` Python\n",
296 | "booster.predict(pred_leaf=True)\n",
297 | "```"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "### Conclusion\n",
305 | "- We looked at ways to build an interaction of categorical attributes\n",
306 | "- Extended this approach to real-valued features\n",
307 | "- Learn how to extract features via decision trees"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "## t-SNE\n",
315 | "用于探索性数据分析(EDA),也可以被视为一种从数据中提取特征的方法。"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "metadata": {},
321 | "source": [
322 | "### Practical Notes\n",
323 | "- Result heavily depends on hyperparameters(perplexity)\n",
324 | " - Good practice is to use several projections with different perplexities(5-100)\n",
325 | "- Due to its stochastic nature, tSNE provides different projections even for the same data/hyperparameters\n",
326 | " - Train and test should be projected together\n",
327 | "- tSNE runs for a long time with a big number of features\n",
328 | " - It is common to do dimensionality reduction before projection.\n",
329 | "- Implementation of tSNE can be found in sklearn library.\n",
330 | "- The stand-alone Python package `tsne` is often preferred due to its faster speed.(下方给出一个 sklearn 的用法示意)"
331 | ]
332 | },
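{
"cell_type": "markdown",
"metadata": {},
"source": [
"一个 sklearn 的用法示意(以 digits 数据集为例,perplexity 取值仅作演示):\n",
"``` Python\n",
"from sklearn.datasets import load_digits\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.manifold import TSNE\n",
"\n",
"X, y = load_digits(return_X_y=True)\n",
"\n",
"# 先用 PCA 降维,减少 t-SNE 的运行时间\n",
"X_pca = PCA(n_components=30, random_state=42).fit_transform(X)\n",
"\n",
"# 尝试不同的 perplexity,得到不同的投影\n",
"for perplexity in (5, 30, 50):\n",
"    emb = TSNE(n_components=2, perplexity=perplexity,\n",
"               random_state=42).fit_transform(X_pca)\n",
"    print(perplexity, emb.shape)\n",
"```"
]
},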
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "### Conclusion\n",
338 | "- tSNE is a great tool for visualization\n",
339 | "- It can be used as feature as well\n",
340 | "- Be careful with interpretation of results\n",
341 | "- Try different perplexities"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "metadata": {},
347 | "source": [
348 | "#### 矩阵分解:\n",
349 | "- [矩阵分解方法概述(sklearn)](http://scikit-learn.org/stable/modules/decomposition.html)\n",
350 | "#### t-SNE:\n",
351 | "- [多核t-SNE实现](https://github.com/DmitryUlyanov/Multicore-TSNE)\n",
352 | "- [流形学习方法的比较(sklearn)](http://scikit-learn.org/stable/auto_examples/manifold/plot_compare_methods.html)\n",
353 | "- [如何有效使用t-SNE(distill.pub博客)](https://distill.pub/2016/misread-tsne/)\n",
354 | "- [tSNE主页(Laurens van der Maaten)](https://lvdmaaten.github.io/tsne/)\n",
355 | "- [示例:不同perplexity下的tSNE(sklearn)](http://scikit-learn.org/stable/auto_examples/manifold/plot_t_sne_perplexity.html#sphx-glr-auto-examples-manifold-plot-t-sne-perplexity-py)\n",
356 | "#### 特征交互:\n",
357 | "- [Facebook Research关于从树中提取类别特征的论文](https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/)\n",
358 | "- [示例:使用树集成进行特征转换(sklearn)](http://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html)"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {
365 | "collapsed": true
366 | },
367 | "outputs": [],
368 | "source": []
369 | }
370 | ],
371 | "metadata": {
372 | "kernelspec": {
373 | "display_name": "Python 3",
374 | "language": "python",
375 | "name": "python3"
376 | },
377 | "language_info": {
378 | "codemirror_mode": {
379 | "name": "ipython",
380 | "version": 3
381 | },
382 | "file_extension": ".py",
383 | "mimetype": "text/x-python",
384 | "name": "python",
385 | "nbconvert_exporter": "python",
386 | "pygments_lexer": "ipython3",
387 | "version": "3.5.4"
388 | }
389 | },
390 | "nbformat": 4,
391 | "nbformat_minor": 2
392 | }
393 |
--------------------------------------------------------------------------------
/高级特征工程/Advanced Feature Engineering I.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Mean encodings\n",
8 | "以下是Coursera上的[How to Win a Data Science Competition: Learn from Top Kagglers](https://www.coursera.org/learn/competitive-data-science/home/week/3)课程笔记。\n",
9 | "### 学习目标\n",
10 | "- Regularize mean encodings\n",
11 | "- Extend mean encodings\n",
12 | "- Summarize the concept of mean encodings"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Concept of mean encoding\n",
20 | "均值编码是一种非常强大的技术,它有很多名字,例如:likelihood encoding、target encoding,但这里我们叫它均值编码。我们举一个二分类任务的例子。"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "||feature|feature_label|feature_mean|target|\n",
28 | "|:-:|:-:|:-:|:-:|:-:|\n",
29 | "|0|Moscow|1|0.4|0|\n",
30 | "|1|Moscow|1|0.4|1|\n",
31 | "|2|Moscow|1|0.4|1|\n",
32 | "|3|Moscow|1|0.4|0|\n",
33 | "|4|Moscow|1|0.4|0|\n",
34 | "|5|Tver|2|0.8|1|\n",
35 | "|6|Tver|2|0.8|1|\n",
36 | "|7|Tver|2|0.8|1|\n",
37 | "|8|Tver|2|0.8|0|\n",
38 | "|9|Klin|0|0.0|0|\n",
39 | "|10|klin|0|0.0|0|\n",
40 | "|11|Tver|2|1|1|"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "我们想对`feature`变量进行编码,最直接、常用的方式就是`label encoding`,这就是第二列数据。\n",
48 | "平均编码以不同的方式去完成这个任务,它用每个城市自身对应的目标均值来进行编码。例如,对于`Moscow`,我们有五行,三个0和两个1。 所以我们用2除以5或0.4对它进行编码。用同样的方法处理其他城市。"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "现在了解一下细节。当我们的数据集非常大,包含数百个不同的城市,让我们试着比较一下。我们绘制了0,1 class的直方图。\n",
56 | "\n",
57 | "在`label encoding`的情况下,我们得到的图看起来没有任何逻辑顺序。\n",
58 | "\n",
59 | "但是当我们使用`mean encoding`对目标进行编码时,类看起来更加可分了,像是被排序过。"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "一般来说,特征与目标之间的依赖关系越复杂、越非线性,`均值编码`越有效。例如树模型的深度有限,可以用均值编码来弥补这一限制,从而获得更好的分数。"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "以上只是一个例子,传递的是一种思想,实际上可以做很多类似的操作。\n",
74 | "#### Ways to use target variable\n",
75 | "> Goods - number of ones in a group,\n",
76 | " Bads - number of zeros\n",
77 | "\n",
78 | "- $Likelihood = \\frac{Goods}{Goods+Bads} = mean(target)$\n",
79 | "- $\\text{Weight of Evidence} = \\ln(\\frac{Goods}{Bads}) \\times 100$\n",
80 | "- $Count = Goods = sum(target)$\n",
81 | "- $Diff = Goods - Bads$(计算示意见下一单元格)"
82 | ]
83 | },
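{
"cell_type": "markdown",
"metadata": {},
"source": [
"上述几种统计量的计算示意(数据沿用前文的城市二分类例子,仅作演示):\n",
"``` Python\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"df = pd.DataFrame({'city': ['Moscow']*5 + ['Tver']*4 + ['Klin']*2,\n",
"                   'target': [0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0]})\n",
"\n",
"grp = df.groupby('city')['target']\n",
"goods = grp.sum()              # 每组中1的个数\n",
"bads = grp.count() - goods     # 每组中0的个数\n",
"\n",
"stats = pd.DataFrame({'likelihood': goods / (goods + bads),\n",
"                      'woe': np.log(goods / bads) * 100,  # Goods或Bads为0时会得到inf,实际使用需先平滑\n",
"                      'count': goods,\n",
"                      'diff': goods - bads})\n",
"```"
]
},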
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "构造`Mean encoding`的例子\n",
89 | "\n",
90 | "``` Python\n",
91 | "means = X_tr.groupby(col).target.mean()  # 在训练集上按col计算目标均值\n",
92 | "train_new[col+'_mean_target'] = train_new[col].map(means)\n",
93 | "val_new[col+'_mean_target'] = val_new[col].map(means)\n",
94 | "```"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "将它运用到模型中,出现了严重的过拟合,但是为什么呢?\n",
102 | "\n",
103 | "- Train\n",
104 | "\n",
105 | "||feature|feature_label|feature_mean|target|\n",
106 | "|:-:|:-:|:-:|:-:|:-:|\n",
107 | "|8|Tver|2|0.8|0|\n",
108 | "|9|Klin|0|0.0|0|\n",
109 | "\n",
110 | "- Validation\n",
111 | "\n",
112 | "||feature|feature_label|feature_mean|target|\n",
113 | "|:-:|:-:|:-:|:-:|:-:|\n",
114 | "|10|klin|0|0.0|0|\n",
115 | "|11|Tver|2|1|1|\n",
116 | "\n",
117 | "> When they are categorized, it's pretty common to get results like in an example, target 0 in train and target 1 in validation. Mean encodings turns into a perfect feature for such categories. That's why we immediately get very good scores on train and fail hardly on validation. "
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "## Regularization\n",
125 | "在上一节,我们意识到均值编码不能按原样使用,需要对训练数据进行某种正则化。现在我们将实施四种不同的正则化方法。\n",
126 | "- 1.CV loop inside training data;\n",
127 | "- 2.Smoothing;\n",
128 | "- 3.Adding random noise;\n",
129 | "- 4.Sorting and calculating expanding mean.\n",
130 | "\n",
131 | "### Conclusion\n",
132 | "- There are a lot ways to regularize mean encodings\n",
133 | "- Unending battle with target variable leakage\n",
134 | "- **CV loop** or **Expanding mean** for practical tasks."
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "### 1.KFold scheme\n",
142 | "\n",
143 | "通常做四到五折的交叉验证就能得到不错的结果,无需调整此数字。"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "代码例子\n",
151 | ""
152 | ]
153 | },
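{
"cell_type": "markdown",
"metadata": {},
"source": [
"一个 KFold 正则化均值编码的示意实现(并非原图代码,数据仅为演示):\n",
"``` Python\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import KFold\n",
"\n",
"train = pd.DataFrame({'city': ['Moscow']*5 + ['Tver']*4 + ['Klin']*2,\n",
"                      'target': [0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0]})\n",
"\n",
"train['city_mean_target'] = np.nan\n",
"global_mean = train['target'].mean()\n",
"kf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
"\n",
"# 每一折只用折外(out-of-fold)数据估计均值,再映射到当前折\n",
"for tr_idx, val_idx in kf.split(train):\n",
"    means = train.iloc[tr_idx].groupby('city')['target'].mean()\n",
"    idx = train.index[val_idx]\n",
"    train.loc[idx, 'city_mean_target'] = train.loc[idx, 'city'].map(means)\n",
"\n",
"# 折外数据中未出现过的类别用全局均值填充\n",
"train['city_mean_target'] = train['city_mean_target'].fillna(global_mean)\n",
"```"
]
},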
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "这个方法看起来已经完全避免了目标变量的泄露,但事实并非如此。\n",
159 | "这里我们通过`留一法`对`Moscow`进行编码"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "\n",
167 | "||feature|feature_mean|target|\n",
168 | "|:-:|:-:|:-:|:-:|\n",
169 | "|0|Moscow|0.50|0|\n",
170 | "|1|Moscow|0.25|1|\n",
171 | "|2|Moscow|0.25|1|\n",
172 | "|3|Moscow|0.50|0|\n",
173 | "|4|Moscow|0.50|0|"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "对于第一行,我们得到0.5,因为其余行中有两个1和两个0;同样,对于第二行我们得到0.25,依此类推。但仔细观察由此产生的特征:它完美地分割了数据,特征值大于等于0.5的行目标为0,其余行的目标为1。我们没有显式地使用目标变量,但编码已经带有偏置。\n",
181 | "\n",
182 | "\n",
183 | "目标变量泄露的影响在`KFold scheme`下仍然存在,只是程度温和一些。\n",
184 | "\n",
185 | "在实践中,如果您有足够的数据并使用四或五折,这种正则化策略下的编码通常能正常工作。只是要小心,并使用正确的验证方式。"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "### 2.Smoothing\n",
193 | "- Alpha controls the amount of regularization\n",
194 | "- Only works together with some other regularization method\n",
195 | "\n",
196 | "$$\\frac{mean(target)*nrows + globalmean*alpha}{nrows+alpha}$$\n",
197 | "它具有控制正则化量的超参数alpha。 当alpha为零时,我们没有正则化,并且当alpha接近无穷大时,一切都变成了globalmean。"
198 | ]
199 | },
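{
"cell_type": "markdown",
"metadata": {},
"source": [
"用一小段代码示意上面的平滑公式(数据与 alpha 的取值仅为演示):\n",
"``` Python\n",
"import pandas as pd\n",
"\n",
"train = pd.DataFrame({'city': ['Moscow']*5 + ['Tver']*4 + ['Klin']*2,\n",
"                      'target': [0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0]})\n",
"\n",
"alpha = 10.0\n",
"global_mean = train['target'].mean()\n",
"\n",
"# 每个类别的 mean(target) 与 nrows,按公式向全局均值收缩\n",
"agg = train.groupby('city')['target'].agg(['mean', 'size'])\n",
"smoothed = (agg['mean'] * agg['size'] + global_mean * alpha) / (agg['size'] + alpha)\n",
"train['city_mean_target'] = train['city'].map(smoothed)\n",
"```"
]
},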
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "在某种意义上,alpha等于我们可以信任的类别大小。也可以使用其他一些公式,基本上任何惩罚编码类别的东西都可以被认为是`smoothing`。"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "### 3.Noise\n",
212 | "- Noise degrades the quality of encoding\n",
213 | "\n",
214 | "通过添加噪声,可以降低编码在训练数据上的质量(从而抑制过拟合)。这种方法很不稳定,很难调好,主要问题在于需要添加多少噪声。\n",
215 | "\n",
216 | "- How much noise should we add?\n",
217 | "\n",
218 | "噪声太多会把这个特征变成垃圾,噪声太少则起不到足够的正则化作用,你需要仔细地微调它。\n",
219 | "\n",
220 | "- Usually used together with LOO(Leave one out).\n",
221 | "\n",
222 | "这种方法通常与LOO正则化一起使用。如果你没有很多时间,它可能不是最好选择。"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "### 4.Expanding mean\n",
230 | "- Least amount of leakage\n",
231 | "- No hyper parameters\n",
232 | "- Irregular encoding quality\n",
233 | "- Built-in in CatBoost."
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "代码例子\n",
241 | "``` Python\n",
242 | "cumsum = df_tr.groupby(col)['target'].cumsum() - df_tr['target']\n",
243 | "cumcnt = df_tr.groupby(col).cumcount()\n",
244 | "train_new[col + '_mean_target'] = cumsum / cumcnt  # 展开均值(每个类别首次出现的行会得到NaN)\n",
245 | "```"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "cumsum存储到当前行为止(不含当前行)目标变量的累计和,cumcnt存储累积计数。该方法引入的目标变量泄漏量最少,唯一的缺点是编码质量不均匀。但这不是大问题,可以基于数据的不同排列分别计算编码,再对得到的模型取平均。"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {},
258 | "source": [
259 | "它被用于CatBoost库中,证明了它在分类数据集上表现非常出色。"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "## Extensions and generalizations\n",
267 | "- 如何在回归和多分类任务中进行`Mean encoding`\n",
268 | "- 如何将编码应用于具有多对多关系的领域\n",
269 | "- 我们可以根据时间序列中的目标构建哪些特征\n",
270 | "- 对特征交互和数值特征进行编码"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "### Many-to-many relations"
278 | ]
279 | },
280 | {
281 | "cell_type": "markdown",
282 | "metadata": {},
283 | "source": [
284 | "- 原始数据\n",
285 | "\n",
286 | "|User_id|APPS|Target|\n",
287 | "|:-:|:-:|:-:|\n",
288 | "|10|APP1;APP2;APP3|0|\n",
289 | "|11|APP4;APP1|1|\n",
290 | "|12|APP2|1|\n",
291 | "|100|APP3;APP9|0|"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "现在考虑一个例子:基于用户智能手机上已安装的APP,预测用户是否会安装某个APP,这是一个二分类任务。从表中数据可知,每个用户可能有多个应用程序,每个应用程序也被多个用户使用,因此这是多对多的关系。麻烦在于,如何从多对多的关系中提取均值编码。"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "- 长数据表示\n",
306 | "\n",
307 | "|User_id|APP_id|Target|\n",
308 | "|:-:|:-:|:-:|\n",
309 | "|10|APP1|0|\n",
310 | "|10|APP2|0|\n",
311 | "|10|APP3|0|\n",
312 | "|11|APP4|1|\n",
313 | "|11|APP1|1|"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "metadata": {},
319 | "source": [
320 | "把原始数据转为长数据表示,如上表。使用此表,我们可以自然地计算APP的`均值编码`。但是如何将其映射回用户呢?\n",
321 | "\n",
322 | "每个用户都对应多个APP,例如用户10安装了“APP1、APP2、APP3”,于是可以用这些APP的均值编码组成的向量(如 0.1, 0.2, 0.1)来表示该用户,再从这个向量中提取各种统计量,比如均值、标准差、最大最小值等等。"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "### Time series\n",
330 | "- Time structure allows us to make a lot of complicated features.\n",
331 | "- Rolling statistics of target variable."
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "一方面,时间结构是一种限制;另一方面,它允许我们构造很多复杂的特征。考虑一个例子:"
339 | ]
340 | },
341 | {
342 | "cell_type": "markdown",
343 | "metadata": {},
344 | "source": [
345 | "|Day|User|Spend|Amount|Prev_user|Prev_spend_avg|\n",
346 | "|:-:|:-:|:-:|:-:|:-:|:-:|\n",
347 | "|1|101|FOOD|2.0|0.0|0.0|\n",
348 | "|1|101|GAS|4.0|0.0|0.0|\n",
349 | "|1|102|FOOD|3.0|0.0|0.0|\n",
350 | "|2|101|GAS|4.0|6.0|4.0|\n",
351 | "|2|101|TV|8.0|6.0|0.0|\n",
352 | "|2|102|FOOD|2.0|3.0|2.5|"
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {},
358 | "source": [
359 | "我们需要预测用户会在哪个类别上花钱。数据里有两天、两个用户和三个支出类别。一些好的特征是:用户在前一天的消费总额、所有用户在给定类别上的平均消费金额。在第1天,用户101花费6美元,用户102花费3美元,因此这些数字可以作为第2天对应行的特征(Prev_user)。同样,可以按类别计算平均消费金额(Prev_spend_avg)。\n",
360 | "\n",
361 | "我们拥有的数据越多,可以创造的特征就越复杂。"
362 | ]
363 | },
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {},
367 | "source": [
368 | "### Interactions and numerical features\n",
369 | "- Analyzing fitted model\n",
370 | "- Binning numeric and selecting interactions"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "在实践中,通常需要对数值特征进行编码,以及对特征组合进行编码。要编码数值特征,只需要先对其分箱,再将其当作类别特征处理。下面以未经任何编码的原始特征和决策树模型为例。"
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {},
383 | "source": [
384 | ""
385 | ]
386 | },
387 | {
388 | "cell_type": "markdown",
389 | "metadata": {},
390 | "source": [
391 | "- 如何为数字特征分组?\n",
392 | "如果某个数值特征有很多分裂点,则表示它与目标之间存在较复杂的依赖关系,值得尝试对它进行编码。此外,这些分裂点本身就可以用来对该特征分箱,所以通过分析模型结构,我们既可以识别这些可疑的数值特征,又可以找到很好的分箱方式。\n",
393 | "\n",
394 | "- 如何挑选特征组合?\n",
395 | "先看决策树中如何提取交互特征。参照上图,如果两个特征出现在相邻的节点中,则这两个特征存在相互作用。基于这一点,我们可以遍历模型中的所有树,统计每个特征组合出现的次数,最常见的组合可能值得进行均值编码。\n",
396 | "例如,如果我们发现`feature1`和`feature2`这一对特征最常见,就可以在数据中把这两个特征拼接起来,再对拼接结果做均值编码,即对交互进行编码。\n"
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {},
402 | "source": [
403 | "### Correct validation reminder\n",
404 | "- Local experiments:\n",
405 | " - Estimate encodings on X_tr\n",
406 | " - Map them to X_tr and X_val\n",
407 | " - Regularize on X_tr\n",
408 | " - Validate model on X_tr/X_val split\n",
409 | "- Submission:\n",
410 | " - Estimate encodings on whole Train data\n",
411 | " - Map them on Train and Test\n",
412 | " - Regularize on Train\n",
413 | " - Fit on Train\n",
414 | "\n",
415 | ""
416 | ]
417 | },
418 | {
419 | "cell_type": "markdown",
420 | "metadata": {},
421 | "source": [
422 | "## End\n",
423 | "- **Main advantages:**\n",
424 | " - Compact transformation of categorical variables\n",
425 | " - Powerful basis for feature engineering\n",
426 | "- **Disadvantages:**\n",
427 | " - Need careful validation, there are a lot of ways to overfit\n",
428 | " - Significant improvements only on specific datasets"
429 | ]
430 | }
431 | ],
432 | "metadata": {
433 | "kernelspec": {
434 | "display_name": "Python 3",
435 | "language": "python",
436 | "name": "python3"
437 | },
438 | "language_info": {
439 | "codemirror_mode": {
440 | "name": "ipython",
441 | "version": 3
442 | },
443 | "file_extension": ".py",
444 | "mimetype": "text/x-python",
445 | "name": "python",
446 | "nbconvert_exporter": "python",
447 | "pygments_lexer": "ipython3",
448 | "version": "3.5.4"
449 | }
450 | },
451 | "nbformat": 4,
452 | "nbformat_minor": 2
453 | }
454 |
--------------------------------------------------------------------------------
/6.数据预处理笔记.md:
--------------------------------------------------------------------------------
1 | > 记录实战过程中在数据预处理环节用到的方法
2 |
3 | # 数据预处理
4 |
5 |
6 | ## 常用方法
7 | #### 生成随机数序列
8 | ``` Python
9 | randIndex = random.sample(range(trainSize, len(trainData_copy)), 5*trainSize)
10 | ```
11 | #### 计算某个值出现的次数
12 | ``` Python
13 | titleSet = set(titleData)
14 | for i in titleSet:
15 | count = titleData.count(i)
16 | ```
17 | 用文本出现的次数替换非空的地方。词袋模型 Word Count
18 | ``` Python
19 | titleData = allData['title']
20 | titleSet = set(list(titleData))
21 | title_counts = titleData.value_counts()
22 | for i in titleSet:
23 | if isNaN(i):
24 | continue
25 | count = title_counts[i]
26 | titleData.replace(i, count, inplace=True)  # Series.replace 没有 axis 参数
27 | title = pd.DataFrame(titleData)
28 | allData['title'] = title
29 | ```
30 | #### 判断值是否为NaN
31 | ``` Python
32 | def isNaN(num):
33 | return num != num
34 | ```
35 | #### Matplotlib在jupyter中显示图像
36 | ```
37 | %matplotlib inline
38 | ```
39 | #### 处理日期
40 | ``` Python
41 | birth = trainData['birth_date']
42 | birthDate = pd.to_datetime(birth)
43 | end = pd.datetime(2018, 1, 1)
44 | # 计算天数
45 | birthDay = end - birthDate
46 | birthDay.astype('timedelta64[D]')
47 | # timedelta64 转到 int64
48 | trainData['birth_date'] = birthDay.dt.days
49 | ```
50 |
51 | #### 计算多列数的平均值等
52 | ``` Python
53 | trainData['operate_able'] = trainData.iloc[ : , 20:53].mean(axis=1)
54 | trainData['local_able'] = trainData.iloc[ : , 53:64].mean(axis=1)
55 | ```
56 | ### 数据分列(对列进行one-hot)
57 | ``` Python
58 | train_test = pd.get_dummies(train_test,columns=["Embarked"])
59 | train_test = pd.get_dummies(train_test,columns = ['SibSp','Parch','SibSp_Parch'])
60 | ```
61 | ### 正则提取指定内容
62 | df['Name'].str.extract()是提取函数,配合正则一起使用
63 | ``` Python
64 | train_test['Name1'] = train_test['Name'].str.extract('.+,(.+)').str.extract( '^(.+?)\.').str.strip()
65 | ```
66 | ### 根据数据是否缺失进行处理
67 | ``` Python
68 | train_test.loc[train_test["Age"].isnull() ,"age_nan"] = 1
69 | train_test.loc[train_test["Age"].notnull() ,"age_nan"] = 0
70 | ```
71 |
72 | ### 按区间分割-数据离散化
73 | 按给定的分割点对数据进行分箱,返回每个值所属的区间(或对应标签),区间为左闭右开(right=False)
74 | ``` Python
75 | # 将年龄划分为五个阶段:10以下,10-18,18-30,30-50,50以上
76 | train_test['Age'] = pd.cut(train_test['Age'], bins=[0,10,18,30,50,100],labels=[1,2,3,4,5])
77 | ```
78 |
79 |
80 |
81 | ## Numpy部分
82 | #### where索引列表
83 | ``` Python
84 | delLocal = np.array(np.where(np.array(trainData['acc_now_delinq']) == 1))
85 | ```
86 | #### permutation(x) 随机生成一个排列或返回一个range
87 | 如果x是一个多维数组,则只会沿着它的第一个索引进行混洗。
88 | ``` Python
89 | import numpy as np
90 |
91 | shuffle_index = np.random.permutation(60000)
92 | X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
93 | ```
94 | #### numpy.argmax() 返回沿轴的最大值的`索引`
95 | 返回沿轴的最大值的索引。
96 | ``` Python
97 | np.argmax(some_digit_scores)
98 | ```
99 | - a : array_like; 输入数组
100 | - axis : int, optional; 默认情况下,索引是放在平面数组中,否则沿着指定的轴。
101 | - out : array, optional; 如果提供,结果将被插入到这个数组中。它应该是适当的形状和dtype。
102 | #### numpy.dot(a, b, out=None) 计算两个数组的点积
103 | ``` Python
104 | >>> np.dot(3, 4)
105 | ```
106 | #### numpy.random.randn() 从标准正态分布返回样本
107 | ``` Python
108 | >>> np.random.seed(42) # 可设置随机数种子
109 | >>> theta = np.random.randn(2,1)
110 | array([[ 4.21509616],
111 | [ 2.77011339]])
112 | ```
113 | 参数
114 | - d0, d1, ..., dn : int, optional;返回的数组维度,应该都是正值。如果没有给出,将返回一个Python float值。
115 |
116 | ### numpy.linspace() 在指定区间返回间隔均匀的样本[start, stop]
117 | ``` Python
118 | X_new=np.linspace(-3, 3, 100).reshape(100, 1)
119 | X_new_poly = poly_features.transform(X_new)
120 | y_new = lin_reg.predict(X_new_poly)
121 | plt.plot(X, y, "b.")
122 | plt.plot(X_new, y_new, "r-", linewidth=2, label="Predictions")
123 | plt.xlabel("$x_1$", fontsize=18)
124 | plt.ylabel("$y$", rotation=0, fontsize=18)
125 | plt.legend(loc="upper left", fontsize=14)
126 | plt.axis([-3, 3, 0, 10])
127 | save_fig("quadratic_predictions_plot")
128 | plt.show()
129 | ```
130 | - start : scalar;序列的起始值
131 | - stop : scalar;序列的结束值
132 | - num : int, optional;要生成的样本数量,默认为50个。
133 | - endpoint : bool, optional;若为True则包括结束值,否则不包括结束值,即[start, stop)区间。默认为True。
134 | - dtype : dtype, optional;输出数组的类型,若未给出则从输入数据推断类型。
135 |
136 |
137 |
138 |
139 | ## Pandas部分
140 | #### Jupyter notebook中设置最大显示行列数
141 | ``` Python
142 | pd.set_option('display.max_columns', 64)
143 | pd.set_option('display.max_rows', 1000000)
144 | ```
145 |
146 | #### 读入数据
147 | ``` Python
148 | homePath = 'game'
149 | trainPath = os.path.join(homePath, 'train.csv')
150 | testPath = os.path.join(homePath, 'test.csv')
151 | trainData = pd.read_csv(trainPath)
152 | testData = pd.read_csv(testPath)
153 | ```
154 |
155 | #### 数据简单预览
156 | - ~head()
157 | 获取前五行数据,供快速参考。
158 | - ~info()
159 | 获取总行数、每个属性的类型、非空值的数量。
160 | - ~value_counts()
161 | 获取每个值出现的次数
162 | - ~hist()
163 | 直方图的形式展示数值型数据
164 | - ~describe()
165 | 简要显示数据的数字特征;例如:总数、平均值、标准差、最大值最小值、25%/50%/75%值
166 |
167 | #### 拷贝数据
168 | ``` Python
169 | mthsMajorTest = fullData.copy()
170 | ```
171 | #### 数据相关性
172 | - 计算相关性矩阵
173 | ``` Python
174 | corrMatrix = trainData.corr()
175 | corrMatrix['acc_now_delinq'].sort_values(ascending=False) # 降序排列
176 | ```
177 | - 相关系数矩阵图
178 | ``` Python
179 | import numpy
180 | correlations = data.corr() #计算变量之间的相关系数矩阵
181 | # plot correlation matrix
182 | fig = plt.figure() #调用figure创建一个绘图对象
183 | ax = fig.add_subplot(111)
184 | cax = ax.matshow(correlations, vmin=-1, vmax=1) #绘制热力图,从-1到1
185 | fig.colorbar(cax) #将matshow生成热力图设置为颜色渐变条
186 | ticks = numpy.arange(0,9,1) #生成0-9,步长为1
187 | ax.set_xticks(ticks) #生成刻度
188 | ax.set_yticks(ticks)
189 | ax.set_xticklabels(names) #生成x轴标签
190 | ax.set_yticklabels(names)
191 | plt.show()
192 | ```
193 | 颜色越深表明二者相关性越强
194 |
195 | #### 删除某列
196 | ``` Python
197 | trainData.drop('acc_now_delinq', axis=1, inplace=True)
198 | ```
199 | ``` Python
200 | # 用 del 直接删除列,但并不会立即释放内存
201 | del fullData['member_id']
202 | ```
203 | #### 列表类型转换
204 | ``` Python
205 | termData = list(map(int, termData))
206 | ```
207 | #### 替换数据
208 | ``` Python
209 | gradeData.replace(['A','B','C','D','E','F','G'], [7,6,5,4,3,2,1],inplace=True)
210 | ```
211 |
212 | #### 数据集合并
213 | ``` Python
214 | allData = trainData.append(testData)
215 | ```
216 | ``` Python
217 | allData = pd.concat([trainData, testData], axis=0, ignore_index=True)
218 | ```
219 |
220 | #### 分割
221 | ``` Python
222 | termData = termData.str.split(' ', n=2, expand=True)[1]
223 | ```
224 |
225 | #### ~where() 相当于三目运算符( ? : )
226 | 通过判断自身的值来修改自身对应的值,相当于三目运算符( ? : )
227 | ``` Python
228 | housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
229 | ```
230 | - cond 如果为True则保持原始值,若为False则使用第二个参数other替换值。
231 | - other 替换的目标值
232 | - inplace 是否在数据上执行操作
233 | #### np.ceil(x) 向上取整
234 | - x 输入的数据,对每个元素向上取整
236 | ``` Python
237 | housing["income_cat"] = np.ceil(housing["median_income"] / 1.5) # 每个元素除以1.5后向上取整
238 | ```
239 | #### ~loc[] 纯粹基于标签的索引器
240 |
241 | ``` Python
242 | strat_train_set = housing.loc[train_index]
243 | strat_test_set = housing.loc[test_index]
244 | ```
245 |
246 | #### ~dropna() 返回略去丢失数据部分后的剩余数据
247 | Return object with labels on given axis omitted where alternately any or all of the data are missing
248 | ``` Python
249 | sample_incomplete_rows.dropna(subset=["total_bedrooms"])
250 | ```
251 |
252 | #### ~fillna() 用指定的方法填充
253 | ``` Python
254 | # 用中位数填充
255 | median = housing["total_bedrooms"].median()
256 | sample_incomplete_rows["total_bedrooms"].fillna(median, inplace=True)
257 | ```
258 | #### 重置索引
259 | ``` Python
260 | allData = subTrain.reset_index()
261 | ```
262 |
263 |
264 |
265 |
266 | # 缺失值处理
267 | ## Sklearn 部分
268 | #### 数据标准化
269 | ``` Python
270 | from sklearn.preprocessing import StandardScaler
271 | ss = StandardScaler()
272 | ss.fit(mthsMajorTrain)
273 | mthsMajorTrain_d = ss.transform(mthsMajorTrain)
274 | mthsMajorTest_d = ss.transform(mthsMajorTest)
275 | ```
276 |
277 | #### 预测缺失值
278 | ``` Python
279 | from sklearn import linear_model
280 | lin = linear_model.BayesianRidge()
281 | lin.fit(mthsMajorTrain_d, mthsMajorTrainLabel)
282 | trainData.loc[(trainData['mths_since_last_major_derog'].isnull()), 'mths_since_last_major_derog'] = lin.predict(mthsMajorTest_d)
283 | ```
284 | #### Lightgbm提供的特征重要性
285 | ``` Python
286 | import lightgbm as lgb
287 |
288 | params = {
289 | 'task': 'train',
290 | 'boosting_type': 'gbdt',
291 | 'objective': 'regression',
292 | 'metric': {'l2', 'auc'},
293 | 'num_leaves': 31,
294 | 'learning_rate': 0.05,
295 | 'feature_fraction': 0.9,
296 | 'bagging_fraction': 0.8,
297 | 'bagging_freq': 5,
298 | 'verbose': 0
299 | }
300 |
301 | lgb_train = lgb.Dataset(totTrain[:400000], totLabel[:400000])
302 | lgb_eval = lgb.Dataset(totTrain[400000:], totLabel[400000:])
303 | gbm = lgb.train(params,
304 | lgb_train,
305 | num_boost_round=20,
306 | valid_sets=lgb_eval,
307 | early_stopping_rounds=5)
308 | lgb.plot_importance(gbm, figsize=(10,10))
309 | ```
310 | 对于缺失值,一般手动挑选几个重要的特征,然后进行预测
311 | ``` Python
312 | upFeatures = ['revol_util', 'revol_bal', 'annual_inc'] # 通过上一步挑选出的特征
313 | totTrain = totTrain[upFeatures]
314 | totTest = trainData.loc[(trainData['total_rev_hi_lim'].isnull())][upFeatures]
315 | totTest['annual_inc'].fillna(-9999, inplace=True)
316 |
317 | from sklearn.preprocessing import StandardScaler
318 | ss = StandardScaler()
319 | ss.fit(totTrain)
320 | train_d = ss.transform(totTrain)
321 | test_d = ss.transform(totTest)
322 |
323 | from sklearn import linear_model
324 | lin = linear_model.BayesianRidge()
325 | lin.fit(train_d, totLabel)
326 | trainData.loc[(trainData['total_rev_hi_lim'].isnull()), 'total_rev_hi_lim'] = lin.predict(test_d)
327 | ```
328 |
329 | #### 用中位数填充
330 | ``` Python
331 | trainData['total_acc'].fillna(trainData['total_acc'].median(), inplace=True)
332 | ```
333 |
334 | #### 用均值填充
335 | ``` Python
336 | trainData['total_acc'].fillna(trainData['total_acc'].mean(), inplace=True)
337 | ```
338 |
339 | ## Imputer() 处理丢失值
340 | 各属性必须是数值
341 | ``` Python
342 | from sklearn.preprocessing import Imputer
343 | # 指定用何值替换丢失的值,此处为中位数
344 | imputer = Imputer(strategy="median")
345 |
346 | # 使实例适应数据
347 | imputer.fit(housing_num)
348 |
349 | # 结果在statistics_ 变量中
350 | imputer.statistics_
351 |
352 | # 替换
353 | X = imputer.transform(housing_num)
354 | housing_tr = pd.DataFrame(X, columns=housing_num.columns,
355 | index = list(housing.index.values))
356 |
357 | # 预览
358 | housing_tr.loc[sample_incomplete_rows.index.values]
359 | ```
360 |
361 | ## 处理文本数据
362 |
363 | ### pandas.factorize() 将输入值编码为枚举类型或分类变量
364 | ``` Python
365 | housing_cat = housing['ocean_proximity']
366 | housing_cat.head(10)
367 | # 输出
368 | # 17606 <1H OCEAN
369 | # 18632 <1H OCEAN
370 | # 14650 NEAR OCEAN
371 | # 3230 INLAND
372 | # 3555 <1H OCEAN
373 | # 19480 INLAND
374 | # 8879 <1H OCEAN
375 | # 13685 INLAND
376 | # 4937 <1H OCEAN
377 | # 4861 <1H OCEAN
378 | # Name: ocean_proximity, dtype: object
379 |
380 | housing_cat_encoded, housing_categories = housing_cat.factorize()
381 | housing_cat_encoded[:10]
382 | # 输出
383 | # array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)
384 | ```
385 | ##### 参数
386 | - values : ndarray (1-d);序列
387 | - sort : boolean, default False;根据值排序
388 | - na_sentinel : int, default -1;给未找到赋的值
389 | - size_hint : hint to the hashtable sizer
390 |
391 | ##### 返回值
392 | - labels : the indexer to the original array
393 | - uniques : ndarray (1-d) or Index;当传递的值是Index或Series时,返回独特的索引。
394 |
395 | ### OneHotEncoder 编码整数特征为one-hot向量
396 | 返回值为稀疏矩阵
397 | ``` Python
398 | from sklearn.preprocessing import OneHotEncoder
399 |
400 | encoder = OneHotEncoder()
401 | housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
402 | housing_cat_1hot
403 | ```
404 | 注意`fit_transform()`期望一个二维数组,所以这里将数据reshape了。
405 |
406 | #### 处理文本特征示例
407 | ``` Python
408 | housing_cat = housing['ocean_proximity']
409 | housing_cat.head(10)
410 | # 17606 <1H OCEAN
411 | # 18632 <1H OCEAN
412 | # 14650 NEAR OCEAN
413 | # 3230 INLAND
414 | # 3555 <1H OCEAN
415 | # 19480 INLAND
416 | # 8879 <1H OCEAN
417 | # 13685 INLAND
418 | # 4937 <1H OCEAN
419 | # 4861 <1H OCEAN
420 | # Name: ocean_proximity, dtype: object
421 |
422 | housing_cat_encoded, housing_categories = housing_cat.factorize()
423 | housing_cat_encoded[:10]
424 | # array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)
425 |
426 | housing_categories
427 | # Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')
428 |
429 | from sklearn.preprocessing import OneHotEncoder
430 |
431 | encoder = OneHotEncoder()
432 | print(housing_cat_encoded.reshape(-1,1))
433 | housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
434 | housing_cat_1hot
435 | # [[0]
436 | # [0]
437 | # [1]
438 | # ...,
439 | # [2]
440 | # [0]
441 | # [3]]
442 | # <16512x5 sparse matrix of type '<class 'numpy.float64'>'
443 | # with 16512 stored elements in Compressed Sparse Row format>
444 | ```
445 |
446 |
447 | ### LabelEncoder 标签编码
448 | `LabelEncoder`是一个可以用来将标签规范化的工具类,它可以将标签的编码值范围限定在[0, n_classes-1]。简单来说就是对不连续的数字或者文本进行编号。
449 | ``` Python
450 | >>> from sklearn import preprocessing
451 | >>> le = preprocessing.LabelEncoder()
452 | >>> le.fit([1, 2, 2, 6])
453 | LabelEncoder()
454 | >>> le.classes_
455 | array([1, 2, 6])
456 | >>> le.transform([1, 1, 2, 6])
457 | array([0, 0, 1, 2])
458 | >>> le.inverse_transform([0, 0, 1, 2])
459 | array([1, 1, 2, 6])
460 | ```
461 | 当然,它也可以用于非数值型标签的编码转换成数值标签(只要它们是可哈希并且可比较的):
462 | ``` Python
463 |
464 | >>> le = preprocessing.LabelEncoder()
465 | >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
466 | LabelEncoder()
467 | >>> list(le.classes_)
468 | ['amsterdam', 'paris', 'tokyo']
469 | >>> le.transform(["tokyo", "tokyo", "paris"])
470 | array([2, 2, 1])
471 | >>> list(le.inverse_transform([2, 2, 1]))
472 | ['tokyo', 'tokyo', 'paris']
473 | ```
474 |
475 | ### LabelBinarizer 标签二值化
476 | LabelBinarizer 是一个用来从多类别列表创建标签矩阵的工具类:
477 | ``` Python
478 | >>> from sklearn import preprocessing
479 | >>> lb = preprocessing.LabelBinarizer()
480 | >>> lb.fit([1, 2, 6, 4, 2])
481 | LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
482 | >>> lb.classes_
483 | array([1, 2, 4, 6])
484 | >>> lb.transform([1, 6])
485 | array([[1, 0, 0, 0],
486 | [0, 0, 0, 1]])
487 | ```
488 | 对于多标签的实例,可以使用`MultiLabelBinarizer`:
489 | ``` Python
490 | >>> lb = preprocessing.MultiLabelBinarizer()
491 | >>> lb.fit_transform([(1, 2), (3,)])
492 | array([[1, 1, 0],
493 | [0, 0, 1]])
494 | >>> lb.classes_
495 | array([1, 2, 3])
496 | ```
--------------------------------------------------------------------------------
/实战篇/1.数据探索/README.md:
--------------------------------------------------------------------------------
1 | 若公式显示不全,请查看[notebook文件](https://github.com/wmpscc/DataMiningNotesAndPractice/blob/master/%E5%AE%9E%E6%88%98%E7%AF%87/1.%E6%95%B0%E6%8D%AE%E6%8E%A2%E7%B4%A2/%E6%95%B0%E6%8D%AE%E6%8E%A2%E7%B4%A2%E4%BB%8B%E7%BB%8D.ipynb)
2 | > 注意:本篇文章为读书笔记,若有侵权请联系我删除。
3 |
4 | 
5 |
6 | ## Python 主要数据探索函数
7 | ### Pandas统计函数
8 | **Pandas主要统计特征函数**
9 |
10 | |方法名|函数功能|
11 | |:-:|:-:|
12 | |sum()|计算数据样本的总和(按列计算)|
13 | |mean()|计算数据样本的算数平均数|
14 | |var()|计算数据样本的方差|
15 | |std()|计算数据样本的标准差|
16 | |corr()|计算数据样本的Spearman(Pearson)相关系数矩阵|
17 | |cov()|计算数据样本的协方差矩阵|
18 | |skew()|样本值的偏度(三阶矩)|
19 | |kurt()|样本值的峰度(四阶矩)|
20 | |describe()|给出样本的基本描述(基本统计量如均值、标注差等)|
21 |
22 |
23 | **Pandas累积统计特征函数**
24 |
25 | |方法名|函数功能|
26 | |:-:|:-:|
27 | |cumsum(`n`)|依次给出前n个数的和|
28 | |cumprod(`n`)|依次给出前n个数的积|
29 | |cummax(`n`)|依次给出前n个数的最大值|
30 | |cummin(`n`)|依次给出前n个数的最小值|
31 |
32 |
33 | **Pandas滚动统计特征函数**
34 |
35 | |方法名|函数功能|
36 | |:-:|:-:|
37 | |rolling_sum()|计算数据样本的总和(按列计算)|
38 | |rolling_mean()|数据样本的算数平均数|
39 | |rolling_var()|计算数据样本的方差|
40 | |rolling_std()|计算数据样本的标准差|
41 | |rolling_corr()|计算数据样本的Spearman(Pearson)相关系数矩阵|
42 | |rolling_cov()|计算数据样本的协方差矩阵|
43 | |rolling_skew()|样本值的偏度(三阶矩)|
44 | |rolling_kurt()|样本的峰度(四阶矩)|
45 |
46 | 调用方法:pd.rolling_mean(D, k),意思是以k为窗口宽度,每次对k个数计算一次均值,滚动进行。
47 |
48 |
49 | ### Pandas绘图函数
50 | Pandas 基于 Matplotlib并对某些命令进行了简化,因此作图通常是 Matplotlib 和 Pandas 相互结合着使用。
51 |
52 | **Pandas主要统计作图函数**
53 |
54 | |函数名|函数功能|
55 | |:-:|:-:|
56 | |plot()|绘制线性二维图,折线图|
57 | |pie()|绘制饼形图|
58 | |hist()|绘制二维条形直方图,可显示数据的分配情况|
59 | |boxplot()|绘制样本数据的箱型图|
60 | |plot(logy = True)|绘制y轴的对数图形|
61 | |plot(yerr = error)|绘制误差条形图|
62 |
63 |
64 | ```python
65 | import os
66 | import numpy as np
67 | import matplotlib.pyplot as plt
68 | import pandas as pd
69 | %matplotlib inline
70 | ```
71 |
72 | # 数据探索
73 | 通过检验数据集的数据质量、绘制图表、计算某些特征量等手段,对样本数据集的结构和规律进行分析的过程就是数据探索。数据探索有助于选择合适的数据预处理和建模方法
74 |
75 | ## 数据质量分析
76 | 数据质量分析的主要任务是检查原始数据中是否存在脏数据,脏数据一般是指不符合要求,以及不能直接进行相应分析的数据。在常见的数据挖掘工作中,脏数据包括如下内容:
77 | - 缺失值
78 | - 异常值
79 | - 不一致的值
80 | - 重复数据及含有特殊符号(如 #、¥、*)的数据
81 |
82 | ### 缺失值分析
83 | 数据的缺失主要包括记录的缺失和记录中某个字段信息的缺失,两者都会造成分析结果的不准确。
84 |
85 | **(1)缺失值产生的原因**
86 | - 有些信息暂时无法获取,或者获取信息的代价太大。
87 | - 有些信息是被遗漏的,这可能是人为的或者某些意外造成的。例如忘记填写或设备故障等。
88 | - 属性值不存在。在某些情况下,对一些对象来说某些属性值是不存在的,如一个未婚者的配偶姓名、一个儿童的固定收入等。
89 |
90 | **(2)缺失值的影响**
91 | - 数据挖掘建模将丢失大量的有用信息。
92 | - 数据挖掘模型所表现出的不确定性更加显著,模型中蕴含的规律更难把握。
93 | - 包含空值的数据会使建模过程陷入混乱,导致不可靠的输出。
94 |
95 | **(3)缺失值分析**
96 | - 使用简单的统计分析,可以得到含有缺失值的属性个数,以及每个属性的未缺失数、缺失数与缺失率等。
97 | - 对于缺失值的处理后面会单独讲。
98 |
99 | ### 异常值分析
100 | 异常值分析是检验数据是否有录入错误以及是否含有不合常理的数据。异常值是指样本中的个别值,其数值明显偏离其余的观测值。异常值也称为离群点。
101 |
102 | **(1)简单统计量分析**
103 | - 可以先对变量做一个描述性统计,例如用最大值和最小值来判断这个变量的取值是否超出了合理的范围。
104 |
105 | **(2)3$\sigma$原则**
106 | - 如果数据服从正态分布,在3$\sigma$原则下,异常值被定义为一组观测值中与平均值的偏差超过3倍标准差的值。在标准正态分布的假设下,距离平均值3$\sigma$之外的值出现的概率为$P(|x-\mu|>3\sigma) \le 0.003$,属于极个别的小概率事件。
107 | - 如果数据不服从正态分布,也可以用远离平均值的多少倍标准差来描述。(下文在箱型图介绍之后给出一个按3$\sigma$筛选的代码示意)
108 |
109 | **(3)箱型图分析**
110 | - 箱型图提供了识别异常值的一个标准:异常值通常被定义为小于$Q_L - 1.5IQR$或大于$Q_U +1.5IQR$的值。$Q_L$称为下四分位数,表示全部观察值中有四分之一的数据取值比它小;$Q_U$称为上四分位数,表示全部观察值中有四分之一的数据取值比它大;$IQR$称为四分位数间距,是上四分位数$Q_U$与下四分位数$Q_L$之差,其间包含了全部观察值的一半。箱线图真实客观地表现数据分布的本来面貌;它判断异常值的标准以四分位数和四分位距为基础,四分位数具有一定的鲁棒性,在识别异常值方面有一定的优越性。
111 |
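按3$\sigma$原则筛选异常值的一个简单示意(沿用上文读入的 data,列名为“销量”):

```python
# 计算均值与标准差,筛出与均值偏差超过3倍标准差的观测
mean, std = data['销量'].mean(), data['销量'].std()
outliers = data[(data['销量'] - mean).abs() > 3 * std]
print(outliers)
```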
112 |
113 | ```python
114 | data = pd.read_excel('data/catering_sale.xls', index_col=u'日期') # 读取餐饮数据,指定“日期”列为索引列。
115 | data.describe() # 查看数据的基本情况
116 | ```
117 |
118 |
119 |
120 |
121 |
122 | |  | 销量 |
123 | |:-:|:-:|
124 | | count | 200.000000 |
125 | | mean | 2755.214700 |
126 | | std | 751.029772 |
127 | | min | 22.000000 |
128 | | 25% | 2451.975000 |
129 | | 50% | 2655.850000 |
130 | | 75% | 3026.125000 |
131 | | max | 9106.440000 |
132 |
166 |
167 |
168 |
169 | ```python
170 | def programmer_1(data):
171 | plt.figure()
172 | # 画箱线图
173 | p = data.boxplot(return_type='dict')
174 | x = p['fliers'][0].get_xdata()
175 | y = p['fliers'][0].get_ydata()
176 | y.sort()
177 | print(y)
178 |
179 | for i in range(len(x)):
180 | # 处理临界情况, i=0时
181 | temp = y[i] - y[i - 1] if i != 0 else -78 / 3
182 | # 添加注释, xy指定标注数据,xytext指定标注的位置(所以需要特殊处理)
183 | plt.annotate(y[i], xy=(x[i], y[i]), xytext=(x[i] + 0.05 - 0.8 / temp, y[i]))
184 | plt.show()
185 |
186 | programmer_1(data)
187 | ```
188 |
189 | [ 22. 51. 60. 865. 4060.3 4065.2 6607.4 9106.44]
190 |
191 |
192 |
193 | 
194 |
195 |
196 | 根据上面的箱型图,结合具体业务可以把865、4060.3、4065.2归为正常值,将22、51、60、6607.4、9106.44归为异常值。最后确定过滤规则为:日销量在400以下或5000以上则属于异常值。
197 |
198 | ### 一致性分析
199 | 数据不一致性是指数据的矛盾性、不相容性。不一致数据主要产生在数据集成的过程中,这可能是由于被挖掘数据来自不同的数据源、对重复存放的数据未能进行一致性更新造成的。例如,两张表中都存储了用户的电话号码,但在用户的电话号码发生改变时只更新了一张表中的数据,那么这两张表中就有了不一致的数据。
200 |
201 | ## 数据特征分析
202 | 对数据进行质量分析以后,接下来可通过绘制图表、计算某些特征量等手段进行数据的特征分析。
203 |
204 | ### 分布分析
205 | 分布分析能揭示数据的分布特征和分布类型。
206 | - 对于定量数据,可以绘制频率分布表、绘制频率分布直方图、绘制茎叶图进行直观分析。
207 | - 对于定性分类数据,可用饼图和条形图直观地显示分布情况。
208 |
209 | #### 1.定量数据的分布分析
210 | 对于定量变量而言,选择“组数”和“组宽”是做频率分布分析时最主要的问题,一般按照以下步骤进行。
211 | - 1.求极差
212 | - 2.决定组距与组数
213 | - 3.决定分点
214 | - 4.列出频率分布表
215 | - 5.绘制频率分布直方图
216 |
217 | 遵循的主要原则如下
218 | - 1.各组之间必须是互相排斥的
219 | - 2.各组必须将所有的数据包含在内
220 | - 3.各组的组宽最好相等
221 |
222 | 以“捞起生鱼片”菜品举例:
223 |
224 |
225 | ```python
226 | data.head()
227 | ```
228 |
229 |
230 |
231 |
232 |
233 |
234 | | 日期 | 销量 |
235 | |:-:|:-:|
236 | | 2015-03-01 | 51.0 |
237 | | 2015-02-28 | 2618.2 |
238 | | 2015-02-27 | 2608.4 |
239 | | 2015-02-26 | 2651.9 |
240 | | 2015-02-25 | 3442.1 |
241 |
266 |
267 |
268 |
269 |
270 |
271 | **1.求极差**
272 |
273 | 极差 = 最大值 - 最小值
274 |
275 |
276 | ```python
277 | xse = data['销量']
278 | range_mm = np.max(xse) - np.min(xse)
279 | range_mm
280 | ```
281 |
282 |
283 |
284 |
285 | 9084.4400000000005
286 |
287 |
288 |
289 | **2.分组**
290 |
291 | 这里根据业务数据的含义,可取组距为500.
292 | 组数 = 极差 / 组距
293 |
294 |
295 | ```python
296 | num_split = range_mm / 500
297 | num_split
298 | ```
299 |
300 |
301 |
302 |
303 | 18.168880000000001
304 |
305 |
306 |
307 | **3.决定分点**
308 |
309 | | | 分布区间| |
310 | |---|---|---|
311 | |[0, 500)|[500, 1000)|...|
312 |
313 | **4.列出频率分布表**
314 |
315 | **5.绘制频率分布直方图**
316 |
317 |
318 | ```python
319 | range_list = list(range(0, 5001, 500))
320 | data_cut = pd.cut(xse.values, range_list, right=False) # 分组区间
321 | frequency = data_cut.value_counts() # 区间-个数
322 |
323 | frequency.plot(kind='bar')
324 | range_list = pd.cut(xse, range_list, right=False)
325 | data['区间'] = range_list.values
326 | data.groupby('区间').median()
327 | data.groupby('区间').mean() # 每个区间平均数
328 |
329 | frequency_df = pd.DataFrame(frequency, columns=['频数'])
330 | frequency_df['频率f'] = frequency_df / frequency_df['频数'].sum()
331 | frequency_df['频率%'] = frequency_df['频率f'].map(lambda x: '%.2f%%' % (x * 100))
332 | frequency_df['累计频率f']=frequency_df['频率f'].cumsum()
333 | frequency_df['累计频率%']=frequency_df['累计频率f'].map(lambda x:'%.4f%%'%(x*100))
334 | print(frequency_df)
335 | ```
336 |
337 | 频数 频率f 频率% 累计频率f 累计频率%
338 | [0, 500) 3 0.015152 1.52% 0.015152 1.5152%
339 | [500, 1000) 1 0.005051 0.51% 0.020202 2.0202%
340 | [1000, 1500) 0 0.000000 0.00% 0.020202 2.0202%
341 | [1500, 2000) 1 0.005051 0.51% 0.025253 2.5253%
342 | [2000, 2500) 53 0.267677 26.77% 0.292929 29.2929%
343 | [2500, 3000) 87 0.439394 43.94% 0.732323 73.2323%
344 | [3000, 3500) 44 0.222222 22.22% 0.954545 95.4545%
345 | [3500, 4000) 7 0.035354 3.54% 0.989899 98.9899%
346 | [4000, 4500) 2 0.010101 1.01% 1.000000 100.0000%
347 | [4500, 5000) 0 0.000000 0.00% 1.000000 100.0000%
348 |
349 |
350 |
351 | 
352 |
353 |
354 | #### 2.定性数据的分布分析
355 | 对于定性变量,常常根据变量的分类类型来分组,可以采用饼图和条形图来描述定性变量的分布。
356 | 饼图每一部分的大小与每一类型的频数成正比;条形图的高度代表每一类型的百分比或频数,它的宽度没有意义。
357 |
358 |
359 | ```python
360 | labels = 'Frogs','Hogs','Dogs','Logs'
361 | sizes = [15,30,45,10]
362 | explode = (0,0.1,0,0) # 0.1表示将Hogs那一块凸显出来
363 | plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
364 | shadow=False, startangle=90) #startangle表示饼图的起始角度
365 |
366 | plt.axis('equal') # 这行让长宽比例相等
367 | plt.show()
368 | ```
369 |
370 |
371 | 
372 |
373 |
374 |
375 |
376 |
377 | ```python
378 | fig = plt.subplot(111)
379 | width = 0.5
380 | x_bar=np.arange(4)
381 | fig.bar(x=x_bar, height=sizes, width=width, color='lightblue')
382 | fig.set_xticks(x_bar)
383 | fig.set_xticklabels(labels)
384 | fig.set_ylabel("sales")
385 | fig.set_ylim(0, 50)
386 | fig.set_title("The Sales in 2018")
387 | fig.grid(True)
388 | plt.show()
389 | ```
390 |
391 |
392 | 
393 |
394 |
395 | ### 对比分析
396 | 对比分析是指把两个相互联系的指标进行比较,从数量上展示和说明研究对象规模的大小、水平的高低、速度的快慢,以及各种关系是否协调。特别适用于指标间的横纵向比较、时间序列的比较分析。
397 |
398 | 对比分析主要有以下两种形式:
399 |
400 | **(1)绝对数比较**
401 | 绝对数比较是利用绝对数进行对比,从而寻找差异的一种方法。
402 |
403 | **(2)相对数比较**
404 | 相对数比较是由两个有联系的指标对比计算的,用以反映客观现象之间数量联系程度的指标,其数值表现形式为相对数。
405 | - **结构相对数**:将同一总体内的部分数值与全部数值对比求得比重,用以说明事物的性质、结构或质量。如居民食品支出总额比重、产品合格率等。
406 | - **比例相对数**:将同一总体内不同部分的数值进行对比,表明总体内各部分的比例关系。如人口性别比例、投资与消费比例等。
407 | - **比较相对数**:将同一时期两个性质相同的指标数值进行对比,说明同类现象在不同空间条件下的数量对比关系。如不同地区商品价格对比,不同行业、不同企业间某项指标对比等。
408 | - **强度相对数**:将两个性质不同但有一定联系的总量指标进行对比,用以说明现象的强度、密度和普遍程度。如人均生产总值用“元/人”表示,人口密度用“人/平方公里”表示,也有用百分数或千分数表示的,如人口出生率用‰表示。
409 | - **计划完成度相对数**:是某一时期实际完成数与计划数的对比,用以说明计划完成程度。
410 | - **动态相对数**:将同一现象在不同时期的指标数值进行对比,用以说明发展方向和变化速度。如发展速度、增长速度等。
411 |
412 |
413 |
414 |
415 | ```python
416 | x1 = [32000, 39000, 42000, 30000, 20000, 25000, 31000, 26000, 28000, 30000, 33000, 39000]
417 | x2 = [38000, 42000, 43000, 31000, 25000, 21000, 29000, 30000, 31000, 29000, 26000, 36000]
418 | plt.figure(figsize=(15, 10))
419 | fig = plt.subplot()
420 | fig.plot(np.arange(12), x1, label='2017')
421 | fig.plot(np.arange(12), x2, label='2018')
422 | fig.set_xticks(np.arange(12))
423 | fig.set_xticklabels(['1 month', '2 month', '3 month', '4 month', '5 month', '6 month', '7 month', '8 month', '9 month', '10 month', '11 month', '12 month'])
424 | fig.set_ylim(15000, 50000)
425 | fig.set_ylabel(u"sales (yuan)")
426 | fig.set_title(u"The Sales in 2017 and 2018")
427 | fig.legend(['2017', '2018'], loc=2, ncol=1)
428 | plt.show()
429 | ```
430 |
431 |
432 | 
433 |
434 |
435 | ### 统计量分析
436 | 用统计指标对定量数据进行统计描述,常从集中趋势和离中趋势两个方面进行分析。
437 |
438 | #### 集中趋势度量
439 | - **1.均值**
440 | 均值是所有数据的平均值。
441 | 如果求n个原始观察数据的平均值,计算公式为:
442 | $$\rm mean(x) = \bar x = \frac{\sum x_i}{n}$$
443 | 有时会用到加权平均值
444 | $$\rm mean(x) = \bar x = \frac{\sum \omega_{i}x_i}{\sum \omega_i}$$
445 | 类似地,频率分布表的平均数计算公式:
446 | $$\rm mean(x) =\bar x = \sum \it f_{i}x_i$$
447 | 式中,$x_i$为第i个组段的组中值;$\it f_i$为第i组的频率。
448 |
449 | 均值对极端值很敏感,如果数据中存在极端值或者数据是偏态分布的,那么均值就不能很好地度量数据的集中程度。为了消除少数极端值的影响,可以使用`截断均值`或者`中位数`来度量数据的集中趋势。截断均值是去掉高、低两端极端值之后的平均数。
450 |
451 |
452 | - **2.中位数**
453 | 中位数是将一组数据观察值按从小到大的顺序排列,位于中间的那个数。
454 |
455 | 将某一数据集$x:(x_1,x_2, \dots, x_n)$按从小到大排列:$x_{(1)}, x_{(2)}, \dots, x_{(n)}$。
456 |
457 | 当n为奇数时
458 | $$M = x_{\frac {n+1}{2}}$$
459 | 当n为偶数时
460 | $$M = \frac{1}{2}(x_{\frac{n}{2}} + x_{\frac{n}{2}+1})$$
461 |
462 | - **3.众数**
463 | 众数是指数据集中出现最频繁的值。众数并不经常用来度量定量变量的中心位置,更适用于定性变量。众数不具有唯一性,一般用于离散型变量而非连续型变量。
464 |
465 | #### 离中趋势度量
466 | - **1.极差**
467 | 极差对数据集的极端值非常敏感,并且忽略了位于最大值与最小值之间的数据的分布情况。
468 |
469 | - **2.标准差**
470 | 标准差度量数据偏离均值的程度,计算公式为:
471 | $$s = \sqrt{\frac{\sum (x_i - \bar x)^2}{n}}$$
472 |
473 | - **3.变异系数**
474 | 变异系数度量标准差相对于均值的离中趋势,计算公式为:
475 | $$\rm CV = \frac{s}{\bar x} \times 100\%$$
476 | > 变异系数主要用来比较两个或多个具有不同单位或不同波动幅度的数据集的离中趋势。
477 |
478 | - **4.四分位数间距**
479 | 四分位数间距,是上四分位数$Q_U$与下四分位数$Q_L$之差,其间包含了全部观察值的一半。其值越大,说明数据的变异程度越大;反之,说明变异程度越小。
480 |
481 |
482 | ```python
483 | data.describe()
484 | ```
485 |
486 |
487 |
488 |
489 |
490 | |  | 销量 |
491 | |:-:|:-:|
492 | | count | 200.000000 |
493 | | mean | 2755.214700 |
494 | | std | 751.029772 |
495 | | min | 22.000000 |
496 | | 25% | 2451.975000 |
497 | | 50% | 2655.850000 |
498 | | 75% | 3026.125000 |
499 | | max | 9106.440000 |
500 |
533 |
534 |
535 |
536 | ### 周期性分析
537 | 周期性分析是探索某个变量是否随着时间变化而呈现出某种变化趋势。时间尺度相对较长的周期性趋势有年度周期性趋势、季节性周期趋势,相对较短的有月度周期性趋势、周度周期性趋势,甚至更短的天、小时周期性趋势。
538 |
539 | ### 贡献度分析(帕累托分析)
540 | 贡献度分析又称为帕累托分析,它的原理是帕累托法则,又称为20/80定律。同样的投入放在不同的地方会产生不同的效益。例如,对一个公司来讲,80%的利润常常来自20%最畅销的产品,而对其他80%的产品只产生了20%利润。
541 |
542 | 下面展示了某餐厅,海鲜系列10个菜品A1~A10某个月的盈利额(已按照从大到小顺序排列)
543 |
544 |
545 | ```python
546 | import matplotlib
547 | from matplotlib.font_manager import *
548 | myfont = FontProperties(fname='/home/heolis/SIMSUN.TTC')
549 | matplotlib.rcParams['axes.unicode_minus']=False
550 |
551 |
552 |
553 | dish_profit = 'data/catering_dish_profit.xls' #餐饮菜品盈利数据
554 | data = pd.read_excel(dish_profit, index_col = u'菜品名')
555 | data = data[u'盈利'].copy()
556 | data.sort_values(ascending = False)
557 |
558 | plt.figure()
559 | data.plot(kind='bar')
560 | plt.ylabel(u'盈利(元)',fontproperties=myfont)
561 | p = 1.0*data.cumsum()/data.sum()
562 | p.plot(color = 'r', secondary_y = True, style = '-o',linewidth = 2)
563 | plt.annotate(format(p[6], '.4%'), xy = (6, p[6]), xytext=(6*0.9, p[6]*0.9), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2")) #添加注释,即85%处的标记。这里包括了指定箭头样式。
564 | plt.ylabel(u'盈利(比例)',fontproperties=myfont)
565 | plt.title(u'帕累托图', fontproperties=myfont)
566 | plt.show()
567 | ```
568 |
569 |
570 | 
571 |
572 |
573 | 由上图可知,菜品A1~A7共7个菜品,占菜品种类数的70%,总盈利额占该月盈利额的85.0033%.根据帕累托法则,应该增加菜品A1~A7的成本投入,减少对菜品A8~A10的投入以获得更高的盈利额。
574 |
575 | ### 相关性分析
576 | 分析连续变量之间线性相关程度的强弱。
577 |
578 | #### 1.绘制散点图
579 | 判断两个变量是否具有线性相关关系的最直观的方法是直接绘制散点图。
580 | 
581 |
582 | #### 2.绘制散点图矩阵
583 | 需要同时考虑多个变量间的相关关系时,利用散点图矩阵同时绘制各变量间的散点图。
584 | 
585 |
586 | #### 计算相关系数
587 | 在二元变量的相关分析过程中比较常用的有Pearson相关系数、Spearman秩相关系数、判定系数。
588 |
589 | - **1.Pearson 相关系数**
590 | 一般用于分析两个连续性变量之间的关系,其计算公式如下。
591 | $$r = \frac{\sum ^n_{i = 1}(x_i - \bar x)(y_i - \bar y)}{\sqrt{ \sum ^n_{i = 1}(x_i - \bar x)^2 \sum ^n_{i = 1}(y_i - \bar y)^2}}$$
592 |
593 | 相关系数r的取值范围:$-1 \le r \le 1$
594 | $$\begin{cases}r > 0 &为正相关 \\ r < 0 &为负相关 \\ |r| = 0 &表示不存在线性关系 \\ |r| = 1 &表示完全线性关系 \end{cases}$$
595 | 0<|r|<1 表示存在不同程度线性相关:
596 | $$\begin{cases}|r| \le 0.3 & 为不存在线性相关 \\ 0.3 < |r| \le 0.5 & 为低度线性相关 \\ 0.5 < |r| \le 0.8 &为显著线性相关 \\ |r| > 0.8 &为高度线性相关 \end{cases}$$
597 | Pearson线性相关系数要求连续变量服从正态分布。Pearson相关只有在变量具有线性关系时才完全相关。
598 |
599 | - **2.Spearman秩相关系数**
600 | 不服从正态分布的变量、分类或等级变量之间的关联性可采用Spearman秩相关系数,也称等级相关系数来描述。
601 | 其计算公式如下:
602 | $$r_s = 1 - \frac{6\sum^n_{i=1}(R_i - Q_i)^2}{n(n^2 - 1)}$$
603 | > 对两个变量成对的取值分别按照从小到大(或者从大到小)顺序编秩,$R_i$代表$x_i$的秩次,$Q_i$代表$y_i$的秩次,$R_i-Q_i$为$x_i$、$y_i$的秩次之差。
604 | 由于一个变量的相同的取值必须有相同的秩次,所以在计算中采用的秩次是排序后所在位置的平均值。
605 | 只要两个变量具有严格单调的函数关系,那么它们就是完全Spearman相关的。研究表明,在正态分布假定下,Spearman秩相关系数与Pearson相关系数在效率上是等价的,而对于连续测量数据,更适合用Pearson相关系数来进行分析。
606 |
607 | - **3.判定系数**
608 | 判定系数是相关系数的平方,用$r^2$表示;用来衡量回归方程对y的解释程度。判定系数取值范围:$0 \le r^2 \le 1$。$r^2$越接近于1,表明x与y之间的相关性越强;$r^2$越接近于0,表明两个变量之间几乎没有直线相关关系。
609 |
610 |
611 | ```python
612 | catering_sale = 'data/catering_sale_all.xls' #餐饮数据,含有其他属性
613 | data = pd.read_excel(catering_sale, index_col = u'日期') #读取数据,指定“日期”列为索引列
614 |
615 | data.corr() #相关系数矩阵,即给出了任意两款菜式之间的相关系数
616 | data.corr()[u'百合酱蒸凤爪'] #只显示“百合酱蒸凤爪”与其他菜式的相关系数
617 | ```
618 |
619 |
620 |
621 |
622 | 百合酱蒸凤爪 1.000000
623 | 翡翠蒸香茜饺 0.009206
624 | 金银蒜汁蒸排骨 0.016799
625 | 乐膳真味鸡 0.455638
626 | 蜜汁焗餐包 0.098085
627 | 生炒菜心 0.308496
628 | 铁板酸菜豆腐 0.204898
629 | 香煎韭菜饺 0.127448
630 | 香煎罗卜糕 -0.090276
631 | 原汁原味菜心 0.428316
632 | Name: 百合酱蒸凤爪, dtype: float64
633 |
634 |
635 |
636 |
637 | ```python
638 | data[u'百合酱蒸凤爪'].corr(data[u'翡翠蒸香茜饺']) #计算“百合酱蒸凤爪”与“翡翠蒸香茜饺”的相关系数
639 | ```
640 |
641 |
642 |
643 |
644 | 0.0092058030518365284
645 |
646 |
647 |
648 |
649 |
--------------------------------------------------------------------------------
/7.机器学习部分.md:
--------------------------------------------------------------------------------
1 | # 调参
2 | ### GridSearchCV() 对估算器指定参数值进行详尽搜索
3 | ``` Python
4 | from sklearn.model_selection import GridSearchCV
5 |
6 | param_grid = [
7 | # try 12 (3×4) combinations of hyperparameters
8 | {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
9 | # then try 6 (2×3) combinations with bootstrap set as False
10 | {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
11 | ]
12 |
13 | forest_reg = RandomForestRegressor(random_state=42)
14 | # train across 5 folds, that's a total of (12+6)*5=90 rounds of training
15 | grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
16 | scoring='neg_mean_squared_error')
17 | grid_search.fit(housing_prepared, housing_labels)
18 |
19 | grid_search.best_params_
20 | # Out:{'max_features': 8, 'n_estimators': 30}
21 |
22 | grid_search.best_estimator_
23 | # Out:
24 | # RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
25 | # max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
26 | # min_impurity_split=None, min_samples_leaf=1,
27 | # min_samples_split=2, min_weight_fraction_leaf=0.0,
28 | # n_estimators=30, n_jobs=1, oob_score=False, random_state=42,
29 | # verbose=0, warm_start=False)
30 | ```
31 | - estimator : estimator object.每个估算器需要提供一个`score`函数或填写`scoring`参数。
32 | - param_grid : dict or list of dictionaries,键作为参数名称,list作为参数的字典。或存有这样的字典的列表。
33 | - scoring : string, callable, list/tuple, dict or None, default: None,
34 | - cv : int, cross-validation generator or an iterable, optional,如果是整数,则代表KFold
35 | - refit : boolean, or string, default=True,应用已找到的最好的参数到整个数据集上。
36 |
37 | |Methods | description|
38 | |------|-------|
39 | |decision_function(X) | Call decision_function on the estimator with the best found parameters.|
40 | |fit(X[, y, groups]) | Run fit with all sets of parameters.|
41 | |get_params([deep]) | Get parameters for this estimator.|
42 | |inverse_transform(Xt) | Call inverse_transform on the estimator with the best found params.|
43 | |predict(X) | Call predict on the estimator with the best found parameters.|
44 | |predict_log_proba(X) | Call predict_log_proba on the estimator with the best found parameters.|
45 | |predict_proba(X) | Call predict_proba on the estimator with the best found parameters.|
46 | |score(X[, y]) | Returns the score on the given data, if the estimator has been refit.|
47 | |set_params(**params) | Set the parameters of this estimator.|
48 | |transform(X) |Call transform on the estimator with the best found parameters.|
49 |
50 |
51 | ### RandomizedSearchCV()
52 | ``` Python
53 | from sklearn.model_selection import RandomizedSearchCV
54 | from scipy.stats import randint
55 |
56 | param_distribs = {
57 | 'n_estimators': randint(low=1, high=200),
58 | 'max_features': randint(low=1, high=8),
59 | }
60 |
61 | forest_reg = RandomForestRegressor(random_state=42)
62 | rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
63 | n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
64 | rnd_search.fit(housing_prepared, housing_labels)
65 | ```
66 | - estimator : estimator object.指定估算器对象。
67 | - param_distributions : dict,给定以参数名为键,list为参数的字典。或提供一个分布,分布必须提供一个`rvs`方法进行采样,例如来自scipy.stats.distributions的方法。
68 | - n_iter : int, default=10,采样参数设置数量。
69 | - scoring : string, callable, list/tuple, dict or None, default: None
70 | - cv : int, cross-validation generator or an iterable, optional
71 | - refit : boolean, or string default=True
72 | - random_state : int, RandomState instance or None, optional, default=None
73 |
74 | # 模型
75 | ## 分类
76 | ### RandomForestClassifier
77 | ``` Python
78 | from sklearn.ensemble import RandomForestClassifier
79 | from sklearn.model_selection import cross_val_score
80 | rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=50, n_jobs=-1)
81 | scores = cross_val_score(rnd_clf, trainData, trainLabel,
82 | scoring="recall", cv=10)
83 | scores.mean()
84 | ```
85 | ### ExtraTreesClassifier
86 | ``` Python
87 | from sklearn.ensemble import ExtraTreesClassifier
88 | from sklearn.model_selection import cross_val_score
89 | etr_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=50, n_jobs=-1)
90 | etr_score = cross_val_score(etr_clf, trainData, trainLabel, scoring='recall', cv=10)
91 | etr_score.mean()
92 | ```
93 | ### GradientBoostingClassifier
94 | ``` Python
95 | from sklearn.ensemble import GradientBoostingClassifier
96 | gbdt = GradientBoostingClassifier(learning_rate=0.1,min_samples_leaf=2,max_depth=6,n_estimators=100)
97 | ```
98 |
99 | ### XGBClassifier
100 | ``` Python
101 | from xgboost.sklearn import XGBClassifier
102 | from sklearn import metrics  # cross_validation 模块已并入 model_selection,此处只用到 metrics
103 | xgb1 = XGBClassifier(learning_rate =0.05,
104 | n_estimators=1000,
105 | max_depth=3,
106 | min_child_weight=1,
107 | gamma=0.1,
108 | subsample=0.8,
109 | colsample_bytree=0.8,
110 | objective= 'binary:logistic',
111 | nthread=4,
112 | reg_alpha=0.001,
113 | reg_lambda=0.001,
114 | scale_pos_weight=1,
115 | seed=27)
116 | xgb1.fit(subTrain, subLabel)
117 | xgb1_pred = xgb1.predict(trainData)
118 | xgb1_pred_prob = xgb1.predict_proba(trainData)[:, 1]
119 | print(metrics.accuracy_score(trainLabel, xgb1_pred))
120 | print(metrics.roc_auc_score(trainLabel, xgb1_pred_prob))
121 | xgb1_f2 = calc_f2(trainLabel, xgb1_pred)
122 | print(xgb1_f2)
123 | ```
124 |
125 | ### Lightgbm
126 | ``` Python
127 | import lightgbm as lgb
128 | lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
129 | boosting_type='gbdt',
130 | objective='binary',
131 | n_estimators=1000,
132 | metric='auc',
133 | max_depth=3,
134 | num_leaves=5,
135 | subsample=0.7,
136 | colsample_bytree=0.7,
137 | min_data_in_leaf=450,
138 | feature_fraction=0.7,
139 | bagging_fraction=0.7,
140 | bagging_freq=6,
141 | lambda_l1=1,
142 | lambda_l2=0.001,
143 | min_gain_to_split=0.265,
144 | verbose=5,
145 | is_unbalance=True)
146 | lgb_clf.fit(subTrain, subLabel)
147 | lgb_clf_pred = lgb_clf.predict(trainData)
148 | ```
149 | ## 模型融合
150 | ### BaggingClassifier
151 | ``` Python
152 | from sklearn.ensemble import BaggingClassifier
153 | bag_rnd = BaggingClassifier(rnd_clf, n_estimators=10, max_samples=1000, bootstrap=True, n_jobs=-1)
154 | bag_rnd.fit(subTrain, subLabel)
155 | rnd_pred = bag_rnd.predict(trainData)
156 | ```
157 |
158 | ### VotingClassifier
159 | ``` Python
160 | from sklearn.ensemble import VotingClassifier
161 | from xgboost.sklearn import XGBClassifier
162 | xgb = XGBClassifier(learning_rate =0.1,
163 | n_estimators=1000,
164 | max_depth=3,
165 | min_child_weight=1,
166 | gamma=0.1,
167 | subsample=0.8,
168 | colsample_bytree=0.8,
169 | objective= 'binary:logistic',
170 | nthread=4,
171 | reg_alpha=0.001,
172 | reg_lambda=0.001,
173 | scale_pos_weight=1)
174 |
175 | import lightgbm as lgb
176 | lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
177 | boosting_type='gbdt',
178 | objective='binary',
179 | n_estimators=1000,
180 | metric='auc',
181 | max_depth=3,
182 | num_leaves=5,
183 | subsample=0.7,
184 | colsample_bytree=0.7,
185 | min_data_in_leaf=450,
186 | feature_fraction=0.7,
187 | bagging_fraction=0.7,
188 | bagging_freq=6,
189 | lambda_l1=1,
190 | lambda_l2=0.001,
191 | min_gain_to_split=0.265,
192 | verbose=5,
193 | is_unbalance=True)
194 |
195 | from sklearn.ensemble import GradientBoostingClassifier
196 | gbdt = GradientBoostingClassifier(learning_rate=0.05, min_samples_split=320, min_samples_leaf=7, max_depth=7,
197 | max_features='sqrt', subsample=0.7, random_state=10)
198 |
199 | vot = VotingClassifier(estimators=[('xgb', xgb), ('lgb', lgb_clf)], voting='soft')
200 | vot.fit(trainData, trainLabel)
201 | vot_pred = vot.predict(testData)
202 | vot_pred = pd.DataFrame(vot_pred)
203 | submData['acc_now_delinq'] = vot_pred
204 | submData.to_csv(os.path.join(homePath, 'submit.csv'), index=False)
205 | ```
206 |
207 | ### StackingClassifier
208 | ``` Python
209 | from sklearn.ensemble import RandomForestClassifier
210 | rnd_clf = RandomForestClassifier(n_estimators=110, min_samples_split=90, min_samples_leaf=15,max_depth=10,oob_score=True,max_features=10)
211 |
212 | from xgboost.sklearn import XGBClassifier
213 | xgb = XGBClassifier(learning_rate =0.05,
214 | n_estimators=1000,
215 | max_depth=3,
216 | min_child_weight=1,
217 | gamma=0.1,
218 | subsample=0.8,
219 | colsample_bytree=0.8,
220 | objective= 'binary:logistic',
221 | nthread=4,
222 | reg_alpha=0.001,
223 | reg_lambda=0.001,
224 | scale_pos_weight=1)
225 |
226 | import lightgbm as lgb
227 | lgb_clf = lgb.LGBMClassifier(learning_rate=0.1,
228 | boosting_type='gbdt',
229 | objective='binary',
230 | n_estimators=1000,
231 | metric='auc',
232 | max_depth=3,
233 | num_leaves=5,
234 | subsample=0.7,
235 | colsample_bytree=0.7,
236 | min_data_in_leaf=450,
237 | feature_fraction=0.7,
238 | bagging_fraction=0.7,
239 | bagging_freq=6,
240 | lambda_l1=1,
241 | lambda_l2=0.001,
242 | min_gain_to_split=0.265,
243 | verbose=5,
244 | is_unbalance=True)
245 |
246 | from sklearn.ensemble import GradientBoostingClassifier
247 | gbdt = GradientBoostingClassifier(learning_rate=0.05, min_samples_split=320, min_samples_leaf=7, max_depth=7,
248 | max_features='sqrt', subsample=0.7)
249 |
250 |
251 | from mlxtend.classifier import StackingClassifier
252 | stack_clf = StackingClassifier(classifiers=[gbdt, lgb_clf, rnd_clf, xgb], meta_classifier=xgb)
253 |
254 | stack_clf.fit(trainData, trainLabel)
255 | stack_pred = stack_clf.predict(testData)
256 | ```
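Note that mlxtend's `StackingClassifier` trains the meta-classifier on the base models' predictions over the whole training set, which can leak label information. Its sibling `StackingCVClassifier` uses out-of-fold predictions instead; a hedged sketch reusing the same models:
``` Python
from mlxtend.classifier import StackingCVClassifier
stack_cv_clf = StackingCVClassifier(classifiers=[gbdt, lgb_clf, rnd_clf, xgb],
                                    meta_classifier=xgb,
                                    use_probas=True, cv=5)
stack_cv_clf.fit(trainData, trainLabel)
stack_cv_pred = stack_cv_clf.predict(testData)
```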
257 |
258 | # 数据处理
259 | ## 特征放缩
260 | ### MinMax scaling归一化
261 | This method is more easily distorted by outliers.
262 | `X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))`
263 | `X_scaled = X_std * (max - min) + min`
264 | ``` Python
265 | >>> from sklearn.preprocessing import MinMaxScaler
266 | >>>
267 | >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
268 | >>> scaler = MinMaxScaler()
269 | >>> print(scaler.fit(data))
270 | MinMaxScaler(copy=True, feature_range=(0, 1))
271 | >>> print(scaler.data_max_)
272 | [ 1. 18.]
273 | >>> print(scaler.transform(data))
274 | [[ 0. 0. ]
275 | [ 0.25 0.25]
276 | [ 0.5 0.5 ]
277 | [ 1. 1. ]]
278 | >>> print(scaler.transform([[2, 2]]))
279 | [[ 1.5 0. ]]
280 | ```
281 | - feature_range : tuple (min, max), default=(0, 1),归一化后值的范围
282 | - copy : boolean, optional, default True,是否复制数据在新的数据上归一化
283 |
284 | ### 零均值标准化
285 | ``` Python
286 | >>> from sklearn.preprocessing import StandardScaler
287 | >>>
288 | >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
289 | >>> scaler = StandardScaler()
290 | >>> print(scaler.fit(data))
291 | StandardScaler(copy=True, with_mean=True, with_std=True)
292 | >>> print(scaler.mean_)
293 | [ 0.5 0.5]
294 | >>> print(scaler.transform(data))
295 | [[-1. -1.]
296 | [-1. -1.]
297 | [ 1. 1.]
298 | [ 1. 1.]]
299 | >>> print(scaler.transform([[2, 2]]))
300 | [[ 3. 3.]]
301 | ```
302 | - copy : boolean, optional, default True,是否复制数据在新的数据上执行
303 | - with_mean : boolean, True by default,若为True则在缩放前将数据居中。但在稀疏矩阵上是行不通的。
304 | - with_std : boolean, True by default,若为True,则将数据放缩到单位方差或等效于单位标准差
305 |
306 |
307 | ## 处理空数据
308 | ### Imputer() 处理丢失值
309 | 各属性必须是数值
310 | ``` Python
311 | from sklearn.preprocessing import Imputer
312 | # 指定用何值替换丢失的值,此处为中位数
313 | imputer = Imputer(strategy="median")
314 |
315 | # 使实例适应数据
316 | imputer.fit(housing_num)
317 |
318 | # 结果在statistics_ 变量中
319 | imputer.statistics_
320 |
321 | # 替换
322 | X = imputer.transform(housing_num)
323 | housing_tr = pd.DataFrame(X, columns=housing_num.columns,
324 | index = list(housing.index.values))
325 |
326 | # 预览
327 | housing_tr.loc[sample_incomplete_rows.index.values]
328 | ```
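`Imputer` was removed in scikit-learn 0.22; in current versions the equivalent class is `SimpleImputer`. A rough equivalent of the snippet above:
``` Python
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")   # median of each numeric column
X = imputer.fit_transform(housing_num)       # the fitted medians are in imputer.statistics_
```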
329 |
330 | ## 处理文本数据
331 |
332 | ### pandas.factorize() 将输入值编码为枚举类型或分类变量
333 | ``` Python
334 | housing_cat = housing['ocean_proximity']
335 | housing_cat.head(10)
336 | # 输出
337 | # 17606 <1H OCEAN
338 | # 18632 <1H OCEAN
339 | # 14650 NEAR OCEAN
340 | # 3230 INLAND
341 | # 3555 <1H OCEAN
342 | # 19480 INLAND
343 | # 8879 <1H OCEAN
344 | # 13685 INLAND
345 | # 4937 <1H OCEAN
346 | # 4861 <1H OCEAN
347 | # Name: ocean_proximity, dtype: object
348 |
349 | housing_cat_encoded, housing_categories = housing_cat.factorize()
350 | housing_cat_encoded[:10]
351 | # 输出
352 | # array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)
353 | ```
354 | ##### 参数
355 | - values : ndarray (1-d);序列
356 | - sort : boolean, default False;根据值排序
357 | - na_sentinel : int, default -1;给未找到赋的值
358 | - size_hint : hint to the hashtable sizer
359 |
360 | ##### 返回值
361 | - labels : the indexer to the original array
362 | - uniques : ndarray (1-d) or Index; the unique values. When an Index or Series is passed in, the unique values are returned as an Index.
363 |
364 | ### OneHotEncoder 编码整数特征为one-hot向量
365 | 返回值为稀疏矩阵
366 | ``` Python
367 | from sklearn.preprocessing import OneHotEncoder
368 |
369 | encoder = OneHotEncoder()
370 | housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
371 | housing_cat_1hot
372 | ```
373 | 注意`fit_transform()`期望一个二维数组,所以这里将数据reshape了。
374 |
375 | #### 处理文本特征示例
376 | ``` Python
377 | housing_cat = housing['ocean_proximity']
378 | housing_cat.head(10)
379 | # 17606 <1H OCEAN
380 | # 18632 <1H OCEAN
381 | # 14650 NEAR OCEAN
382 | # 3230 INLAND
383 | # 3555 <1H OCEAN
384 | # 19480 INLAND
385 | # 8879 <1H OCEAN
386 | # 13685 INLAND
387 | # 4937 <1H OCEAN
388 | # 4861 <1H OCEAN
389 | # Name: ocean_proximity, dtype: object
390 |
391 | housing_cat_encoded, housing_categories = housing_cat.factorize()
392 | housing_cat_encoded[:10]
393 | # array([0, 0, 1, 2, 0, 2, 0, 2, 0, 0], dtype=int64)
394 |
395 | housing_categories
396 | # Index(['<1H OCEAN', 'NEAR OCEAN', 'INLAND', 'NEAR BAY', 'ISLAND'], dtype='object')
397 |
398 | from sklearn.preprocessing import OneHotEncoder
399 |
400 | encoder = OneHotEncoder()
401 | print(housing_cat_encoded.reshape(-1,1))
402 | housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
403 | housing_cat_1hot
404 | # [[0]
405 | # [0]
406 | # [1]
407 | # ...,
408 | # [2]
409 | # [0]
410 | # [3]]
411 | # <16512x5 sparse matrix of type '<class 'numpy.float64'>'
412 | # with 16512 stored elements in Compressed Sparse Row format>
413 | ```
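Since scikit-learn 0.20, `OneHotEncoder` also accepts string categories directly, so the `factorize()` step is optional; a sketch using the same hypothetical `housing` DataFrame:
``` Python
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()                       # returns a sparse matrix by default
housing_cat_1hot = encoder.fit_transform(housing[['ocean_proximity']])
encoder.categories_                             # the learned category lists
```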
414 |
415 |
416 | ### LabelEncoder 标签编码
417 | `LabelEncoder` is a utility class for normalising labels: it maps them to integer codes in the range [0, n_classes-1]. In short, it assigns consecutive numbers to non-consecutive integers or to text values.
418 | ``` Python
419 | >>> from sklearn import preprocessing
420 | >>> le = preprocessing.LabelEncoder()
421 | >>> le.fit([1, 2, 2, 6])
422 | LabelEncoder()
423 | >>> le.classes_
424 | array([1, 2, 6])
425 | >>> le.transform([1, 1, 2, 6])
426 | array([0, 0, 1, 2])
427 | >>> le.inverse_transform([0, 0, 1, 2])
428 | array([1, 1, 2, 6])
429 | ```
430 | 当然,它也可以用于非数值型标签的编码转换成数值标签(只要它们是可哈希并且可比较的):
431 | ``` Python
432 |
433 | >>> le = preprocessing.LabelEncoder()
434 | >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
435 | LabelEncoder()
436 | >>> list(le.classes_)
437 | ['amsterdam', 'paris', 'tokyo']
438 | >>> le.transform(["tokyo", "tokyo", "paris"])
439 | array([2, 2, 1])
440 | >>> list(le.inverse_transform([2, 2, 1]))
441 | ['tokyo', 'tokyo', 'paris']
442 | ```
443 |
444 | ### LabelBinarizer 标签二值化
445 | LabelBinarizer 是一个用来从多类别列表创建标签矩阵的工具类:
446 | ``` Python
447 | >>> from sklearn import preprocessing
448 | >>> lb = preprocessing.LabelBinarizer()
449 | >>> lb.fit([1, 2, 6, 4, 2])
450 | LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
451 | >>> lb.classes_
452 | array([1, 2, 4, 6])
453 | >>> lb.transform([1, 6])
454 | array([[1, 0, 0, 0],
455 | [0, 0, 0, 1]])
456 | ```
457 | For multi-label cases, use `MultiLabelBinarizer`:
458 | ``` Python
459 | >>> lb = preprocessing.MultiLabelBinarizer()
460 | >>> lb.fit_transform([(1, 2), (3,)])
461 | array([[1, 1, 0],
462 | [0, 0, 1]])
463 | >>> lb.classes_
464 | array([1, 2, 3])
465 | ```
466 |
--------------------------------------------------------------------------------
/3.Matplotlib学习笔记.md:
--------------------------------------------------------------------------------
1 | > 整理翻译自[该项目](https://github.com/ageron/handson-ml/blob/master/tools_matplotlib.ipynb)
2 | ## Matplotlib
3 | 可能还有小伙伴不知道`Matplotlib`是什么,下面是维基百科的介绍。
4 | > Matplotlib 是Python编程语言的一个绘图库及其数值数学扩展 NumPy。它为利用通用的图形用户界面工具包,如Tkinter, wxPython, Qt或GTK+向应用程序嵌入式绘图提供了面向对象的应用程序接口。
5 |
6 | 简单说就是画图的工具包。本文将教会你如何使用`Matplotlib`绘图,如果你没有`Python`基础也没关系,依葫芦画瓢也完全OK的。关于如何安装Python以及Matplotlib,文末有链接。
7 | ## 绘制第一个图
8 |
9 | - 如果给`plot`函数一个一维数组,则将该数组作为纵轴坐标,并且将数组中的每个数据点索引作为水平坐标
10 | ``` python
11 | import matplotlib.pyplot as plt
12 | plt.plot([1, 2, 4, 9, 5, 3])
13 | plt.show()
14 | ```
15 | 
16 |
17 | - 如果提供两个数组,则将其分别作为x轴和y轴
18 | ``` Python
19 | plt.plot([-3, -2, 5, 0], [1, 6, 4, 3])
20 | plt.show()
21 | ```
22 | 
23 |
24 | - 坐标轴会自动匹配数据的范围,不过我们可以调用`axis`函数来改变每个轴的范围`[xmin, xmax, ymin, ymax]`
25 | ``` Python
26 | plt.plot([-3, -2, 5, 0], [1, 6, 4, 3])
27 | plt.axis([-4, 6, 0, 7])
28 | plt.show()
29 | ```
30 | 
31 |
32 |
33 | - Use NumPy's `linspace` function to create an array `x` of 500 floats in the range [-2, 2], then compute the square of `x` as the array `y`
34 | ``` Python
35 | import numpy as np
36 | x = np.linspace(-2, 2, 500)
37 | y = x**2
38 |
39 | plt.plot(x, y)
40 | plt.show()
41 | ```
42 | 
43 |
44 | - 添加标题,x、y轴标签,并绘制网格
45 | ``` Python
46 | plt.plot(x, y)
47 | plt.title("Square function")
48 | plt.xlabel("x")
49 | plt.ylabel("y = x**2")
50 | plt.grid(True)
51 | plt.show()
52 | ```
53 | 
54 |
55 | ## 线条样式和颜色
56 | - 默认情况下,matplotlib在连续的点之间绘制一条线。
57 | ``` Python
58 | plt.plot([0, 100, 100, 0, 0, 100, 50, 0, 100], [0, 0, 100, 100, 0, 100, 130, 100, 0])
59 | plt.axis([-10, 110, -10, 140])
60 | plt.show()
61 | ```
62 | 
63 |
64 | - 可在第三个参数更改线条的样式和颜色,比如“g--”表示“绿色虚线”
65 | ``` Python
66 | plt.plot([0, 100, 100, 0, 0, 100, 50, 0, 100], [0, 0, 100, 100, 0, 100, 130, 100, 0], "g--")
67 | plt.axis([-10, 110, -10, 140])
68 | plt.show()
69 | ```
70 | 
71 |
72 | - You can draw several lines on the same figure simply by repeating `x1, y1, [style1], x2, y2, [style2], ...`
73 | ``` python
74 | plt.plot([0, 100, 100, 0, 0], [0, 0, 100, 100, 0], "r-", [0, 100, 50, 0, 100], [0, 100, 130, 100, 0], "g--")
75 | plt.axis([-10, 110, -10, 140])
76 | plt.show()
77 | ```
78 | 
79 |
80 |
81 | - 也可以在`show`之前`plot`多次
82 | ``` python
83 | plt.plot([0, 100, 100, 0, 0], [0, 0, 100, 100, 0], "r-")
84 | plt.plot([0, 100, 50, 0, 100], [0, 100, 130, 100, 0], "g--")
85 | plt.axis([-10, 110, -10, 140])
86 | plt.show()
87 | ```
88 | 
89 |
90 | - 你也可以绘制点而不只是绘制直线
91 | ``` python
92 | x = np.linspace(-1.4, 1.4, 30)
93 | plt.plot(x, x, 'g--', x, x**2, 'r:', x, x**3, 'b^')
94 | plt.show()
95 | ```
96 | 
97 | #### 接受以下格式字符来控制线条样式或标记
98 | character | description
99 | -------|-------
100 | '-' | solid line style
101 | '--' | dashed line style
102 | '-.' | dash-dot line style
103 | ':' | dotted line style
104 | '.' | point marker
105 | ',' | pixel marker
106 | 'o' | circle marker
107 | 'v' | triangle_down marker
108 | '^' | triangle_up marker
109 | '<' | triangle_left marker
110 | '>' | triangle_right marker
111 | '1' | tri_down marker
112 | '2' | tri_up marker
113 | '3' | tri_left marker
114 | '4' | tri_right marker
115 | 's' | square marker
116 | 'p' | pentagon marker
117 | '*' | star marker
118 | 'h' | hexagon1 marker
119 | 'H' | hexagon2 marker
120 | '+' | plus marker
121 | 'x' | x marker
122 | 'D' | diamond marker
123 | 'd' | thin_diamond marker
124 | '|' | vline marker
125 | '_' | hline marker
126 |
127 | #### 支持以下颜色缩写
128 | |character | color|
129 | |------|-------|
130 | |‘b’ | blue|
131 | |‘g’ | green|
132 | |‘r’ | red|
133 | |‘c’ | cyan|
134 | |‘m’ | magenta|
135 | |‘y’ | yellow|
136 | |‘k’ | black|
137 | |‘w’ | white|
138 |
139 |
140 | - `plot`函数会返回一个`Line2D`对象列表,你可以额外设置一些属性,例如线的宽度,虚线风格等等。
141 | ``` Python
142 | x = np.linspace(-1.4, 1.4, 30)
143 | line1, line2, line3 = plt.plot(x, x, 'g--', x, x**2, 'r:', x, x**3, 'b^')
144 | line1.set_linewidth(3.0)
145 | line1.set_dash_capstyle("round")
146 | line3.set_alpha(0.2)
147 | plt.show()
148 | ```
149 | 
150 |
151 | #### `Line2D`属性
152 | |Property | Value Type|
153 | |-------|-----|
154 | |alpha | float|
155 | |animated | [True / False]|
156 | |antialiased or aa | [True / False]|
157 | |clip_box | a matplotlib.transform.Bbox instance|
158 | |clip_on | [True | False]|
159 | |clip_path | a Path instance and a Transform instance, a Patch|
160 | |color or c | any matplotlib color|
161 | |contains | the hit testing function|
162 | |dash_capstyle | ['butt' / 'round' / 'projecting']|
163 | |dash_joinstyle | ['miter' / 'round' / 'bevel']|
164 | |dashes | sequence of on/off ink in points|
165 | |data | (np.array xdata, np.array ydata)|
166 | |figure | a matplotlib.figure.Figure instance|
167 | |label | any string|
168 | |linestyle or ls | [ '-' / '--' / '-.' / ':' / 'steps' / ...]|
169 | |linewidth or lw | float value in points|
170 | |lod | [True / False]|
171 | |marker | [ '+' / ',' / '.' / '1' / '2' / '3' / '4' ]|
172 | |markeredgecolor or mec | any matplotlib color|
173 | |markeredgewidth or mew | float value in points|
174 | |markerfacecolor or mfc | any matplotlib color|
175 | |markersize or ms | float|
176 | |markevery | [ None / integer / (startind, stride) ]|
177 | |picker | used in interactive line selection|
178 | |pickradius | the line pick selection radius|
179 | |solid_capstyle | ['butt' / 'round' / 'projecting']|
180 | |solid_joinstyle | ['miter' / 'round' / 'bevel']|
181 | |transform | a matplotlib.transforms.Transform instance|
182 | |visible | [True / False]|
183 | |xdata | np.array|
184 | |ydata | np.array|
185 | |zorder | any number|
186 |
187 | ## 保存图像
188 |
189 | - 可使用savefig函数保存图像
190 | ``` python
191 | savefig(fname, dpi=None, facecolor='w', edgecolor='w',
192 | orientation='portrait', papertype=None, format=None,
193 | transparent=False, bbox_inches=None, pad_inches=0.1,
194 | frameon=None)
195 | ```
196 | 参数
197 | - fname: 包含文件名的路径字符串
198 | - dpi: [ None | scalar > 0 | ‘figure’]
199 | - format: 文件扩展名,大多数后端支持` png, pdf, ps, eps and svg`
200 | - transparent: 如果为True则轴部分的背景透明。
201 |
202 | 示例
203 | ``` Python
204 | x = np.linspace(-1.4, 1.4, 30)
205 | plt.plot(x, x**2)
206 | plt.savefig("my_square_function.png", transparent=True)
207 | ```
208 | 
209 |
210 | ## 组合图
211 | - A figure can contain several subplots. To create them, just call the subplot function, specifying the number of rows and columns of the grid and the index of the subplot to draw (starting at 1, going left to right and then top to bottom). Note that pyplot keeps track of the currently active subplot (call `plt.gca()` to get a reference to it and set extra properties on it), so any plotting function you call draws into the active subplot.
212 | - 注意:`subplot(224)`是`subplot(2, 2, 4)`的缩写。
213 | ``` python
214 | x = np.linspace(-1.4, 1.4, 30)
215 | plt.subplot(2, 2, 1) # 2 rows, 2 columns, 1st subplot = top left
216 | plt.plot(x, x)
217 | plt.subplot(2, 2, 2) # 2 rows, 2 columns, 2nd subplot = top right
218 | plt.plot(x, x**2)
219 | plt.subplot(2, 2, 3) # 2 rows, 2 columns, 3rd subplot = bottom left
220 | plt.plot(x, x**3)
221 | plt.subplot(224) # 2 rows, 2 columns, 4th subplot = bottom right
222 | plt.plot(x, x**4)
223 | plt.show()
224 | ```
225 | 
226 |
227 | - 创建跨多个网格单元的子图也很容易
228 | ``` Python
229 | plt.subplot(2, 2, 1) # 2 rows, 2 columns, 1st subplot = top left
230 | plt.plot(x, x)
231 | plt.subplot(2, 2, 2) # 2 rows, 2 columns, 2nd subplot = top right
232 | plt.plot(x, x**2)
233 | plt.subplot(2, 1, 2) # 2 rows, *1* column, 2nd subplot = bottom
234 | plt.plot(x, x**3)
235 | plt.show()
236 | ```
237 | 
238 |
239 | - 如果你需要更复杂的子图定位,你可以使用`subplot2grid`,你可以指定格子行数和列数,然后在格子上绘制子图(左上 = (0, 0)),并且可以指定它能跨越多少行和多少列。
240 | ``` Python
241 | plt.subplot2grid((3,3), (0, 0), rowspan=2, colspan=2)
242 | plt.plot(x, x**2)
243 | plt.subplot2grid((3,3), (0, 2))
244 | plt.plot(x, x**3)
245 | plt.subplot2grid((3,3), (1, 2), rowspan=2)
246 | plt.plot(x, x**4)
247 | plt.subplot2grid((3,3), (2, 0), colspan=2)
248 | plt.plot(x, x**5)
249 | plt.show()
250 | ```
251 | 
252 | - 如果需要更灵活的子图定位,[看这里](https://matplotlib.org/users/gridspec.html)
253 |
254 |
255 | ## 绘制文本
256 | - You can call the `text` function to add text anywhere on the figure: just give the coordinates plus, optionally, some extra attributes. For details on TeX equation expressions, [see here](https://matplotlib.org/users/mathtext.html)
257 | ``` Python
258 | x = np.linspace(-1.5, 1.5, 30)
259 | px = 0.8
260 | py = px**2
261 |
262 | plt.plot(x, x**2, "b-", px, py, "ro")
263 |
264 | plt.text(0, 1.5, "Square function\n$y = x^2$", fontsize=20, color='blue', horizontalalignment="center")
265 | plt.text(px - 0.08, py, "Beautiful point", ha="right", weight="heavy") # ha是horizontalalignment的别名。
266 | plt.text(px, py, "x = %0.2f\ny = %0.2f"%(px, py), rotation=50, color='gray')
267 |
268 | plt.show()
269 | ```
270 | 
271 |
272 | - 图像元素的注释使用非常频繁,`annotate`函数使得它非常简单,只需指定兴趣点的位置、文本的位置,加上文字和箭头的一些额外属性就能完成。
273 | ``` Python
274 | plt.plot(x, x**2, px, py, "ro")
275 | plt.annotate("Beautiful point", xy=(px, py), xytext=(px-1.3,py+0.5),
276 | color="green", weight="heavy", fontsize=14,
277 | arrowprops={"facecolor": "lightgreen"})
278 | plt.show()
279 | ```
280 | 
281 |
282 | - 你可以使用`bbox`属性在文本周围加上框。
283 | ``` Python
284 | plt.plot(x, x**2, px, py, "ro")
285 |
286 | bbox_props = dict(boxstyle="rarrow,pad=0.3", ec="b", lw=2, fc="lightblue")
287 | plt.text(px-0.2, py, "Beautiful point", bbox=bbox_props, ha="right")
288 |
289 | bbox_props = dict(boxstyle="round4,pad=1,rounding_size=0.2", ec="black", fc="#EEEEFF", lw=5)
290 | plt.text(0, 1.5, "Square function\n$y = x^2$", fontsize=20, color='black', ha="center", bbox=bbox_props)
291 |
292 | plt.show()
293 | ```
294 | 
295 |
296 | - 如果为了好玩可以绘制漫画风格的图(xkcd-style),只需在`with plt.xkcd()`内写代码就好。
297 | ``` Python
298 | with plt.xkcd():
299 | plt.plot(x, x**2, px, py, "ro")
300 |
301 | bbox_props = dict(boxstyle="rarrow,pad=0.3", ec="b", lw=2, fc="lightblue")
302 | plt.text(px-0.2, py, "Beautiful point", bbox=bbox_props, ha="right")
303 |
304 | bbox_props = dict(boxstyle="round4,pad=1,rounding_size=0.2", ec="black", fc="#EEEEFF", lw=5)
305 | plt.text(0, 1.5, "Square function\n$y = x^2$", fontsize=20, color='black', ha="center", bbox=bbox_props)
306 |
307 | plt.show()
308 | ```
309 | 
310 |
311 | ## 图例
312 | - 添加图例最简单的方法是在对应位置添加标签,然后调用`legend`函数
313 | ``` Python
314 | x = np.linspace(-1.4, 1.4, 50)
315 | plt.plot(x, x**2, "r--", label="Square function")
316 | plt.plot(x, x**3, "g-", label="Cube function")
317 | plt.legend(loc="best")
318 | plt.grid(True)
319 | plt.show()
320 | ```
321 | 
322 |
323 | ## 非线性尺度
324 | - Matplotlib支持非线性尺度,例如对数或logit尺度
325 | ``` Python
326 | x = np.linspace(0.1, 15, 500)
327 | y = x**3/np.exp(2*x)
328 |
329 | plt.figure(1)
330 | plt.plot(x, y)
331 | plt.yscale('linear')
332 | plt.title('linear')
333 | plt.grid(True)
334 |
335 | plt.figure(2)
336 | plt.plot(x, y)
337 | plt.yscale('log')
338 | plt.title('log')
339 | plt.grid(True)
340 |
341 | plt.figure(3)
342 | plt.plot(x, y)
343 | plt.yscale('logit')
344 | plt.title('logit')
345 | plt.grid(True)
346 |
347 | plt.figure(4)
348 | plt.plot(x, y - y.mean())
349 | plt.yscale('symlog', linthreshy=0.05)
350 | plt.title('symlog')
351 | plt.grid(True)
352 |
353 | plt.show()
354 | ```
355 | 
356 | 
357 | 
358 | 
359 |
360 | ## Ticks and tickers 刻度和刻度控制器
361 | - "ticks"是刻度的位置 (例如 (-1, 0, 1)),"tick lines"是在这些位置绘制的小线条(刻度线),"tick labels"实在刻度线旁边绘制的标签(刻度线标签)。"tickers" 是决定在哪能放置刻度的对象,默认的tickers通常在合理的距离放置5到8个刻度。但有时候你需要控制它,幸运的是,matplotlib可以让你完全控制刻度。
362 | ``` Python
363 | x = np.linspace(-2, 2, 100)
364 |
365 | plt.figure(1, figsize=(15,10))
366 | plt.subplot(131)
367 | plt.plot(x, x**3)
368 | plt.grid(True)
369 | plt.title("Default ticks")
370 |
371 | ax = plt.subplot(132)
372 | plt.plot(x, x**3)
373 | ax.xaxis.set_ticks(np.arange(-2, 2, 1))
374 | plt.grid(True)
375 | plt.title("Manual ticks on the x-axis")
376 |
377 | ax = plt.subplot(133)
378 | plt.plot(x, x**3)
379 | plt.minorticks_on()
380 | ax.tick_params(axis='x', which='minor', bottom='off')
381 | ax.xaxis.set_ticks([-2, 0, 1, 2])
382 | ax.yaxis.set_ticks(np.arange(-5, 5, 1))
383 | ax.yaxis.set_ticklabels(["min", -4, -3, -2, -1, 0, 1, 2, 3, "max"])
384 | plt.title("Manual ticks and tick labels\n(plus minor ticks) on the y-axis")
385 |
386 |
387 | plt.grid(True)
388 |
389 | plt.show()
390 | ```
391 | 
392 |
393 | ## 极坐标投影
394 | - 绘制极坐标图,只需在创建子图时,设定"projection"属性为"polar"即可。
395 | ``` Python
396 | radius = 1
397 | theta = np.linspace(0, 2*np.pi*radius, 1000)
398 |
399 | plt.subplot(111, projection='polar')
400 | plt.plot(theta, np.sin(5*theta), "g-")
401 | plt.plot(theta, 0.5*np.cos(20*theta), "b-")
402 | plt.show()
403 | ```
404 | 
405 |
406 | ## 3D投影
407 | - 绘制3D图像非常直接,你需要导入`Axes3D`,以注册`3d`投影。然后设定`projection`属性为`3d`。它将返回一个`Axes3DSubplot`对象,你可以调用`plot_surface`,给定x,y和z坐标加额外的属性来绘制。
408 | ``` Python
409 | from mpl_toolkits.mplot3d import Axes3D
410 |
411 | x = np.linspace(-5, 5, 50)
412 | y = np.linspace(-5, 5, 50)
413 | X, Y = np.meshgrid(x, y)
414 | R = np.sqrt(X**2 + Y**2)
415 | Z = np.sin(R)
416 |
417 | figure = plt.figure(1, figsize = (12, 4))
418 | subplot3d = plt.subplot(111, projection='3d')
419 | surface = subplot3d.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap="coolwarm", linewidth=0.1)  # pass the colormap by name, since only matplotlib.pyplot was imported above
420 | plt.show()
421 | ```
422 | 
423 |
424 | - 显示相同数据的另外一种方法是用等高线图
425 | ``` Python
426 | plt.contourf(X, Y, Z, cmap="coolwarm")
427 | plt.colorbar()
428 | plt.show()
429 | ```
430 | 
431 |
432 | ## 散点图
433 | - 提供x和y的坐标就可以绘制散点图。
434 | ``` Python
435 | from numpy.random import rand
436 | x, y = rand(2, 100)
437 | plt.scatter(x, y)
438 | plt.show()
439 | ```
440 | 
441 |
442 | - 你也可以提供每个点的比例
443 | ``` Python
444 | x, y, scale = rand(3, 100)
445 | scale = 500 * scale ** 5
446 | plt.scatter(x, y, s=scale)
447 | plt.show()
448 | ```
449 | 
450 |
451 | - 还可以设置些额外的属性,例如填充颜色、边缘颜色、透明度。
452 | ``` Python
453 | for color in ['red', 'green', 'blue']:
454 | n = 100
455 | x, y = rand(2, n)
456 | scale = 500.0 * rand(n) ** 5
457 | plt.scatter(x, y, s=scale, c=color, alpha=0.3, edgecolors='blue')
458 |
459 | plt.grid(True)
460 |
461 | plt.show()
462 | ```
463 | 
464 |
465 | ## 直线(工具函数)
466 | - 创建一个工具函数来画图通常会更方便,该函数会在给定斜率和截距的情况下在图上绘制一条看似无限长的线。
467 | ``` Python
468 | from numpy.random import randn
469 |
470 | def plot_line(axis, slope, intercept, **kargs):
471 | xmin, xmax = axis.get_xlim()
472 | plt.plot([xmin, xmax], [xmin*slope+intercept, xmax*slope+intercept], **kargs)
473 |
474 | x = randn(1000)
475 | y = 0.5*x + 5 + randn(1000)*2
476 | plt.axis([-2.5, 2.5, -5, 15])
477 | plt.scatter(x, y, alpha=0.2)
478 | plt.plot(1, 0, "ro")
479 | plt.vlines(1, -5, 0, color="red")
480 | plt.hlines(0, -2.5, 1, color="red")
481 | plot_line(axis=plt.gca(), slope=0.5, intercept=5, color="magenta")
482 | plt.grid(True)
483 | plt.show()
484 | ```
485 | 
486 |
487 | ## 直方图
488 | ``` Python
489 | data = [1, 1.1, 1.8, 2, 2.1, 3.2, 3, 3, 3, 3]
490 | plt.subplot(211)
491 | plt.hist(data, bins = 10, rwidth=0.8)
492 |
493 | plt.subplot(212)
494 | plt.hist(data, bins = [1, 1.5, 2, 2.5, 3], rwidth=0.95)
495 | plt.xlabel("Value")
496 | plt.ylabel("Frequency")
497 |
498 | plt.show()
499 | ```
500 | 
501 |
502 | ``` Python
503 | data1 = np.random.randn(400)
504 | data2 = np.random.randn(500) + 3
505 | data3 = np.random.randn(450) + 6
506 | data4a = np.random.randn(200) + 9
507 | data4b = np.random.randn(100) + 10
508 |
509 | plt.hist(data1, bins=5, color='g', alpha=0.75, label='bar hist') # default histtype='bar'
510 | plt.hist(data2, color='b', alpha=0.65, histtype='stepfilled', label='stepfilled hist')
511 | plt.hist(data3, color='r', histtype='step', label='step hist')
512 | plt.hist((data4a, data4b), color=('r','m'), alpha=0.55, histtype='barstacked', label=('barstacked a', 'barstacked b'))
513 |
514 | plt.xlabel("Value")
515 | plt.ylabel("Frequency")
516 | plt.legend()
517 | plt.grid(True)
518 | plt.show()
519 | ```
520 | 
521 |
522 | ## 图像
523 | - Reading an image: just import the `matplotlib.image` module and call its `imread` function with the file name; it returns the image data as a NumPy array.
524 | ``` Python
525 | import matplotlib.image as mpimg
526 |
527 | img = mpimg.imread('my_square_function.png')
528 | print(img.shape, img.dtype)
529 | # Out:(288, 432, 4) float32
530 | ```
531 | - 生成图像也很简单
532 | ``` Python
533 | img = np.arange(100*100).reshape(100, 100)
534 | print(img)
535 | plt.imshow(img)
536 | plt.show()
537 | ```
538 | 
539 | - Since we did not provide RGB values, the `imshow` function automatically maps the values to a colour gradient. In older Matplotlib versions the default gradient ran from blue (low values) to red (high values); since 2.0 the default is viridis. In either case you can choose another colormap, for example:
540 | ``` Python
541 | plt.imshow(img, cmap="hot")
542 | plt.show()
543 | ```
544 | 
545 |
546 | - 你也可以直接产生RGB图像
547 | ``` Python
548 | img = np.empty((20,30,3))
549 | img[:, :10] = [0, 0, 0.6]
550 | img[:, 10:20] = [1, 1, 1]
551 | img[:, 20:] = [0.6, 0, 0]
552 | plt.imshow(img)
553 | plt.show()
554 | ```
555 | 
556 |
557 | - Since the img array is very small (20x30), `imshow` enlarges it for display. By default it uses bilinear interpolation to fill in the added pixels, which is why the edges look blurry. You can choose another interpolation algorithm, for example copying the colour of the nearest pixel:
558 | ``` Python
559 | plt.imshow(img, interpolation="nearest")
560 | plt.show()
561 | ```
562 | 
563 |
564 |
565 | ## 动画
566 | ### 绘制
567 | - The `FuncAnimation` constructor takes a figure, an update function and optional arguments. Here we ask for an animation that is 100 frames long, with 67 ms between frames (the `interval=67` argument below). At each iteration FuncAnimation calls our update function, passing it the frame number in `num` (from 0 to 99 in our case) followed by the extra arguments given via `fargs`.
568 | - Our update function simply sets the line data to the first `num` data points (so the data is drawn gradually) and, just for fun, adds a small random number to each data point so that the line appears to wiggle.
569 | ``` Python
570 | import matplotlib.animation as animation
571 |
572 | x = np.linspace(-1, 1, 100)
573 | y = np.sin(x**2*25)
574 | data = np.array([x, y])
575 |
576 | fig = plt.figure()
577 | line, = plt.plot([], [], "r-") # start with an empty plot
578 | plt.axis([-1.1, 1.1, -1.1, 1.1])
579 | plt.plot([-0.5, 0.5], [0, 0], "b-", [0, 0], [-0.5, 0.5], "b-", 0, 0, "ro")
580 | plt.grid(True)
581 | plt.title("Marvelous animation")
582 |
583 | # this function will be called at every iteration
584 | def update_line(num, data, line):
585 | line.set_data(data[..., :num] + np.random.rand(2, num) / 25) # we only plot the first `num` data points.
586 | return line,
587 |
588 | line_ani = animation.FuncAnimation(fig, update_line, frames=100, fargs=(data, line), interval=67)
589 | plt.show()
590 | ```
591 |
592 | ### 保存
593 | - Matplotlib依靠第三方库来编写视频,如FFMPEG或mencoder。 在这个例子中,我们将使用FFMPEG,所以一定要先安装它。
594 | ``` Python
595 | Writer = animation.writers['ffmpeg']
596 | writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
597 | line_ani.save('my_wiggly_animation.mp4', writer=writer)
598 | ```
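If FFMPEG is not available, newer Matplotlib versions can also write an animated GIF through Pillow; a sketch reusing `line_ani` from above:
``` Python
from matplotlib.animation import PillowWriter
line_ani.save('my_wiggly_animation.gif', writer=PillowWriter(fps=15))
```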
599 |
600 | ## 更多
601 | [Matplotlib](https://matplotlib.org/gallery.html)官网
602 |
603 | [安装Python以及Matplotlib](http://blog.csdn.net/sinat_28224453/article/details/51462935)
604 |
--------------------------------------------------------------------------------
/实战篇/4.地震后建筑修复建议预测/README.md:
--------------------------------------------------------------------------------
1 | # 介绍
2 | [地震后建筑修复建议](http://sofasofa.io/competition.php?id=8)是SofaSofa提供的练习比赛,可以说它是一个多分类问题,也可以说它是一个小型的推荐系统。因为它有四个类,评价标准是`map@2`。
3 | 写这个比赛的目的是练习一下前面学的`数据探索(EDA)`和`数据预处理`,还有建模时用到的'blend'和'stacking'。具体代码都在jupyter notebook 文件里,这里简单介绍一下。
4 |
5 | # EDA
6 | 首先从最基本的开始,
7 | ``` Python
8 | trainData.info()
9 | ```
10 | ```
11 | <class 'pandas.core.frame.DataFrame'>
12 | RangeIndex: 652936 entries, 0 to 652935
13 | Data columns (total 15 columns):
14 | id 652936 non-null int64
15 | district_id 652936 non-null int64
16 | area_id 652936 non-null int64
17 | floors_before 652936 non-null int64
18 | floors_after 652936 non-null int64
19 | age 652936 non-null int64
20 | area 652936 non-null int64
21 | height_before 652936 non-null int64
22 | height_after 652936 non-null int64
23 | land_condition 652936 non-null object
24 | foundation_type 652936 non-null object
25 | roof_type 652936 non-null object
26 | ground_floor_type 652936 non-null object
27 | position 652936 non-null object
28 | y 652936 non-null int64
29 | dtypes: int64(10), object(5)
30 | memory usage: 74.7+ MB
31 | ```
32 | ``` Python
33 | trainData.describe()
34 | ```
35 |
36 | | | id | district_id | area_id | floors_before | floors_after | age | area | height_before | height_after | y |
37 | |-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|
38 | | count | 652936.000000 | 652936.000000 | 652936.000000 | 652936.000000 | 652936.000000 | 652936.000000 | 652936.000000 | 652936.000000 | 652936.000000 | 652936.000000 |
39 | | mean | 326468.500000 | 26.827076 | 2711.729102 | 2.131408 | 1.335198 | 27.823827 | 418.087992 | 16.307750 | 10.377904 | 2.283069 |
40 | | std | 188486.532019 | 7.777727 | 778.158274 | 0.727938 | 1.097675 | 73.181335 | 231.655079 | 5.810902 | 8.646354 | 0.960629 |
41 | | min | 1.000000 | 7.000000 | 701.000000 | 1.000000 | 0.000000 | 0.000000 | 70.000000 | 6.000000 | 0.000000 | 0.000000 |
42 | | 25% | 163234.750000 | 22.000000 | 2241.000000 | 2.000000 | 0.000000 | 10.000000 | 284.000000 | 13.000000 | 0.000000 | 2.000000 |
43 | | 50% | 326468.500000 | 27.000000 | 2703.000000 | 2.000000 | 2.000000 | 18.000000 | 364.000000 | 16.000000 | 12.000000 | 3.000000 |
44 | | 75% | 489702.250000 | 31.000000 | 3119.000000 | 2.000000 | 2.000000 | 30.000000 | 493.000000 | 19.000000 | 16.000000 | 3.000000 |
45 | | max | 652936.000000 | 51.000000 | 5142.000000 | 9.000000 | 9.000000 | 999.000000 | 5220.000000 | 305.000000 | 193.000000 | 3.000000 |
162 | ### 相关性矩阵
163 | ```python
164 | corrMatrix = trainData.corr()
165 | corrMatrix
166 | ```
167 |
168 | | | id | district_id | area_id | floors_before | floors_after | age | area | height_before | height_after | y |
169 | |-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|
170 | | id | 1.000000 | -0.001397 | -0.001410 | 0.000964 | -0.000608 | 0.000912 | -0.000127 | 0.000733 | -0.000856 | 0.001189 |
171 | | district_id | -0.001397 | 1.000000 | 0.999695 | -0.089556 | 0.011780 | 0.018209 | 0.045535 | -0.061551 | 0.017360 | -0.079135 |
172 | | area_id | -0.001410 | 0.999695 | 1.000000 | -0.090119 | 0.011007 | 0.018106 | 0.044884 | -0.061716 | 0.016755 | -0.078146 |
173 | | floors_before | 0.000964 | -0.089556 | -0.090119 | 1.000000 | 0.327052 | 0.085460 | 0.102304 | 0.772104 | 0.253478 | 0.186285 |
174 | | floors_after | -0.000608 | 0.011780 | 0.011007 | 0.327052 | 1.000000 | 0.030074 | 0.122605 | 0.297664 | 0.941350 | -0.406570 |
175 | | age | 0.000912 | 0.018209 | 0.018106 | 0.085460 | 0.030074 | 1.000000 | -0.004666 | 0.062896 | 0.018811 | 0.044594 |
176 | | area | -0.000127 | 0.045535 | 0.044884 | 0.102304 | 0.122605 | -0.004666 | 1.000000 | 0.198313 | 0.178050 | -0.152052 |
177 | | height_before | 0.000733 | -0.061551 | -0.061716 | 0.772104 | 0.297664 | 0.062896 | 0.198313 | 1.000000 | 0.389301 | 0.086521 |
178 | | height_after | -0.000856 | 0.017360 | 0.016755 | 0.253478 | 0.941350 | 0.018811 | 0.178050 | 0.389301 | 1.000000 | -0.442474 |
179 | | y | 0.001189 | -0.079135 | -0.078146 | 0.186285 | -0.406570 | 0.044594 | -0.152052 | 0.086521 | -0.442474 | 1.000000 |
321 |
322 | ```python
323 | corrMatrix['y']
324 | ```
325 | id 0.001189
326 | district_id -0.079135
327 | area_id -0.078146
328 | floors_before 0.186285
329 | floors_after -0.406570
330 | age 0.044594
331 | area -0.152052
332 | height_before 0.086521
333 | height_after -0.442474
334 | y 1.000000
335 | Name: y, dtype: float64
336 |
337 | ## 数据离散化
338 | 数据离散化是有风险的,因为离散化后的数据效能未必会比离散化之前好,一般是根据专家的建议设置区间,而不是随意猜测一个区间。这里是利用无监督模型(K-means 算法)聚类,将id类的数据分段。
339 | 首先看下数据的分布,猜测一下应该聚成几类,因为k-means算法要求提供聚类的簇数。利用seaborn可视化数据:
340 | ``` Python
341 | sns.distplot(trainData['district_id'])
342 | ```
343 |
344 | 
345 |
346 | 根据可视化后的数据分布,猜测可以聚为6类。(聚成几类是没有一个确定的答案的,可以多尝试几种情况,取最好的)
347 | ``` Python
348 | from sklearn.cluster import KMeans
349 | est = KMeans(n_clusters=6, init="k-means++", n_jobs=-1)
350 | est.fit(trainData['district_id'].values.reshape(-1, 1))
351 | trainData['district_id'] = est.predict(trainData['district_id'].values.reshape(-1, 1))
352 | ```
353 | 这里id类型的数据,我都是这样处理的。
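Since there is no single correct number of clusters, one way to compare a few candidate values of k is the silhouette score; a sketch (sampling to keep it cheap on 650k rows):
``` Python
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X_col = trainData['district_id'].values.reshape(-1, 1)
for k in range(3, 9):
    labels = KMeans(n_clusters=k, init="k-means++").fit_predict(X_col)
    print(k, silhouette_score(X_col, labels, sample_size=10000))
```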
354 |
355 | ## 处理异常值
356 | #### `age`属性。
357 | 可视化后是这样的
358 | ``` Python
359 | sns.distplot(trainData['age'])
360 | ```
361 | 
362 |
363 | 一直延续到1000,猜测可能有问题。绘制散点图看看。
364 | ```Python
365 | sns.jointplot(data=trainData, x='id', y='age')
366 | ```
367 | 
368 |
369 | A pile of records suddenly appears around 1000, probably planted on purpose by the problem setters, so the fix here is simply to drop those rows.
370 | ``` Python
371 | # 删除大于阈值的行
372 | index = trainData['age'] <= 176
373 | trainData = trainData[index]
374 | ```
375 |
376 | #### `floors`属性
377 | The dataset gives the number of floors both before and after the earthquake, so there may be anomalies where a building has more floors after the quake than before. Visualise the data first.
378 |
379 | 地震前:`floors_before`
380 | ``` Python
381 | sns.distplot(trainData['floors_before'])
382 | ```
383 |
384 | 
385 |
386 |
387 | 地震后:`floors_after`
388 | ```Python
389 | sns.distplot(trainData['floors_after'])
390 | ```
391 |
392 | 
393 |
394 | 地震前后楼层数对比
395 | ``` Python
396 | plt.plot(trainData['id'], trainData['floors_before'], trainData['id'], trainData['floors_after'])
397 | ```
398 |
399 | 
400 |
401 | The plots show that such records really do exist. First count how many there are.
402 | ``` Python
403 | error_floor = trainData['floors_before'] < trainData['floors_after']
404 | # 震后楼层数比震前还高的数量
405 | error_floor.sum()
406 | ```
407 | 1838
408 |
409 | 有1838个,直接删除
410 | ``` Python
411 | # 直接去掉
412 | index = trainData['floors_before'] >= trainData['floors_after']
413 | trainData = trainData[index]
414 | ```
415 |
416 | #### `height`属性
417 | `height`也提供了前后高度,处理方法是一样的。
418 |
419 | ```Python
420 | error_height = trainData['height_after'] > trainData['height_before']
421 | error_height.sum()
422 | ```
423 | 1517
424 | ``` Python
425 | index = trainData['height_after'] <= trainData['height_before']
426 | trainData = trainData[index]
427 | ```
428 |
429 | ## 标签数据-独热编码(one-hot)
430 | ``` Python
431 | trainData = pd.get_dummies(trainData, columns=['position', 'land_condition', 'foundation_type', 'roof_type', 'ground_floor_type'])
432 | ```
433 |
434 | ## 构造属性
435 | Construct new features by simple arithmetic on existing columns:
436 | ``` Python
437 | trainData['per_floor_height_before'] = trainData['height_before'] / trainData['floors_before']
438 | trainData['per_floor_height_after'] = trainData['height_after'] / trainData['floors_after']
439 | trainData["age_area"] = trainData['age'] / trainData['area']
440 | ```
441 |
442 | Ordinal-encode the categorical columns (the `land_condition` etc. below are presumably the corresponding column Series, taken before the one-hot step):
443 | ``` Python
444 | land_condition.replace(['F', 'M', 'S'], [1, 2, 3], inplace=True)
445 | foundation_type.replace(['M', 'C', 'R', 'B', 'O'], [5, 4, 3, 2, 1], inplace=True)
446 | roof_type.replace(['L', 'H', 'R'], [3, 2, 1], inplace=True)
447 | ground_floor_type.replace(['M', 'R', 'B', 'T', 'O'], [5, 4, 3, 2, 1], inplace=True)
448 | ```
449 |
450 | Construct new combined features from the ordinal-encoded columns above:
451 | ``` Python
452 | trainData['4_rebuild'] = land_condition + foundation_type + roof_type + ground_floor_type
453 | trainData['l_f'] = land_condition + foundation_type
454 | trainData['l_r'] = land_condition + roof_type
455 | trainData['l_g'] = land_condition + ground_floor_type
456 | trainData['f_r'] = foundation_type + roof_type
457 | trainData['f_g'] = foundation_type + ground_floor_type
458 | trainData['r_g'] = roof_type + ground_floor_type
459 | ```
460 |
461 | ## lightGBM模型生成特征重要性图
462 | ``` Python
463 | import lightgbm as lgb
464 |
465 | params = {
466 | 'learning_rate':0.1,
467 | 'lambda_l1':0.1,
468 | 'lambda_l2':0.2,
469 | 'max_depth':4,
470 | 'objective':'multiclass',
471 | 'num_class':4
472 | }
473 |
474 | lgb_train = lgb.Dataset(train, y)
475 | lgb_eval = lgb.Dataset(train, y)
476 | gbm = lgb.train(params,
477 | lgb_train,
478 | num_boost_round=50,
479 | valid_sets=lgb_eval,
480 | early_stopping_rounds=5)
481 | lgb.plot_importance(gbm, figsize=(10,10))
482 | ```
483 | 
484 |
485 | ## 生成新的相关性矩阵
486 | ``` Python
487 | corr = trainData.corr()
488 | corr['y'].sort_values()
489 | ```
490 | ```
491 | per_floor_height_after -0.517127
492 | height_after -0.443536
493 | floors_after -0.405705
494 | ground_floor_type_R -0.382114
495 | roof_type_R -0.331644
496 | foundation_type_R -0.314671
497 | foundation_type_B -0.205903
498 | area_id -0.175130
499 | foundation_type_C -0.172373
500 | area -0.149299
501 | per_floor_height_before -0.146806
502 | district_id -0.085735
503 | position_Not attached -0.049879
504 | foundation_type_O -0.030112
505 | land_condition_F -0.023559
506 | ground_floor_type_O -0.022835
507 | ground_floor_type_T -0.016830
508 | position_Attached-2 side -0.012019
509 | ground_floor_type_B 0.002914
510 | land_condition_M 0.016435
511 | position_Attached-3 side 0.017995
512 | land_condition_S 0.018032
513 | position_Attached-1 side 0.058592
514 | roof_type_H 0.082415
515 | height_before 0.094980
516 | roof_type_L 0.097213
517 | l_g 0.156026
518 | l_r 0.174592
519 | floors_before 0.192760
520 | age_area 0.202228
521 | age 0.222218
522 | r_g 0.244821
523 | ground_floor_type_M 0.283176
524 | l_f 0.336764
525 | 4_rebuild 0.365961
526 | f_r 0.373940
527 | f_g 0.375418
528 | foundation_type_M 0.414113
529 | y 1.000000
530 | Name: y, dtype: float64
531 | ```
532 | 可以看出构造出的几个属性相关性较强。
533 |
534 | # 建模
535 | ## 构建评分函数
536 | 它的评分标准是`map@2`
537 | > 简单来说,对于每一个建筑,若主修复意见正确,得1分;若次修复意见正确,得0.5分;若都不正确,记0分。所有建筑的得分的均值就是map@2
538 |
539 | ```Python
540 | def test_score(y1, y2, trueLabels):
541 | pred_score = (y1 == trueLabels).sum() / len(trueLabels)
542 | pred_score += (y2 == trueLabels).sum() * 0.5 / len(trueLabels)
543 | return pred_score
544 | ```
545 |
546 | ## XGBOOST
547 | ``` Python
548 | import xgboost as xgb
549 | xgb_model = xgb.XGBClassifier(objective='multi:softmax',
550 | eval_metric=['map@2', 'merror'],
551 | n_estimators=700,
552 | num_class=4,
553 | silent=1,
554 | max_depth=6,
555 | nthread=4,
556 | learning_rate=0.1,
557 | gamma=0.5,
558 | min_child_weight=0.6,
559 | max_delta_step=0.1,
560 | subsample=0.6,
561 | colsample_bytree=0.7,
562 | reg_lambda=0.4,
563 | reg_alpha=0.8,
564 | num_leaves=250,
565 | early_stopping_rounds=20,
566 | num_boost_round=8000,
567 | scale_pos_weight=1)
568 | xgb_model.fit(train, y)
569 | pb = xgb_model.predict_proba(train)
570 | pb = np.array(pb)
571 | submit = pd.DataFrame()
572 | submit['y1'] = pb.argsort()[np.arange(len(pb)), -1]
573 | submit['y2'] = pb.argsort()[np.arange(len(pb)), -2]
574 | print(test_score(submit['y1'].values, submit['y2'].values, y))
575 | ```
576 | 0.774950502878
577 |
578 | ## LightGBM
579 | ``` Python
580 | import lightgbm as lgb
581 | lgb_train = lgb.Dataset(train[:600000], y[:600000])
582 | lgb_eval = lgb.Dataset(train[600000:], y[600000:], reference=lgb_train)
584 | params = {
585 | 'boosting_type': 'gbdt',
586 | 'objective': 'multiclass',
587 | 'num_class': 4,
588 | 'metric': ['multi_error', 'map@2'], # 'map@2',
589 | 'num_leaves': 250, # 4
590 | 'min_data_in_leaf': 100,
591 | 'learning_rate': 0.1,
592 | # 'feature_fraction': 0.3,
593 | 'bagging_fraction': 0.8,
594 | 'bagging_freq': 5,
595 | 'lambda_l1': 0.4,
596 | 'lambda_l2': 0.6,
597 | 'max_depth':6,
598 | # 'min_gain_to_split': 0.2,
599 | 'verbose': 5,
600 | 'is_unbalance': True
601 | }
602 |
603 | print('Start training...')
604 | gbm = lgb.train(params,
605 | lgb_train,
606 | num_boost_round=8000,
607 | valid_sets=lgb_eval,
608 | early_stopping_rounds=500)
609 |
610 | ```
611 | ``` Python
612 | print('Start predicting...')
613 | pb = gbm.predict(train, num_iteration=gbm.best_iteration)
614 | pb = np.array(pb)
615 | submit = pd.DataFrame()
616 | submit['y1'] = pb.argsort()[np.arange(len(pb)), -1]
617 | submit['y2'] = pb.argsort()[np.arange(len(pb)), -2]
618 | print(test_score(submit['y1'].values, submit['y2'].values, y))
619 | ```
620 | Start predicting...
621 | 0.796050152949
622 |
623 | ## 神经网络
624 | ``` Python
625 | from sklearn.preprocessing import OneHotEncoder
626 | enc = OneHotEncoder()
627 | enc.fit(y.reshape(-1, 1))
628 | y_hot = enc.transform(y.reshape(-1, 1))
629 |
630 |
631 | #构建LM神经网络模型
632 | from keras.models import Sequential #导入神经网络初始化函数
633 | from keras.layers.core import Dense, Activation #导入神经网络层函数、激活函数
634 | from keras.layers import Dropout
635 | from keras.metrics import top_k_categorical_accuracy
636 | from keras.callbacks import EarlyStopping
637 | netfile = './net.model' #构建的神经网络模型存储路径
638 |
639 | def acc_top2(y_true, y_pred):
640 | return top_k_categorical_accuracy(y_true, y_pred, k=2)
641 |
642 | net = Sequential()
643 | net.add(Dense(input_dim = 38, output_dim = 128))
644 | net.add(Activation('relu'))
645 | net.add(Dense(input_dim = 128, output_dim = 256))
646 | net.add(Activation('relu'))
647 | net.add(Dense(input_dim = 256, output_dim = 256))
648 | net.add(Activation('relu'))
649 | net.add(Dropout(0.3))
650 | net.add(Dense(input_dim = 256, output_dim = 512))
651 | net.add(Activation('relu'))
652 | net.add(Dense(input_dim = 512, output_dim = 4))
653 | net.add(Activation('softmax'))
654 | net.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics=['accuracy']) # accuracy
655 | early_stopping = EarlyStopping(monitor='val_loss', patience=50, verbose=2)
656 |
657 | net.fit(train, y_hot, epochs=150, batch_size=4096, validation_data=(train[600000:], y_hot[600000:]), callbacks=[early_stopping])
658 | net.save_weights(netfile) #保存模型
659 | ```
660 | ``` Python
661 | predict_prob = net.predict_proba(train[600000:])
662 | pb = np.array(predict_prob)
663 | submit = pd.DataFrame()
664 | submit['y1'] = pb.argsort()[np.arange(len(pb)), -1]
665 | submit['y2'] = pb.argsort()[np.arange(len(pb)), -2]
666 | print(test_score(submit['y1'].values, submit['y2'].values, y[600000:]))
667 | ```
668 | 0.775790784004
669 |
670 | ## XGBOOST生成新特征
671 | ``` Python
672 | from sklearn.model_selection import train_test_split
673 | X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=0)##test_size测试集合所占比例
674 | ##X_train_1用于生成模型 X_train_2用于和新特征组成新训练集合
675 | X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(X_train, y_train, test_size=0.7, random_state=0)
676 |
677 | def mergeToOne(X,X2):
678 | return np.hstack((X, X2))
679 | ```
680 |
681 | ``` Python
682 | from xgboost.sklearn import XGBClassifier
683 | xgb = XGBClassifier(booster='gbtree',
684 | learning_rate =0.1,
685 | objective='multi:softmax',
686 | num_class=4,
687 | gamma=0.05,
688 | subsample=0.4,
689 | reg_alpha=1e-05,
690 | n_estimators=50,
691 | metric='multi_logloss',
692 | colsample_bytree=0.7,
693 | silent=1,
694 | nthread=4)
695 |
696 | xgb.fit(X_train_1, y_train_1)
697 | new_feature= xgb.apply(X_train_2)
698 |
699 | X_train_new2 = mergeToOne(X_train_2,new_feature)
700 | new_feature_test = xgb.apply(X_test)
701 | X_test_new = mergeToOne(X_test,new_feature_test)
702 | ```
703 |
704 | ## blend
705 | ``` Python
706 | import numpy as np
707 | from sklearn.model_selection import StratifiedKFold
708 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
709 | from sklearn.ensemble import GradientBoostingClassifier
710 | from sklearn.linear_model import LogisticRegression
711 | from xgboost.sklearn import XGBClassifier
712 | import lightgbm as lgb
713 |
714 | def blend(X, y, X_submission, n_folds):
715 | skf = list(StratifiedKFold(n_splits=n_folds).split(X, y))  # current sklearn API
716 |
717 | clfs = [RandomForestClassifier(n_estimators=150, min_samples_split=90, min_samples_leaf=15,max_depth=8, n_jobs=-1, criterion='gini'),
718 | RandomForestClassifier(n_estimators=150, min_samples_split=90, min_samples_leaf=15,max_depth=8, n_jobs=-1, criterion='entropy'),
719 | ExtraTreesClassifier(n_estimators=150, min_samples_split=90, min_samples_leaf=15,max_depth=8, n_jobs=-1, criterion='gini'),
720 | ExtraTreesClassifier(n_estimators=150, min_samples_split=90, min_samples_leaf=15,max_depth=8, n_jobs=-1, criterion='entropy'),
721 | GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=8),
722 | XGBClassifier(learning_rate =0.05, n_estimators=300, max_depth=6, min_child_weight=1, gamma=0.1, subsample=0.8,
723 | colsample_bytree=0.8, objective= 'multi:softmax', nthread=4, reg_alpha=0.001, scale_pos_weight=1),
724 | lgb.LGBMClassifier(learning_rate=0.1, boosting_type='gbdt', objective='multiclass', n_estimators=300, metric='multi_logloss',
725 | max_depth=7, num_leaves=5, subsample=0.7, colsample_bytree=0.7, min_data_in_leaf=45, feature_fraction=0.7, bagging_fraction=0.7,
726 | bagging_freq=6, lambda_l1=1, lambda_l2=0.001, min_gain_to_split=0.265, verbose=5, is_unbalance=True)]
727 |
728 |
729 |
730 |
731 | dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
732 |
733 | dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))
734 |
735 | for j, clf in enumerate(clfs):
736 | print (j, clf)
737 | dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
738 | for i, (train, test) in enumerate(skf):
739 | print ("Fold", i)
740 | X_train = X[train]
741 | y_train = y[train]
742 | X_test = X[test]
743 | y_test = y[test]
744 | clf.fit(X_train, y_train)
745 | y_submission = clf.predict_proba(X_test)[:, 1]
746 | dataset_blend_train[test, j] = y_submission
747 | dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
748 | dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
749 | print("Blending.")
750 | clf = LogisticRegression()
751 | clf.fit(dataset_blend_train, y)
752 | y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
753 |
754 | return clf.predict_proba(dataset_blend_test)
755 | ```
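One caveat for this 4-class task: `predict_proba(...)[:, 1]` keeps only the probability of class 1, so each base model contributes a single column to the blend. A variant that keeps all class probabilities (a sketch, not the original code) would reserve `len(clfs) * n_classes` columns:
``` Python
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

def blend_multiclass(X, y, X_submission, clfs, n_folds=5, n_classes=4):
    skf = list(StratifiedKFold(n_splits=n_folds).split(X, y))
    blend_train = np.zeros((X.shape[0], len(clfs) * n_classes))
    blend_test = np.zeros((X_submission.shape[0], len(clfs) * n_classes))
    for j, clf in enumerate(clfs):
        test_j = np.zeros((X_submission.shape[0], n_classes, len(skf)))
        for i, (tr, te) in enumerate(skf):
            clf.fit(X[tr], y[tr])
            # out-of-fold probabilities for all classes of this base model
            blend_train[te, j * n_classes:(j + 1) * n_classes] = clf.predict_proba(X[te])
            test_j[:, :, i] = clf.predict_proba(X_submission)
        blend_test[:, j * n_classes:(j + 1) * n_classes] = test_j.mean(axis=2)
    meta = LogisticRegression(max_iter=1000)  # second-level model on the stacked probabilities
    meta.fit(blend_train, y)
    return meta.predict_proba(blend_test)
```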
756 |
757 | ## Stacking
758 | > emmmm, I'm not sure this is really how stacking is supposed to be written. (Strictly speaking, the code below uses each tree model's `apply()` to append leaf-index features to the original data, in the spirit of GBDT + LR; classic stacking would instead feed the base models' out-of-fold predictions to a second-level model.)
759 | ``` Python
760 | import lightgbm as lgb
761 | from xgboost.sklearn import XGBClassifier
762 | from sklearn.ensemble import RandomForestClassifier
763 | xgb = XGBClassifier(booster='gbtree',
764 | learning_rate =0.1,
765 | objective='multi:softmax',
766 | num_class=4,
767 | gamma=0.05,
768 | subsample=0.4,
769 | reg_alpha=1e-05,
770 | n_estimators=50,
771 | metric='multi_logloss',
772 | colsample_bytree=0.7,
773 | silent=1,
774 | nthread=4)
775 |
776 | gbm = lgb.LGBMClassifier(learning_rate=0.1,
777 | boosting_type='gbdt',
778 | objective='multiclass',
779 | n_estimators=50,
780 | metric='multi_logloss',
781 | max_depth=7,
782 | bagging_fraction=0.7,
783 | is_unbalance=True)
784 |
785 | rf = RandomForestClassifier(n_estimators=50,
786 | min_samples_split=90,
787 | min_samples_leaf=15,
788 | max_depth=8,
789 | oob_score=True)
790 | ```
791 |
792 | ``` Python
793 | xgb.fit(X_train_1, y_train_1)
794 | new_feature= xgb.apply(X_train_2)
795 |
796 | X_train_new2 = mergeToOne(X_train_2,new_feature)
797 | new_feature_test = xgb.apply(X_test)
798 | X_test_new = mergeToOne(X_test,new_feature_test)
799 |
800 |
801 | gbm.fit(X_train_1, y_train_1)
802 | new_feature = gbm.apply(X_train_2)
803 |
804 | X_train_new2 = mergeToOne(X_train_new2,new_feature)
805 | new_feature_test = gbm.apply(X_test)
806 | X_test_new = mergeToOne(X_test_new,new_feature_test)
807 |
808 |
809 | rf.fit(X_train_1, y_train_1)
810 | new_feature = rf.apply(X_train_2)
811 | X_train_new2 = mergeToOne(X_train_new2, new_feature)
812 | new_feature_test = rf.apply(X_test)
813 | X_test_new = mergeToOne(X_test_new, new_feature_test)
814 |
815 | ```
816 | ## 加权投票
817 | ``` Python
818 | def wsubmit(xg, lg, nn):
819 | xg_y1 = xg['y1'].values
820 | lg_y1 = lg['y1'].values
821 | lg_y2 = lg['y2'].values
822 | nn_y1 = nn['y1'].values
823 | submitData = pd.DataFrame()
824 | y1 = []
825 | y2 = []
826 | for i in range(len(xg)):
827 | row_y1 = [xg_y1[i], lg_y1[i], nn_y1[i]]
828 | y1.append(max(row_y1, key=row_y1.count))
829 | if max(row_y1, key=row_y1.count) != lg_y1[i]:
830 | y2.append(lg_y1[i])
831 | else:
832 | y2.append(lg_y2[i])
833 | submitData['y1'] = y1
834 | submitData['y2'] = y2
835 | submitData.to_csv('submit_voting.csv', index=False)
836 | ```
837 |
838 | # 总结
839 | 这次主要是锻炼之前学到的东西,实际比赛排名不是很高。
--------------------------------------------------------------------------------