├── Part1
│   ├── Least_squares.ipynb
│   ├── Least_squares.py
│   ├── Numpy_P_R_F1_etc.ipynb
│   ├── Numpy_P_R_F1_etc.py
│   ├── Sklearn_P_R_F1_ect.ipynb
│   └── Sklearn_P_R_F1_ect.py
├── Part2
│   ├── .idea
│   │   ├── Part2.iml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── vcs.xml
│   │   └── workspace.xml
│   ├── LinearDiscriminantAnalysis_LDA.ipynb
│   ├── LinearDiscriminantAnalysis_LDA.py
│   ├── LinearRegression.ipynb
│   ├── LinearRegression.py
│   ├── LogisticRegression.ipynb
│   └── LogisticRegression.py
├── Part3
│   ├── Naive_bayes.ipynb
│   └── Naive_bayes.py
├── Part4
│   ├── DecionTree.ipynb
│   └── DecionTree.py
├── Part5
│   ├── BoostTree.ipynb
│   └── BoostTree.py
├── Part6
│   ├── GBDT.ipynb
│   ├── GBDT.py
│   ├── xgboost.ipynb
│   └── xgboost.py
├── Part7
│   ├── SVM.ipynb
│   └── SVM.py
└── README.md
/Part1/Least_squares.py:
/Part1/Least_squares.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import scipy as sp
9 | from scipy.optimize import leastsq
10 | import matplotlib.pyplot as plt
11 | get_ipython().run_line_magic('matplotlib', 'inline')
12 |
13 |
14 | # In[2]:
15 |
16 |
17 | # Target function
18 | def real_func(x):
19 |     return np.cos(2*np.pi*x)
20 | 
21 | # Polynomial
22 | def fit_func(p, x):
23 |     f = np.poly1d(p)
24 |     return f(x)
25 | 
26 | # Residuals
27 | def residuals_func(p, x, y):
28 |     ret = fit_func(p, x) - y
29 |     return ret
30 |
31 |
32 | # In[3]:
33 |
34 |
35 | # Ten sample points
36 | x = np.linspace(0, 1, 10)
37 | x_points = np.linspace(0, 1, 1000)
38 | # Target-function values with Gaussian noise added
39 | y_ = real_func(x)
40 | y = [np.random.normal(0, 0.1) + y1 for y1 in y_]
41 |
42 |
43 | def fitting(M=0):
44 |     """
45 |     M is the degree of the polynomial
46 |     """
47 |     # Randomly initialize the polynomial coefficients
48 |     p_init = np.random.rand(M + 1)
49 |     # Least squares fit
50 |     p_lsq = leastsq(residuals_func, p_init, args=(x, y))
51 |     print('Fitting Parameters:', p_lsq[0])
52 | 
53 |     # Visualization
54 |     plt.plot(x_points, real_func(x_points), label='real')
55 |     plt.plot(x_points, fit_func(p_lsq[0], x_points), label='fitted curve')
56 |     plt.plot(x, y, 'bo', label='noise')
57 |     plt.legend()
58 |     return p_lsq
59 |
60 |
61 | # In[4]:
62 |
63 |
64 | # M=0
65 | p_lsq_0 = fitting(M=0)
66 |
67 |
68 | # In[5]:
69 |
70 |
71 | # M=1
72 | p_lsq_1 = fitting(M=1)
73 |
74 |
75 | # In[6]:
76 |
77 |
78 | # M=3
79 | p_lsq_3 = fitting(M=3)
80 |
81 |
82 | # In[7]:
83 |
84 |
85 |
86 | # M=9
87 | p_lsq_9 = fitting(M=9)
88 |
89 |
90 | # In[8]:
91 |
92 |
93 | regularization = 0.0001
94 | def residuals_func_regularization(p, x, y):
95 |     ret = fit_func(p, x) - y
96 |     ret = np.append(ret,
97 |                     np.sqrt(0.5 * regularization * np.square(p)))  # L2 norm of p as the regularization term
98 |     return ret
99 | 
100 | # Least squares with the regularization term added
101 | p_init = np.random.rand(9 + 1)
102 | p_lsq_regularization = leastsq(
103 |     residuals_func_regularization, p_init, args=(x, y))
104 |
105 |
106 | plt.plot(x_points, real_func(x_points), label='real')
107 | plt.plot(x_points, fit_func(p_lsq_9[0], x_points), label='fitted curve')
108 | plt.plot(
109 |     x_points,
110 |     fit_func(p_lsq_regularization[0], x_points),
111 |     label='regularization')
112 | plt.plot(x, y, 'bo', label='noise')
113 | plt.legend()
114 |
115 |
116 | # In[ ]:
117 |
118 |
119 |
120 |
121 |
--------------------------------------------------------------------------------
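
The appended-residual trick above works because `leastsq` minimizes the sum of squared residual entries, so each extra entry sqrt(0.5*regularization)*p_i contributes 0.5*regularization*p_i**2 to the objective. A minimal closed-form cross-check, assuming numpy and the same x, y, and regularization as above (ridge regression via the normal equations on the Vandermonde design):

    import numpy as np

    def ridge_poly_fit(x, y, M, lam):
        # Vandermonde design matrix; highest power first, matching np.poly1d
        A = np.vander(x, M + 1)
        # Normal equations with the 0.5*lam penalty used by residuals_func_regularization:
        # (A^T A + 0.5*lam*I) p = A^T y
        return np.linalg.solve(A.T @ A + 0.5 * lam * np.eye(M + 1), A.T @ np.asarray(y))

    # ridge_poly_fit(x, y, 9, regularization) should land near p_lsq_regularization[0]
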
/Part1/Numpy_P_R_F1_etc.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "['TN', 'TN', 'TN', 'TN', 'FP', 'FP', 'TN', 'FP', 'TN', 'TN', 'FP', 'TN', 'TP', 'FN', 'FN', 'FN', 'TP', 'FN', 'TP', 'FN']\n",
13 | "0.39999999999999997\n",
14 | "0.765625\n",
15 | "0.765625\n"
16 | ]
17 | }
18 | ],
19 | "source": [
20 | "import numpy as np\n",
21 | "import pandas as pd\n",
22 | " \n",
23 | "class Score():\n",
24 | " def __init__(self,pre_score,rel_label,threshold,beta):\n",
25 | " self.tn = 0\n",
26 | " self.fn = 0\n",
27 | " self.fp = 0\n",
28 | " self.tp = 0\n",
29 | " self.pre_score = pre_score\n",
30 | " self.rel_label = rel_label\n",
31 | " self.threshold = threshold\n",
32 | " self.beta = beta\n",
33 | " list(map(self.__getCM_count,\n",
34 | " self.pre_score,\n",
35 | " self.rel_label))\n",
36 | " \n",
37 | " def __getCM(self,pre, rel):\n",
38 | " if (pre < self.threshold):\n",
39 | " if (rel == 0): return 'TN'\n",
40 | " if (rel == 1): return 'FN'\n",
41 | " if (pre >= self.threshold):\n",
42 | " if (rel == 0): return 'FP'\n",
43 | " if (rel == 1): return 'TP'\n",
44 | " \n",
45 | " def get_cm(self):\n",
46 | " return list(map(self.__getCM,\n",
47 | " self.pre_score,\n",
48 | " self.rel_label))\n",
49 | " \n",
50 | " def __getCM_count(self,pre, rel):\n",
51 | " if (pre < self.threshold):\n",
52 | " if (rel == 0): self.tn += 1\n",
53 | " if (rel == 1): self.fn += 1\n",
54 | " if (pre >= self.threshold):\n",
55 | " if (rel == 0): self.fp += 1\n",
56 | " if (rel == 1): self.tp += 1\n",
57 | " \n",
58 | " def get_f1(self):\n",
59 | " P = self.tp/(self.tp+self.fp)\n",
60 | " R = self.tp/(self.tp+self.fn)\n",
61 | " if(P == 0.0):\n",
62 | " return 0.0\n",
63 | " else:\n",
64 | " return (self.beta*self.beta+1)*P*R/(self.beta*self.beta*P+R)\n",
65 | " \n",
66 | " # 方法二 precision——分数精度\n",
67 | " def get_auc_by_count(self,precision=100):\n",
68 | " # 正样本数\n",
69 | " postive_len = sum(self.rel_label)\n",
70 | " # 负样本数\n",
71 | " negative_len = len(self.rel_label) - postive_len\n",
72 | " # 总对比数\n",
73 | " total_case = postive_len * negative_len\n",
74 | " # 正样本分数计数器(填0在range...)\n",
75 | " pos_histogram = [0 for _ in range(precision+1)]\n",
76 | " # 负样本分数计数器(填0在range...)\n",
77 | " neg_histogram = [0 for _ in range(precision+1)]\n",
78 | " # 分数放大\n",
79 | " bin_width = 1.0 / precision\n",
80 | " \n",
81 | " for i in range(len(self.rel_label)):\n",
82 | " nth_bin = int(self.pre_score[i] / bin_width)\n",
83 | " if self.rel_label[i] == 1:\n",
84 | " pos_histogram[nth_bin] += 1\n",
85 | " else:\n",
86 | " neg_histogram[nth_bin] += 1\n",
87 | " \n",
88 | " accumulated_neg = 0\n",
89 | " satisfied_pair = 0\n",
90 | " for i in range(precision+1):\n",
91 | " satisfied_pair += (pos_histogram[i] * accumulated_neg + pos_histogram[i] * neg_histogram[i] * 0.5)\n",
92 | " accumulated_neg += neg_histogram[i]\n",
93 | " return satisfied_pair / float(total_case)\n",
94 | " \n",
95 | " # 方法三\n",
96 | " def get_auc_by_rank(self):\n",
97 | " # 拼接排序\n",
98 | " df = pd.DataFrame({'pre_score':self.pre_score,'rel_label':self.rel_label})\n",
99 | " df = df.sort_values(by='pre_score',ascending=False).reset_index(drop=True)\n",
100 | " # 获取 n,N,M\n",
101 | " n = len(df)\n",
102 | " M = len(df[df['rel_label']==1])\n",
103 | " N = n - M\n",
104 | " # 初始化rank 和同值统计ank_tmp,count_all,count_p\n",
105 | " rank = 0.0\n",
106 | " rank_tmp,count_all,count_p = 0.0,0,0\n",
107 | " # 添加防止越界的一条不影响结果的记录\n",
108 | " df.loc[n] = [0,0]\n",
109 | " # 遍历一次\n",
110 | " for i in range(n):\n",
111 | " # 判断i+1是否与i同值,不同值则要考虑是否刚刚结束同值统计\n",
112 | " if(df['pre_score'][i+1] != df['pre_score'][i]):\n",
113 | " # 正样本\n",
114 | " if(df['rel_label'][i] == 1):\n",
115 | " # 计数不为0,刚刚结束同值统计\n",
116 | " if (count_all != 0):\n",
117 | " # 同值统计结果加在rank上,这里注意补回结束统计时漏掉的最后一条同值数据\n",
118 | " rank += (rank_tmp + n - i) * (count_p+1) / (count_all+1)\n",
119 | " rank_tmp, count_all, count_p = 0.0, 0, 0\n",
120 | " continue\n",
121 | " rank += (n-i)\n",
122 | " else:\n",
123 | " if (count_all != 0):\n",
124 | " rank += (rank_tmp + n - i) * (count_p) / (count_all+1)\n",
125 | " rank_tmp, count_all, count_p = 0.0, 0, 0\n",
126 | " continue\n",
127 | " else:\n",
128 | " rank_tmp += (n-i)\n",
129 | " count_all += 1\n",
130 | " if(df['rel_label'][i] == 1):\n",
131 | " count_p += 1\n",
132 | " return (rank-M*(1+M)/2)/(M*N)\n",
133 | " \n",
134 | " \n",
135 | "if __name__ == '__main__':\n",
136 | " learn_data_L2 = [0.2,0.3,0.4,0.35,0.6,0.55,0.2,0.57,0.3,0.15,0.77,0.33,0.9,0.49, 0.45,0.41, 0.66,0.43,0.7,0.4]\n",
137 | " learn_data_R2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
138 | " learn_data2 = pd.DataFrame({'Learn': learn_data_L2, 'Real': learn_data_R2})\n",
139 | " \n",
140 | " score2 = Score(learn_data2['Learn'], learn_data2['Real'], 0.5, 1)\n",
141 | " \n",
142 | " print(score2.get_cm())\n",
143 | " print(score2.get_f1())\n",
144 | " print(score2.get_auc_by_count())\n",
145 | " print(score2.get_auc_by_rank())"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": []
154 | }
155 | ],
156 | "metadata": {
157 | "kernelspec": {
158 | "display_name": "Python 3",
159 | "language": "python",
160 | "name": "python3"
161 | },
162 | "language_info": {
163 | "codemirror_mode": {
164 | "name": "ipython",
165 | "version": 3
166 | },
167 | "file_extension": ".py",
168 | "mimetype": "text/x-python",
169 | "name": "python",
170 | "nbconvert_exporter": "python",
171 | "pygments_lexer": "ipython3",
172 | "version": "3.7.3"
173 | }
174 | },
175 | "nbformat": 4,
176 | "nbformat_minor": 2
177 | }
178 |
--------------------------------------------------------------------------------
/Part1/Numpy_P_R_F1_etc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import pandas as pd
9 |
10 | class Score():
11 |     def __init__(self, pre_score, rel_label, threshold, beta):
12 |         self.tn = 0
13 |         self.fn = 0
14 |         self.fp = 0
15 |         self.tp = 0
16 |         self.pre_score = pre_score
17 |         self.rel_label = rel_label
18 |         self.threshold = threshold
19 |         self.beta = beta
20 |         list(map(self.__getCM_count,
21 |                  self.pre_score,
22 |                  self.rel_label))
23 | 
24 |     def __getCM(self, pre, rel):
25 |         if (pre < self.threshold):
26 |             if (rel == 0): return 'TN'
27 |             if (rel == 1): return 'FN'
28 |         if (pre >= self.threshold):
29 |             if (rel == 0): return 'FP'
30 |             if (rel == 1): return 'TP'
31 | 
32 |     def get_cm(self):
33 |         return list(map(self.__getCM,
34 |                         self.pre_score,
35 |                         self.rel_label))
36 | 
37 |     def __getCM_count(self, pre, rel):
38 |         if (pre < self.threshold):
39 |             if (rel == 0): self.tn += 1
40 |             if (rel == 1): self.fn += 1
41 |         if (pre >= self.threshold):
42 |             if (rel == 0): self.fp += 1
43 |             if (rel == 1): self.tp += 1
44 | 
45 |     def get_f1(self):
46 |         P = self.tp/(self.tp+self.fp) if (self.tp+self.fp) else 0.0  # guard against zero division
47 |         R = self.tp/(self.tp+self.fn) if (self.tp+self.fn) else 0.0
48 |         if(P == 0.0):
49 |             return 0.0
50 |         else:
51 |             return (self.beta*self.beta+1)*P*R/(self.beta*self.beta*P+R)
52 | 
53 |     # Method 2: AUC by histogram counting; precision = the number of score bins
54 |     def get_auc_by_count(self, precision=100):
55 |         # number of positive samples
56 |         positive_len = sum(self.rel_label)
57 |         # number of negative samples
58 |         negative_len = len(self.rel_label) - positive_len
59 |         # total number of positive-negative pairs
60 |         total_case = positive_len * negative_len
61 |         # histogram of positive-sample scores, initialized to zeros
62 |         pos_histogram = [0 for _ in range(precision+1)]
63 |         # histogram of negative-sample scores, initialized to zeros
64 |         neg_histogram = [0 for _ in range(precision+1)]
65 |         # score bin width
66 |         bin_width = 1.0 / precision
67 | 
68 |         for i in range(len(self.rel_label)):
69 |             nth_bin = int(self.pre_score[i] / bin_width)
70 |             if self.rel_label[i] == 1:
71 |                 pos_histogram[nth_bin] += 1
72 |             else:
73 |                 neg_histogram[nth_bin] += 1
74 | 
75 |         accumulated_neg = 0
76 |         satisfied_pair = 0
77 |         for i in range(precision+1):
78 |             satisfied_pair += (pos_histogram[i] * accumulated_neg + pos_histogram[i] * neg_histogram[i] * 0.5)
79 |             accumulated_neg += neg_histogram[i]
80 |         return satisfied_pair / float(total_case)
81 | 
82 |     # Method 3: AUC from the rank-sum formula
83 |     def get_auc_by_rank(self):
84 |         # combine and sort by score, descending
85 |         df = pd.DataFrame({'pre_score':self.pre_score,'rel_label':self.rel_label})
86 |         df = df.sort_values(by='pre_score',ascending=False).reset_index(drop=True)
87 |         # get n (total), M (positives), N (negatives)
88 |         n = len(df)
89 |         M = len(df[df['rel_label']==1])
90 |         N = n - M
91 |         # initialize rank and the tie-handling accumulators rank_tmp, count_all, count_p
92 |         rank = 0.0
93 |         rank_tmp,count_all,count_p = 0.0,0,0
94 |         # append a sentinel row to avoid out-of-bounds access; it does not change the result
95 |         df.loc[n] = [0,0]
96 |         # single pass over the sorted scores
97 |         for i in range(n):
98 |             # check whether row i+1 ties with row i; if not, a tie group may have just ended
99 |             if(df['pre_score'][i+1] != df['pre_score'][i]):
100 |                 # positive sample
101 |                 if(df['rel_label'][i] == 1):
102 |                     # a nonzero count means a tie group just ended
103 |                     if (count_all != 0):
104 |                         # add the tie group's averaged rank, remembering to include the last tied row
105 |                         rank += (rank_tmp + n - i) * (count_p+1) / (count_all+1)
106 |                         rank_tmp, count_all, count_p = 0.0, 0, 0
107 |                         continue
108 |                     rank += (n-i)
109 |                 else:
110 |                     if (count_all != 0):
111 |                         rank += (rank_tmp + n - i) * (count_p) / (count_all+1)
112 |                         rank_tmp, count_all, count_p = 0.0, 0, 0
113 |                         continue
114 |             else:
115 |                 rank_tmp += (n-i)
116 |                 count_all += 1
117 |                 if(df['rel_label'][i] == 1):
118 |                     count_p += 1
119 |         return (rank-M*(1+M)/2)/(M*N)
120 |
121 |
122 | if __name__ == '__main__':
123 |     learn_data_L2 = [0.2,0.3,0.4,0.35,0.6,0.55,0.2,0.57,0.3,0.15,0.77,0.33,0.9,0.49, 0.45,0.41, 0.66,0.43,0.7,0.4]
124 |     learn_data_R2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
125 |     learn_data2 = pd.DataFrame({'Learn': learn_data_L2, 'Real': learn_data_R2})
126 | 
127 |     score2 = Score(learn_data2['Learn'], learn_data2['Real'], 0.5, 1)
128 | 
129 |     print(score2.get_cm())
130 |     print(score2.get_f1())
131 |     print(score2.get_auc_by_count())
132 |     print(score2.get_auc_by_rank())
133 |
134 |
135 | # In[ ]:
136 |
137 |
138 |
139 |
140 |
--------------------------------------------------------------------------------
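
As a cross-check on the two AUC estimators: sklearn's `roc_auc_score` computes the same rank-based statistic as `get_auc_by_rank` exactly, while `get_auc_by_count` only approximates it to within the bin width 1/precision. A minimal sketch, assuming scikit-learn is installed, using the same data as the `__main__` block:

    from sklearn.metrics import roc_auc_score

    scores = [0.2,0.3,0.4,0.35,0.6,0.55,0.2,0.57,0.3,0.15,0.77,0.33,0.9,0.49,0.45,0.41,0.66,0.43,0.7,0.4]
    labels = [0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]

    print(roc_auc_score(labels, scores))  # 0.765625, matching get_auc_by_rank()
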
/Part1/Sklearn_P_R_F1_ect.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | # Accuracy
8 | import numpy as np
9 | from sklearn.metrics import accuracy_score
10 | y_pred = [0, 2, 1, 3,9,9,8,5,8]
11 | y_true = [0, 1, 2, 3,2,6,3,5,9]
12 |
13 | accuracy_score(y_true, y_pred)
14 |
15 |
16 | # In[2]:
17 |
18 |
19 | accuracy_score(y_true, y_pred, normalize=False) # with normalize=False, returns the number of correctly classified samples instead of the fraction
20 |
21 |
22 | # In[4]:
23 |
24 |
25 | import warnings
26 | warnings.filterwarnings('ignore')
27 | from sklearn import metrics
28 | print(metrics.precision_score(y_true, y_pred, average='micro')) # micro-averaged precision
29 | print(metrics.precision_score(y_true, y_pred, average='macro')) # macro-averaged precision
30 | print(metrics.precision_score(y_true, y_pred, labels=[0, 1, 2, 3], average='macro')) # precision restricted to the specified labels
31 |
32 |
33 | # In[5]:
34 |
35 |
36 | # Recall
37 | print(metrics.recall_score(y_true, y_pred, average='micro'))
38 | print(metrics.recall_score(y_true, y_pred, average='macro'))
39 | #F1
40 | print(metrics.f1_score(y_true, y_pred, average='weighted'))
41 |
42 |
43 | # In[9]:
44 |
45 |
46 | # Confusion matrix
47 | from sklearn.metrics import confusion_matrix
48 | print(confusion_matrix(y_true, y_pred))
49 |
50 | # Classification report: per-class precision/recall/f1-score, averages, and support
51 | from sklearn.metrics import classification_report
52 | y_true = [0, 1, 2, 2, 0]
53 | y_pred = [0, 0, 2, 2, 0]
54 | target_names = ['class 0', 'class 1', 'class 2']
55 | print(classification_report(y_true, y_pred, target_names=target_names))
56 |
57 |
58 | # In[10]:
59 |
60 |
61 | # ROC and AUC
62 | import numpy as np
63 | from sklearn.metrics import roc_auc_score
64 | y_true = np.array([0, 0, 1, 1])
65 | y_scores = np.array([0.1, 0.4, 0.35, 0.8])
66 | roc_auc_score(y_true, y_scores)
67 |
68 |
69 | # In[13]:
70 |
71 |
72 | # ----------------- Example on the iris dataset (linear-kernel SVM) -----------------
73 | import numpy as np
74 | import matplotlib.pyplot as plt
75 | from itertools import cycle
76 |
77 | from sklearn import svm, datasets
78 | from sklearn.metrics import roc_curve, auc
79 | from sklearn.model_selection import train_test_split
80 | from sklearn.preprocessing import label_binarize
81 | from sklearn.multiclass import OneVsRestClassifier
82 | from scipy import interp
83 |
84 | # Import some data to play with
85 | iris = datasets.load_iris()
86 | X = iris.data
87 | y = iris.target
88 |
89 | # Binarize the output
90 | y = label_binarize(y, classes=[0, 1, 2])
91 | n_classes = y.shape[1]
92 |
93 | # Add noisy features to make the problem harder
94 | random_state = np.random.RandomState(0)
95 | n_samples, n_features = X.shape
96 | X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
97 |
98 | # shuffle and split training and test sets
99 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
100 |                                                     random_state=0)
101 |
102 | # Learn to predict each class against the other
103 | classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
104 |                                          random_state=random_state))
105 | y_score = classifier.fit(X_train, y_train).decision_function(X_test)
106 |
107 | # Compute ROC curve and ROC area for each class
108 | fpr = dict()
109 | tpr = dict()
110 | roc_auc = dict()
111 | for i in range(n_classes):
112 |     fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
113 |     roc_auc[i] = auc(fpr[i], tpr[i])
114 |
115 | # Compute micro-average ROC curve and ROC area
116 | fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
117 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
118 |
119 | plt.figure()
120 | lw = 2
121 | plt.plot(fpr[2], tpr[2], color='darkorange',
122 |          lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
123 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
124 | plt.xlim([0.0, 1.0])
125 | plt.ylim([0.0, 1.05])
126 | plt.xlabel('False Positive Rate')
127 | plt.ylabel('True Positive Rate')
128 | plt.title('Receiver operating characteristic example')
129 | plt.legend(loc="lower right")
130 | plt.show()
131 |
132 |
133 | # In[14]:
134 |
135 |
136 | # Compute macro-average ROC curve and ROC area
137 |
138 | # First aggregate all false positive rates
139 | all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
140 |
141 | # Then interpolate all ROC curves at this points
142 | mean_tpr = np.zeros_like(all_fpr)
143 | for i in range(n_classes):
144 |     mean_tpr += interp(all_fpr, fpr[i], tpr[i])
145 |
146 | # Finally average it and compute AUC
147 | mean_tpr /= n_classes
148 |
149 | fpr["macro"] = all_fpr
150 | tpr["macro"] = mean_tpr
151 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
152 |
153 | # Plot all ROC curves
154 | plt.figure()
155 | plt.plot(fpr["micro"], tpr["micro"],
156 | label='micro-average ROC curve (area = {0:0.2f})'
157 | ''.format(roc_auc["micro"]),
158 | color='deeppink', linestyle=':', linewidth=4)
159 |
160 | plt.plot(fpr["macro"], tpr["macro"],
161 | label='macro-average ROC curve (area = {0:0.2f})'
162 | ''.format(roc_auc["macro"]),
163 | color='navy', linestyle=':', linewidth=4)
164 |
165 | colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
166 | for i, color in zip(range(n_classes), colors):
167 | plt.plot(fpr[i], tpr[i], color=color, lw=lw,
168 | label='ROC curve of class {0} (area = {1:0.2f})'
169 | ''.format(i, roc_auc[i]))
170 |
171 | plt.plot([0, 1], [0, 1], 'k--', lw=lw)
172 | plt.xlim([0.0, 1.0])
173 | plt.ylim([0.0, 1.05])
174 | plt.xlabel('False Positive Rate')
175 | plt.ylabel('True Positive Rate')
176 | plt.title('Some extension of Receiver operating characteristic to multi-class')
177 | plt.legend(loc="lower right")
178 | plt.show()
179 |
180 |
181 | # In[ ]:
182 |
183 |
184 |
185 |
186 |
--------------------------------------------------------------------------------
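
One detail worth making explicit: for single-label multiclass data, micro-averaged precision, recall, and F1 all reduce to plain accuracy, because every sample contributes exactly one prediction, so total TP equals the number of correct predictions and total TP+FP = total TP+FN = number of samples. A quick check, assuming the same y_true/y_pred as the first cell:

    from sklearn import metrics

    y_pred = [0, 2, 1, 3, 9, 9, 8, 5, 8]
    y_true = [0, 1, 2, 3, 2, 6, 3, 5, 9]

    acc = metrics.accuracy_score(y_true, y_pred)
    print(acc)
    print(metrics.precision_score(y_true, y_pred, average='micro'))  # same value as acc
    print(metrics.recall_score(y_true, y_pred, average='micro'))     # same value as acc
    print(metrics.f1_score(y_true, y_pred, average='micro'))         # same value as acc
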
/Part2/.idea/ (Part2.iml, misc.xml, modules.xml, vcs.xml, workspace.xml):
--------------------------------------------------------------------------------
PyCharm IDE configuration files; contents omitted.
--------------------------------------------------------------------------------
/Part2/LinearDiscriminantAnalysis_LDA.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "将样例投影到一条直线上,使得同类的样例投影点尽可能小,不同类投影点尽可能远离"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "import sys\n",
18 | "import numpy as np\n",
19 | "import operator\n",
20 | "import matplotlib.pyplot as plt\n",
21 | "\n",
22 | "%matplotlib inline"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "group1 = np.random.random((8,2))*5+20\n",
32 | "group2 = np.random.random((8,2))*5+2"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 4,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "x1 = group1\n",
42 | "y1 = np.ones((8,1))\n",
43 | "x0 = group2\n",
44 | "y0 = np.zeros((8,1))"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 6,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/plain": [
55 | ""
56 | ]
57 | },
58 | "execution_count": 6,
59 | "metadata": {},
60 | "output_type": "execute_result"
61 | },
62 | {
63 | "data": {
64 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAADlNJREFUeJzt3V+IXOd9xvHnEdZFZBshWWtXGO8ONcG0VCCHxRRcglKRkBiC7UIKZhNUCN0EbIhDLmK8F3EvFkyIHffKdI1NFDI1BOzUTmvaGpHiOhSHkVEtBVF00d0lySJt7CIrKJDY+vVizsbr9czOnz0z5+xvvh9YZvbsGZ2fD4fH77znfd/jiBAAIJc9VRcAACgf4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJDQdb12sH2bpO9L+iNJ1yQtRcTf235M0t9KWi92fTQiXtnu3zp06FA0Go0dFQwAk+b06dO/joipQT7TM9wlvSfpGxHxpu0bJZ22/Wrxt+9GxHf6PVij0VCr1RqkPgCYeLZXBv1Mz3CPiDVJa8X7K7bPS7p18PIAAOMyUJ+77YakOyW9UWx6yPZbtp+zfaDk2gAAQ+o73G3fIOkFSQ9HxLuSnpZ0u6Sjarfsn+jyuXnbLdut9fX1TrsAAErWV7jb3qt2sDcj4kVJioiLEfF+RFyT9Iykuzp9NiKWImI2Imanpga6HwAAGFLPcLdtSc9KOh8RT27afnjTbvdLOld+eQCAYfTTcr9b0pck/aXtM8XPPZK+bfus7bckfUrS10dZKABIkppNqdGQ9uxpvzabVVdUS/2Mlnldkjv8adsx7QBQimZTWliQVlelgwelK1ek3/2u/beVFWl+vv1+bq66GmuIGaoA6qvZbIf3yooUIb399gfBvuHq1Xb440MIdwD1tbDQDu9eVldHX8suQ7gDqK9+Q3t6erR17EKEO4D66ie09+2TFhdHX8suQ7gDqK/FxXZ4b7Z3r3TTTZItzcxIS0vcTO2gn4XDAKAaG6G9MVpmerod+IR5T4Q7gHqbmyPMh0C3DAAkRLgDyIdZrHTLAEhmY+LTxvj4CZ3FSssdQC6dJj5N4CxWwh1ALt0mPk3YLFbCHUA9lNVP3m3i04TNYiXcAVRv6wJhG/3kwwR8p4lPEziLlXAHUL2y+8k/9rEP3t90U3WzWCsctUO4A6jeoP3k3UJz4xvA229/sO9vf1tmpf0r89vIEBwRYzmQJM3Ozkar1Rrb8QDsEo1GO/y2mpmRlpc/vG3rUEep3e2ytNRu6ff774zaIP9NPdg+HRGzg3yGljuA6g3ST75dF06dRspUXAvhDqB6c3PtlvfMTO/VHrcLzTqNlKm4FsIdQD3MzbW7K65da792uwG6XWjWaaRMxbUQ7gB2l+1Cc5BvAKNWcS3cUAWw+zSbE7XG+zA3VFk4DMDuwxrvPdEtAwAJEe4AkBDhDgAJEe4AkBDhDgAJEe4AkBDhDgAJEe4AkBDhDgAJEe4AkBDhDgAJEe4AJkeFzzQdNxYOAzAZtj6eb+OZplLKRch6ttxt32b7J7bP2/657a8V2w/aftX2heL1wOjLBYAhbfd4voT66ZZ5T9I3IuJPJP25pAdt/6mkRySdioiPSzpV/A4A9VSn56uOQc9wj4i1iHizeH9F0nlJt0q6V9LJYreTku4bVZEAsGN1er7qGAx0Q9V2Q9Kdkt6QdEtErEnt/wFIurns4gCgNHV6vuoY9B3utm+Q9IKkhyPi3QE+N2+7Zbu1vr4+TI0AsHN1er7qGPT1DFXbeyX9s6R/i4gni23/I+lYRKzZPizpPyLiju3+HZ6hCgCDG+YZqv2MlrGkZyWd3wj2wsuSThTvT0h6aZADAwBGp59x7ndL+pKks7bPFNselfS4pB/a/rKkVUlfGE2JAIBB9Qz3iHhdkrv8+Xi55QAAysDyAwCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQEOEOAAkR7gCQUM9wt/2c7Uu2z23a9pjtX9o+U/zcM9oyAQCD6Kfl/j1Jn+2w/bsRcbT4eaXcsgAAO9Ez3CPiNUnvjKEWAEBJdtLn/pDtt4pumwPddrI9b7tlu7W+vr6DwwEA+jVsuD8t6XZJRyWtSXqi244RsRQRsxExOzU1NeThAACDGCrcI+JiRLwfEdckPSPprnLLAgDsxFDhbvvwpl/vl3Su274AgPG7rtcOtp+XdEzSIdu/kPQtScdsH5UUkpYlfWWENQIABtQz3CPigQ6bnx1BLQCAkjBDFQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBICHCHQASItwBIKGe4W77OduXbJ/btO2g7VdtXyheD4y2TADAIPppuX9P0me3bHtE0qmI+LikU8XvAICa6BnuEfGapHe2bL5X0sni/UlJ95VcFwBgB4btc78lItYkqXi9uduOtudtt2y31tfXhzwcAGAQI7+hGhFLETEbEbNTU1OjPhwAQMOH+0XbhyWpeL1UXkkAgJ0aNtxflnSieH9C0kvllAMAKEM/QyGfl/Rfku6w/QvbX5b0uKRP274g6dPF7wCAmriu1w4R8UCXPx0vuRYAQEmYoQoACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuAJAQ4Q4ACRHuI9A821TjqYb2/N0eNZ5qqHm2WXVJACbMdVUXkE3zbFPzP57X1d9flSStXF7R/I/nJUlzR+aqLA3ABKHlXrKFUwt/CPYNV39/VQunFiqqCMAkItwH0E93y+rl1Y6f7bYdAEaBcO/TRnfLyuUVheIP3S1bA356/3THz3fbDgCjQLj3qd/ulsXji9q3d9+Htu3bu0+LxxdHXiMAbCDc+9Rvd8vckTktfX5JM/tnZFkz+2e09PklbqYCGCtGy/Rpev+0Vi6vdNy+1dyROcIcQKVoufeJ7hYAuwnh3ie6WwDsJo6IsR1sdnY2Wq3W2I4HABnYPh0Rs4N8hpY7ACREuANAQjsaLWN7WdIVSe9Lem/Qrw0AgNEoo+X+qYg4SrB3xyqRAMaNce4j1m2VyJ+u/lSvXHhFq5dXNb1/WovHFxl5A6A0OxotY/t/Jf2fpJD0DxGx
tN3+kzhapvFUo+PkJ8sKfXDu9+3dx9BKAB1VMVrm7oj4hKTPSXrQ9ic7FDVvu2W7tb6+vsPD7T7dli3YHOwSywIDKNeOwj0iflW8XpL0I0l3ddhnKSJmI2J2ampqJ4fblQZZDZJlgQGUZehwt3297Rs33kv6jKRzZRWWRadlCyx33JdlgQGUZSct91skvW77vyX9TNK/RMS/llNWHp2WLfjq7FdZpwbASLH8QEWaZ5taOLXAaBkAPQ1zQ5VwB4CaY20ZAIAkwr1UzEQFUBfMUC1Jt5mokuhLBzB2tNxL0u0B2l988Yu04gGMHeFeku0mIG204gl4AONCuJek1wQklhcAME6Ee0k6zUTdiuUFAIwL4V6SzTNRu2F5AQDjQriXaO7InJYfXtYP/uoHLC8AoFKE+wh0Wk+GtdoBjBPLDwBAzbH8wJCYWQogm4mfocrMUgAZ7YqW+yhb1t1mljImHcBuVvuW+6hb1t3GnjMmHcBuVvuW+6hb1t3GnjMmHcBuVvtwH3XLutPMUsakA9jtah/uo25ZMyYdQEa173NfPL74oT53qfyW9dyROcIcQCq1b7nTsgaAwTFDFQBqjhmqAABJhDsApES4A0BChDsAJES4A0BChDsAJDTWoZC21yWtjO2A1Tgk6ddVF1EznJPOOC8fxTnp7I6IuHGQD4x1hmpETI3zeFWw3Rp0PGp2nJPOOC8fxTnpzPbAE4TolgGAhAh3AEiIcC/fUtUF1BDnpDPOy0dxTjob+LyM9YYqAGA8aLkDQEKEe4lsL9s+a/vMMHe3M7D9nO1Lts9t2nbQ9qu2LxSvB6qssQpdzstjtn9ZXC9nbN9TZY3jZvs22z+xfd72z21/rdg+sdfLNudk4GuFbpkS2V6WNBsREztO1/YnJf1G0vcj4s+Kbd+W9E5EPG77EUkHIuKbVdY5bl3Oy2OSfhMR36mytqrYPizpcES8aftGSacl3SfpbzSh18s25+SvNeC1QssdpYqI1yS9s2XzvZJOFu9Pqn2xTpQu52WiRcRaRLxZvL8i6bykWzXB18s252RghHu5QtK/2z5te77qYmrklohYk9oXr6SbK66nTh6y/VbRbTMx3Q9b2W5IulPSG+J6kfSRcyINeK0Q7uW6OyI+Ielzkh4svooD3Twt6XZJRyWtSXqi2nKqYfsGSS9Iejgi3q26njrocE4GvlYI9xJFxK+K10uSfiTprmorqo2LRV/iRp/ipYrrqYWIuBgR70fENUnPaAKvF9t71Q6xZkS8WGye6Oul0zkZ5loh3Eti+/riBohsXy/pM5LObf+pifGypBPF+xOSXqqwltrYCLDC/Zqw68W2JT0r6XxEPLnpTxN7vXQ7J8NcK4yWKYntP1a7tS61F2T7x4hYrLCkSth+XtIxtVf3uyjpW5L+SdIPJU1LWpX0hYiYqJuLXc7LMbW/ZoekZUlf2ehrngS2/0LSf0o6K+lasflRtfuYJ/J62eacPKABrxXCHQASolsGABIi3AEgIcIdABIi3AEgIcIdABIi3AEgIcIdABIi3AEgof8HE18SdCd26yQAAAAASUVORK5CYII=\n",
65 | "text/plain": [
66 | ""
67 | ]
68 | },
69 | "metadata": {
70 | "needs_background": "light"
71 | },
72 | "output_type": "display_data"
73 | }
74 | ],
75 | "source": [
76 | "plt.scatter(x1[:,0],x1[:,1],c = 'r')\n",
77 | "plt.scatter(x0[:,0],x0[:,1],c = 'g')"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 7,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "mean1 = np.array([np.mean(x1[:,0]),np.mean(x1[:,1])])\n",
87 | "mean0 = np.array([np.mean(x0[:,0]),np.mean(x0[:,1])])\n",
88 | "from numpy import mat\n",
89 | "m1 = np.shape(x1)[0]\n",
90 | "sw = np.zeros(shape=(2,2))\n",
91 | "for i in range(m1):\n",
92 | " xsmean = mat(x1[i,:]-mean1)\n",
93 | " sw+=xsmean.transpose()*xsmean\n",
94 | "m0 = np.shape(x0)[0]\n",
95 | "for i in range(m0):\n",
96 | " xsmean = mat(x0[i,:]-mean0)\n",
97 | " sw+=xsmean.transpose()*xsmean\n",
98 | "w = (mean0-mean1)*(mat(sw).I)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 9,
104 | "metadata": {},
105 | "outputs": [
106 | {
107 | "data": {
108 | "text/plain": [
109 | "matrix([[-0.70025326, -0.59358341]])"
110 | ]
111 | },
112 | "execution_count": 9,
113 | "metadata": {},
114 | "output_type": "execute_result"
115 | }
116 | ],
117 | "source": [
118 | "w"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 11,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/plain": [
129 | "[]"
130 | ]
131 | },
132 | "execution_count": 11,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | },
136 | {
137 | "data": {
138 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD8CAYAAAB0IB+mAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAHBdJREFUeJzt3XlwnPd93/H3F+B9CLxAArsiCB0UD/FY2LRsy7YsWRetg9i0taeuZkLHdtgkdh130pnY4bRx2rKxm9ajtJOkVcaeqiltNx0nC0qkLYm05VOSRRnLmxQpiaS4CxLgBd7Esd/+8SwkkFoQILH383nNeBZ4drHP7/GOPrv8Pb/9PObuiIhI9asp9QBERKQ4FPgiIiGhwBcRCQkFvohISCjwRURCQoEvIhISCnwRkZBQ4IuIhIQCX0QkJMaUegCDzZo1y5ubm0s9DBGRivLaa68dd/f64R5XVoHf3NzM1q1bSz0MEZGKYmaHRvI4TemIiISEAl9EJCQU+CIiIaHAFxEJCQW+iEhIKPBFpDKtXw/NzVBTE9yuX1/qEZU9Bb6IVIbBAT9rFnzuc3DoELgHt2vWKPSHocAXkfK3fn0Q6AMBf+IE9PRc+ZgLF2Dt2tKMr0Io8EWk/K1dGwT6cA4fLvxYKpgCX0TK30iDvKkpuNX8fk4KfBEpfwNBfi2TJsG6de+d/tH8/jsU+CJSfq7+hP7II0GgDzZ2LMycCWYwbx489RQ88UTu6R/N7wMKfBEpN7k+oT/9NKxeHQT7QMB/4QswZcp7//7QED1ixZjfL/OppLJqyxQRGfIT+qZNcPBg8PvAm8LA4wambX75y+ANwf29zzuSaaHRGGpMEPzLowyY5/o/pkRWrFjhqkcWCbmamtyBbQaZTPBzc3PuT/K1tdDfn/tv/+7vChu8Q41p3rx336gKxMxec/cVwz1OUzoiUl6G+iQ+ePtQ0zO5wh6CN5BCf8oeakxltFRUgS8i5WXduveeoB1YgTNgqDeF2trc2+fNy8/YrmUkb1QlpsAXkfLyxBPBipvBJ2gHVuAMGOpNYc2a4d8sCmUkb1QlpsAXkfLzxBPBvHcmE9xePR0z1JvCX//18G8WhRxzqfY9QjppKyJS4Yp20tbM5prZT8xsj5ntMrM/zG6fYWYvmNn+7O300e5LROSGlfka+WLIx5ROH/BH7r4I+BDwRTNbDHwV2OLu84Et2d9FRIpPdQtAHgLf3Tvc/TfZn88Ce4Ao0Ao8nX3Y00B8tPsSEbkhqlsA8nzS1syagRbgFWCOu3dA8KYAzM7nvkRERqwC1sgXQ94C38ymAD8AvuLuZ67j79aY2VYz29rV1ZWv4YiIvKsC1sgXQ14C38zGEoT9enf/h+zmY2bWmL2/EejM9bfu/pS7r3D3FfX19fkYjojIlSpgjXwx5GOVjgHfBva4+7cG3bUBWJ39eTXQNtp9iYjckApYI18Mo16Hb2YfBX4O7ACyzUb8CcE8/t8DTcBh4FPufvJaz6V1+CIi12+k6/BHXY/s7r8AbIi77x/t84uISH6oWkFEJCQU+CIiIaHAFxEJCQW+iEhIKPBFREJCgS8iEhIKfBGRkFDgi4iEhAJfRCQkFPgiIiGhwBcRCQkFvohISCjwRURCQoEvIhISCnwRkZBQ4IuIhIQCX0QkJBT4IiIhocAXEQkJBb6ISEgo8EVEQkKBLyISEgp8EZGQUOCLiISEAl9EJCQU+CIiIaHAFxEJCQW+iEhI5CXwzew7ZtZpZjsHbZthZi+Y2f7s7fR87EtERG5Mvj7h/y9g5VXbvgpscff5wJbs7yIiUiJ5CXx3/xlw8qrNrcDT2Z+fBuL52JeIiNyYQs7hz3H3DoDs7ewC7ktERIZR8pO2ZrbGzLaa2daurq5SD0dEpGoVMvCPmVkjQPa2M9eD3P0pd1/h7ivq6+sLOBwRkXArZOBvAFZnf14NtBVwXyIiMox8Lcv8HvASsMDMjpjZ54FvAA+a2X7gwezvIiJSImPy8STu/pkh7ro/H88vIiKjV/KTtiIiUhwKfBGRkFDgi4iEhAJfRCQkFPgiIiGhwK9w63esp/nJZmr+rIbmJ5tZv2N9qYckImVKgV/Ghgvz9TvWs+aZNRzqPoTjHOo+xJpn1ij0RSQnBX6ZGkmYr92ylgu9F674uwu9F1i7ZW2xhysiFUCBX6ZGEuaHuw/n/NuhtotIuCnwy9RIwryprinnY4baLiLhpsAvUyMJ83X3r2PS2ElX3D9p7CTW3b+uoGMTkcqkwC9TIwnzJ5Y+wVOPP8W8unkYxry6eaxevpq1W9a+c6L3Dzb+gVbxiAgA5u6lHsM7VqxY4Vu3bi31MMrG+h3rWbtlLYe7D9NU18S6+9fxxNInrvn4Nc+sec/c/2CTxk7iqcefuubziEhlMbPX3H3FsI9T4FeP5iebOdR9aNjHzaubx8GvHCz8gESkKEYa+JrSqSIjXZ2jVTwi4ZSXPnwpvsHTPTMmzgDAGdm/1rSKRyScFPgV6Oq5+hMXT4z4b7WKRyS8NKVTgXJ9KSuXeXXz+P0Vv3/FKh6dsBUJL33Cr0AjmYM3TCdmReQK+oRfQjfadDmSOXjN04vI1RT4I1CICuLRNF3m+lLWYJqnF5FcFPjDKFQF8WiaLq/+hu3MiTOZOXGm5ulF5Jr0xathDPVlptF+eanmz2pyLqM0jMyfZm74eUUkfPTFqzwpVAWxmi5FpNgU+MMoVDCr6VJEik2BP4xCBXOupkvNvYtIIWkOfwSut7VSRKSY1JYpIhISZXPS1sxWmtk+MztgZl8t9P5ERCS3gga+mdUCfwV8ElgMfMbMFhdynyIikluhP+HfBRxw9zfdvQf4PtCa75309mc4db4n308rIlJVCh34UeDtQb8fyW7Lqxf3dXHXf9rMF55+lWe2pbnY05/vXYiIVLxCt2Vajm1XnCU2szXAGoCmphtb23777Cl89u5mNmxLs3lPJ5PH1fLwkgbisSh33zaTMbVafSoiUtBVOmb2YeDr7v5w9vevAbj7n+d6/GhX6fRnnFfePEEimeKHO45y9nIfs6aM5/HljcRjUZbdXIdZrvcgEZHKVRbLMs1sDPA6cD+QAl4F/oW778r1+Hwuy7zU289P9naSSKb4yd4uevoz3DprMqtiEeKxKM2zJudlPyIipVYWgZ8dyCPAk0At8B13H/IrqoVah999oZcf7uwgkUzxylsncYflc6cRj0V4bFmE+qnj875PEZFiKZvAvx7F+OJVR/dFNiTTJJJp9nScobbG+Mjts4jHIjx8ZwOTx+siYCJSWRT4I/D6sbMk2lO0JdOkTl9kwtgaHlzcQDwW4Z476hmrk70iUgEU+Nchk3FeO3yKRHuKjTs6OH2hl+mTxvLosuBk7/vnTdfJXhEpWwr8G9TTl+Fnr3eRSKbYvOcYl3oz3Dx9Iq3Zk73z50wt6fhERK6mwM+Dc5f7eG7nURLJFL88cJyMw+LGm4i3RFi1PEpD3YRSD1FERIGfb51nL/Hstg7akim2HenGDD5
0y0ziLRFWLmmkbuLYUg9RREJKgV9Ab3adoy2Zpi2Z4uCJC4wbU8MnFswm3hLhvoWzGT+mttRDFJEQUeAXgbuz7Ug3ifYUz25Pc/xcD1MnjOGRJY20tkT40C0zqanRyV4RKSwFfpH19Wf45RsnaGtP8dyuo5zv6afhpgmsikVojUVY3HiTVvqISEEo8EvoYk8/L+w5Rlt7ip++3kVfxpk/ewrxliirlkeYO2PS8E8iIjJCCvwycfJ8Dxt3dNDWnmLroVMAfKB5Oq2xKI8ubWT65HElHqGIVDoFfhl6++QFNmxLk2hPsb/zHGNqjHsX1LMqFuXBRXOYOE4ne0Xk+inwy5i7s7vjDG3JNBuSaY6euRR0+N/ZQGtLlI+ow19EroMCv0L0Z5xX3jpBW3uaTTs7OHsp6PB/bFkj8ZYoy9XhLyLDUOBXoEu9/by4r5NEe5of7+2kpz/DLbMms2p5hHhLlFvU4S8iOSjwK1z3xV5+tLODRHual986EXT431xHvCWqDn8RuYICv4p0dF/kmW1pEu1pdl/V4f/QnQ1MUYe/SKgp8KvU/mNnSSSDDv8jp9ThLyIK/Krn7rx26BSJZIqN2zs4NajDvzUW5f1N01XrIBISCvwQ6enL8PP9XSSSaV7YfZRLvRmi07Id/i1R7lCHv0hVU+CH1LnLfTy/6yiJZJpf7O8i47Co8SbisQirYhEa6yaWeogikmcKfKHr7GWe3R5csH3b26fV4S9SpRT4coW3jp+nLXuy963j5xlXW8MnFgYd/vcumM2Esap1EKlUCnzJyd3ZfqSbRDLFM9s6OH7usjr8RSqcAl+G1def4VdvnCCRTPHcTnX4i1QqBb5cl4s9/Wzec4y2ZIoX96nDX6SSKPDlhp0a6PBPpnj1YNDh//5504nHIjy6LMIMdfiLlBUFvuTFQId/WzLF68eCDv+P31FPa4s6/EXKhQJf8srd2dNxlrZkig3b0nR0q8NfpFwUJfDN7FPA14FFwF3uvnXQfV8DPg/0A1929+eGez4FfmXIZJxX3jpJWzLFph0dnLnUx6wp43hsWUQd/iIlUKzAXwRkgP8J/JuBwDezxcD3gLuACLAZuMPd+6/1fAr8ynO5r5+f7O2iLZliy95OevoyNM+cRGssqg5/kSIZaeCPqlfX3fdkd3b1Xa3A9939MvCWmR0gCP+XRrM/KT/jx9SyckkDK5c00H2xl+d2HiWRTPHffryfv9yyn+U319Eai/LY8kZmT51Q6uGKhFqhitSjwMuDfj+S3SZVrG7iWD79gbl8+gNzOdp9KejwT6b498/u5j9u3J3t8I/y8BJ1+IuUwrD/1ZnZZqAhx11r3b1tqD/LsS3n3JGZrQHWADQ1NQ03HKkQDXUT+N17buV377mVA51nSbQH4f9H/28baxM7eGDRHOKxKPfcUc+4MTrZK1IMeVmlY2YvcuUc/tcA3P3Ps78/B3zd3a85paM5/Orm7vzm8CkS7Wme3Z7m1IVepk0ay6NLgwu2q8Nf5MYUdVlmjsC/E/gu75603QLM10lbGdDbn+3wb0/zvDr8RUalWKt0fgv470A9cBpIuvvD2fvWAp8D+oCvuPsPh3s+BX44nb/cx/O7j5JoT/OLA8fpz7g6/EWug754JRWp6+xlNmY7/JPZDv8P3jKDeCzKJ5eqw18kFwW+VLyDx8/TlgxqHd7Mdvjft7CeeCzKfQvV4S8yQIEvVcPd2ZHqJtGe5pntabrOBh3+n1zSQDwW5YO3zqRWJ3slxBT4UpX6+jO89OYJEu1pntt1lHOX+5hz03hWLY/QGotyZ0Qd/hI+Cnypepd6gw7/RHuan77eSW+/c/vsKcRjQfirw1/CQoEvoXLqfA+bdnbQ1p7m1wdPAurwl/BQ4EtoHTmV7fBvT7Pv2FnG1Bj33FFPayzCg4vnMGmcah2kuijwRYA9HWdIJFNsSAYd/pMGOvxjET56+yx1+EtVUOCLDJLJOL8+GHT4b9x+ZYd/ayxCbO40neyViqXAFxnC5b5+XtwXdPhv3vNuh/+qWJR4LMKt9VNKPUSR66LAFxmBM5d6+dHOo7QlU/zqjRO4w7Jsh//j6vCXCqHAF7lOx8682+G/M3WGGoOP3D6L1liUh++cw9QJqnWQ8qTAFxmFA51naUsG4f/2yYuMH1PDA4uDDv+Pq8NfyowCXyQPgg7/07QlUzy7vYOT53uYNmksjyxtJB6LsmKeOvyl9BT4InnW25/hF/uPk0imeH7XMS729hOdNpFVsQjxWJQFDerwl9JQ4IsU0PnLfbyw+xiJZIqf7w86/Bc2TCXeEmXV8giRaerwl+JR4IsUyfFzl9m4vYNEMkX74aDD/67mGcRbojyypJG6STrZK4WlwBcpgVwd/vcuqCfeEuUT6vCXAlHgi5SQu7Mzla112Jbt8B8/hpVLGoi3RPmQOvwljxT4ImWiP+O89MYJEskUP9r5bof/48uCC7arw19GS4EvUoYu9fazZU8niWSKF/cFHf631U8mHovSGovSNFMd/nL9FPgiZe70hR427ThKIpni128FHf7va5pGvCXKo0sbmTllfIlHKJVCgS9SQVKnL7Ihe7J379Ggw/9j82cRb4mqw1+GpcAXqVADHf7PJNOksx3+Dy2eQ2tLlI+pw19yUOCLVLhcHf4zJ4/jsWWNtLZEaVGHv2Qp8EWqyOW+fn66r4u2ZJrNe45xuS/DvJmTaF0eobUlym3q8A81Bb5IlTr7Tod/ml+9cZyMw9JoHa2xCKuWR5h9kzr8w0aBLxICnWcuBRdsT6bZkeqmxuDu22bRGouwckmDOvxDQoEvEjIHOs+xIZkikUxz+OSFoMN/0RxaYxHuXTBbHf5VrCiBb2Z/ATwO9ABvAL/j7qez930N+DzQD3zZ3Z8b7vkU+CKjl6vDv27iQId/hA80z1CHf5UpVuA/BPzY3fvM7JsA7v7HZrYY+B5wFxABNgN3uHv/tZ5PgS+SX0N1+D++PEK8JcLChptKPUTJg5EG/qi+zeHuzw/69WXgn2V/bgW+7+6XgbfM7ABB+L80mv2JyPUZW1vDfQtnc9/C2Vd0+P/tz9/kf/z0DRY2TKU1FmVVLEJUHf5VL59f3/sc8H+zP0cJ3gAGHMluew8zWwOsAWhqasrjcERksMnjxxBviRJvib7T4d+WTPHNH+3lmz/ay123zCAei/LI0gamTRpX6uFKAQw7pWNmm4GGHHetdfe27GPWAiuAf+LubmZ/Bbzk7v8ne/+3gU3u/oNr7UtTOiLFd/jEBdqSKRLJFG90nWdsrXHvgtnEY1HuX6QO/0qQtykdd39gmB2tBh4D7vd33z2OAHMHPexmID3cvkSk+JpmTuJf3T+fL33idnalz/CP7Sme2Zbmhd3HmDp+DA8vaSAei/Lh29ThX+lGe9J2JfAt4OPu3jVo+53Ad3n3pO0WYL5O2opUhlwd/rOnjg9O9saiLImqw7+cFGuVzgFgPHAiu+lld/+97H1rCeb1+4CvuPsPh3s+Bb5I+cnV4X9rtsM/rg7/sqAvXolI3uXq8G9pmkY8FuWxZerwLxUFvogU1N
Ud/rUDHf6xKA/dqQ7/YlLgi0jR7D16hkR7mg3JFOnuS0wcW8tDd84hHovy0fmzGKsO/4JS4ItI0WUyzqsHT5JIptm0o4Pui73MnDyOR5c10hqL8r4mdfgXggJfREqqpy/Di/s6r+jwb5oxidZYhNZYlNtnq8M/XxT4IlI2cnX4L4neRDwW5fHlEeaow39UFPgiUpZydfh/+LaZtMairFzSwE3q8L9uCnwRKXtvdJ2jrf3dDv9xY2p4YNFsWmNR7l1Qz/gxqnUYCQW+iFQMd6f97dO0tQcd/ife6fBvoDUW5S51+F+TAl9EKlJvf4ZfHDhOW3uK53cf40JPP5G6CTweC2odFjWqw/9qCnwRqXgXerId/u0pfrb/OP0ZZ8GcqbS2BCt91OEfUOCLSFU5ce4yG3d0kGhP8ZvDpwG4q3kGrS0RHl3aGOoOfwW+iFStXB3+H79jNvGWCA8smhO6Dn8FvohUPXdnV/oMifYUG7al6Tx7mSnjx/DwnQ3EWyLcfdusUHT4K/BFJFT6M87Lb54g0R50+J+93Ef91PE8viy4YPvSaF3V1joo8EUktC719vPjvZ0k2lO8uK+Lnv4Mt9ZPpnV5lHhLhHkzJ5d6iHmlwBcRAbov9LJpZ3Cy95Vsh39s7jTisQiPLY8wqwo6/BX4IiJXSZ++yIZtaRLt73b4f/T2WcRbIjy0uIHJ4yuzw1+BLyJyDfuOniWRTLEhmSZ1+iITx9by4OI5xFsifGx+fUV1+CvwRURGIJNxth46RSKZYtOODk5f6GXG5HE8urSReEuE9zVNL/uTvQp8EZHr1NOX4aevd5FIpti8u3I6/BX4IiKjcPZSL8/tOkZbMsUvD7zb4d+6PMqqWHl1+CvwRUTypPPMJZ7Z3kFbMsX2I92YwYdvnUk8FmXl0tJ3+CvwRUQK4I2uc7Ql07QlUxw6EXT4378w6PC/b2FpOvwV+CIiBeTuJN8+TVsyzbPb0xw/18NNE8bwyNLggu0fvKV4Hf4KfBGRIukb6PBPpnlu11Eu9PTTWDeBVcsjxFsK3+GvwBcRKYGBDv+2ZJqfvd5F36AO/1XLI9w8fVLe96nAFxEpsZPne9i4PU0imea1Q6eAwnT4K/BFRMrI2ycHOvzTHOg8906Hf2ss6PCfOO7GT/YWJfDN7D8ArUAG6AQ+6+5pC76W9pfAI8CF7PbfDPd8CnwRqXYDHf5tyaDD/9iZy0weV8u/fvAOvvCxW2/oOUca+KNtCvoLd/+32R1+Gfh3wO8BnwTmZ//3QeBvsrciIqFmZiyJ1rEkWsdXP7mIV948QSKZoqGu8F/kGlXgu/uZQb9OBgb+udAK/G8P/vnwsplNM7NGd+8Yzf5ERKpJbY1x9+2zuPv2WUXZ36i7QM1sHfDbQDdwX3ZzFHh70MOOZLe9J/DNbA2wBqCpqWm0wxERkSEM2/9pZpvNbGeO/7UCuPtad58LrAe+NPBnOZ4q58kCd3/K3Ve4+4r6+vobPQ4RERnGsJ/w3f2BET7Xd4GNwJ8SfKKfO+i+m4H0dY9ORETyZlQN/2Y2f9Cvq4C92Z83AL9tgQ8B3Zq/FxEprdHO4X/DzBYQLMs8RLBCB2ATwZLMAwTLMn9nlPsREZFRGu0qnX86xHYHvjia5xYRkfyqnIs2iojIqCjwRURCoqy6dMysi+BcwI2YBRzP43AqRRiPW8ccDjrmkZvn7sOuay+rwB8NM9s6ki6JahPG49Yxh4OOOf80pSMiEhIKfBGRkKimwH+q1AMokTAet445HHTMeVY1c/giInJt1fQJX0RErqEqAt/MVprZPjM7YGZfLfV4isHMDprZDjNLmllVXibMzL5jZp1mtnPQthlm9oKZ7c/eTi/lGAthiOP+upmlsq930sweKeUY88nM5prZT8xsj5ntMrM/zG6v2tf6Gsdc0Ne54qd0zKwWeB14kKCl81XgM+6+u6QDKzAzOwiscPeqXadsZvcA5wguprMku+0/Ayfd/RvZN/fp7v7HpRxnvg1x3F8Hzrn7fynl2ArBzBqBRnf/jZlNBV4D4sBnqdLX+hrH/GkK+DpXwyf8u4AD7v6mu/cA3ye44pZUOHf/GXDyqs2twNPZn58m+I+kqgxx3FXL3TsGrnnt7meBPQQXTKra1/oax1xQ1RD4Q11dq9o58LyZvZa9alhYzBmo2s7ezi7xeIrpS2a2PTvlUzXTG4OZWTPQArxCSF7rq44ZCvg6V0Pgj/jqWlXmI+7+PoILxn8xOw0g1etvgNuAGMGlQv9raYeTf2Y2BfgB8JWrrpddtXIcc0Ff52oI/FBeXcvd09nbTuAfCaa2wuBYdv5zYB60s8TjKQp3P+bu/e6eAf6WKnu9zWwsQfCtd/d/yG6u6tc61zEX+nWuhsB/FZhvZreY2TjgnxNccatqmdnk7IkezGwy8BCw89p/VTU2AKuzP68G2ko4lqIZCL6s36KKXm8zM+DbwB53/9agu6r2tR7qmAv9Olf8Kh2A7NKlJ4Fa4Dvuvq7EQyooM7uV4FM9BBex+W41HrOZfQ+4l6BB8BjB9ZITwN8DTcBh4FPuXlUnOIc47nsJ/pnvwEHgX1bLZUPN7KPAz4EdBFfPA/gTgjntqnytr3HMn6GAr3NVBL6IiAyvGqZ0RERkBBT4IiIhocAXEQkJBb6ISEgo8EVEQkKBLyISEgp8EZGQUOCLiITE/webtX/a/L+dNwAAAABJRU5ErkJggg==\n",
139 | "text/plain": [
140 | ""
141 | ]
142 | },
143 | "metadata": {
144 | "needs_background": "light"
145 | },
146 | "output_type": "display_data"
147 | }
148 | ],
149 | "source": [
150 | "plt.scatter(x1[:,0],x1[:,1],c = 'r')\n",
151 | "plt.scatter(x0[:,0],x0[:,1],c = 'g')\n",
152 | "x = np.arange(0,25,0.1)\n",
153 | "y = np.array((-w[0,0]*x)/w[0,1])\n",
154 | "plt.plot(x,y)"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": []
163 | }
164 | ],
165 | "metadata": {
166 | "kernelspec": {
167 | "display_name": "Python 3",
168 | "language": "python",
169 | "name": "python3"
170 | },
171 | "language_info": {
172 | "codemirror_mode": {
173 | "name": "ipython",
174 | "version": 3
175 | },
176 | "file_extension": ".py",
177 | "mimetype": "text/x-python",
178 | "name": "python",
179 | "nbconvert_exporter": "python",
180 | "pygments_lexer": "ipython3",
181 | "version": "3.7.3"
182 | }
183 | },
184 | "nbformat": 4,
185 | "nbformat_minor": 2
186 | }
187 |
--------------------------------------------------------------------------------
/Part2/LinearDiscriminantAnalysis_LDA.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # Project the samples onto a line so that projections of same-class samples lie as close together as possible, while projections of different-class samples lie as far apart as possible
5 |
6 | # In[1]:
7 |
8 |
9 | import os
10 | import sys
11 | import numpy as np
12 | import operator
13 | import matplotlib.pyplot as plt
14 |
15 | get_ipython().run_line_magic('matplotlib', 'inline')
16 |
17 |
18 | # In[3]:
19 |
20 |
21 | group1 = np.random.random((8,2))*5+20
22 | group2 = np.random.random((8,2))*5+2
23 |
24 |
25 | # In[4]:
26 |
27 |
28 | x1 = group1
29 | y1 = np.ones((8,1))
30 | x0 = group2
31 | y0 = np.zeros((8,1))
32 |
33 |
34 | # In[6]:
35 |
36 |
37 | plt.scatter(x1[:,0],x1[:,1],c = 'r')
38 | plt.scatter(x0[:,0],x0[:,1],c = 'g')
39 |
40 |
41 | # In[7]:
42 |
43 |
44 | mean1 = np.array([np.mean(x1[:,0]),np.mean(x1[:,1])])
45 | mean0 = np.array([np.mean(x0[:,0]),np.mean(x0[:,1])])
46 | from numpy import mat
47 | m1 = np.shape(x1)[0]
48 | sw = np.zeros(shape=(2,2))  # within-class scatter matrix Sw
49 | for i in range(m1):
50 |     xsmean = mat(x1[i,:]-mean1)
51 |     sw += xsmean.transpose()*xsmean
52 | m0 = np.shape(x0)[0]
53 | for i in range(m0):
54 |     xsmean = mat(x0[i,:]-mean0)
55 |     sw += xsmean.transpose()*xsmean
56 | w = (mean0-mean1)*(mat(sw).I)  # projection direction: w is proportional to Sw^-1 (mu0 - mu1)
57 |
58 |
59 | # In[9]:
60 |
61 |
62 | w
63 |
64 |
65 | # In[11]:
66 |
67 |
68 | plt.scatter(x1[:,0],x1[:,1],c = 'r')
69 | plt.scatter(x0[:,0],x0[:,1],c = 'g')
70 | x = np.arange(0,25,0.1)
71 | y = np.array((-w[0,0]*x)/w[0,1])
72 | plt.plot(x,y)
73 |
74 |
75 | # In[ ]:
76 |
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
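
For comparison with the hand-rolled solution — a minimal sketch, assuming the group1/group2 arrays from above and scikit-learn. For two classes, LinearDiscriminantAnalysis learns a direction proportional to Sw^-1(mu1 - mu0), so its coefficients should agree with w up to scale and sign:

    import numpy as np
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    X = np.vstack([group1, group2])
    y = np.array([1]*8 + [0]*8)

    lda = LinearDiscriminantAnalysis()
    lda.fit(X, y)
    print(lda.coef_)  # compare against the hand-computed w (scale and sign may differ)
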
/Part2/LinearRegression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[2]:
5 |
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 | from sklearn.datasets import make_regression
11 |
12 | get_ipython().run_line_magic('matplotlib', 'inline')
13 |
14 |
15 | # Build a toy regression dataset
16 |
17 | # In[4]:
18 |
19 |
20 | x_train,y_train = make_regression(n_samples=100,noise=20,n_features=1)
21 | plt.scatter(x_train,y_train)
22 |
23 |
24 | # In[30]:
25 |
26 |
27 | # Solve by gradient descent
28 | class LinearRegression():
29 |     def __init__(self):
30 |         pass
31 |     def fit(self,x,y,lr):
32 |         x = np.insert(x,0,1,axis=1)  # prepend a bias column of ones
33 |         y = y.reshape(-1,1)
34 |         self.w = np.random.randn(x.shape[1],1)
35 |         self.lr = lr
36 | 
37 |         for _ in range(50):
38 |             y_pred = x @ self.w
39 |             mse = np.mean(0.5*(y_pred-y)**2)
40 |             grad_w = x.T@(y_pred-y)  # gradient of the summed squared error
41 |             self.w -= self.lr*grad_w
42 |             print(_,mse,self.w[0][0],self.w[1][0])
43 |     def predict(self,x):
44 |         x = np.insert(x,0,1,axis=1)
45 |         return x @ self.w
46 |
47 |
48 | # In[31]:
49 |
50 |
51 | LR = LinearRegression()
52 |
53 |
54 | # In[32]:
55 |
56 |
57 | LR.fit(x_train,y_train,0.01)
58 |
59 |
60 | # In[33]:
61 |
62 |
63 | y_pred = LR.predict(x_train)
64 | plt.scatter(x_train,y_train)
65 | plt.plot(x_train,y_pred,'r--')
66 |
67 |
68 | # In[ ]:
69 |
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
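
The learned weights can be sanity-checked against the closed-form ordinary least squares solution — a minimal sketch, assuming the x_train/y_train arrays from above:

    import numpy as np

    X = np.insert(x_train, 0, 1, axis=1)   # same bias column used in fit()
    w_closed = np.linalg.solve(X.T @ X, X.T @ y_train.reshape(-1, 1))
    print(w_closed.ravel())                # LR.w should approach this as iterations grow
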
/Part2/LogisticRegression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # Binary classification: only labels 0/1 are considered
5 |
6 | # In[2]:
7 |
8 |
9 | import numpy as np
10 | from sklearn.datasets import load_iris,make_classification
11 | import matplotlib.pyplot as plt
12 |
13 | get_ipython().run_line_magic('matplotlib', 'inline')
14 |
15 |
16 | # In[3]:
17 |
18 |
19 | sigmoid = lambda x:1./(1+np.exp(-x))
20 |
21 |
22 | # In[4]:
23 |
24 |
25 | x = np.linspace(-10,10,10000)
26 | y = sigmoid(x)
27 | plt.plot(x,y)
28 | plt.scatter(0,sigmoid(0))
29 |
30 |
31 | # In[5]:
32 |
33 |
34 | data = load_iris()
35 | x_train = data.data[data.target != 0]
36 | y_train = data.target[data.target !=0]
37 | y_train[y_train==1] = 0
38 | y_train[y_train==2] = 1
39 | y_train = y_train.reshape(-1,1)
40 | x_train[:5],y_train[:5]
41 |
42 |
43 | # In[8]:
44 |
45 |
46 | # Gradient-descent solver for logistic regression
47 | class LogisticRegression_1():
48 |     def __init__(self):
49 |         self.sigmoid = lambda x:1./(1+np.exp(-x))
50 |     def fit(self,x,y,lr):
51 |         self.w = np.random.randn(x.shape[1],1)
52 |         self.lr = lr
53 |         for _ in range(1000):
54 |             y_pred = self.sigmoid(x @ self.w)
55 |             self.w -= self.lr * x.T @ (y_pred-y)
56 |         print(np.mean(0.5*(y_pred-y)**2))
57 |     def predict(self,x):
58 |         y_pred = np.round(self.sigmoid(x @ self.w))
59 |         return y_pred
60 |
61 |
62 | # In[9]:
63 |
64 |
65 | LR = LogisticRegression_1()
66 | LR.fit(x_train,y_train,0.01)
67 |
68 |
69 | # In[10]:
70 |
71 |
72 | y_pred = LR.predict(x_train)
73 | acc = np.sum(y_train==y_pred,axis=0)/len(y_train)
74 | acc[0]
75 |
76 |
77 | # Below is a more elaborate implementation
78 |
79 | # In[1]:
80 |
81 |
82 | from math import exp
83 | import numpy as np
84 | import pandas as pd
85 | import matplotlib.pyplot as plt
86 | get_ipython().run_line_magic('matplotlib', 'inline')
87 |
88 | from sklearn.datasets import load_iris
89 | from sklearn.model_selection import train_test_split
90 |
91 |
92 | # In[2]:
93 |
94 |
95 | # data
96 | def create_data():
97 |     iris = load_iris()
98 |     df = pd.DataFrame(iris.data, columns=iris.feature_names)
99 |     df['label'] = iris.target
100 |     df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
101 |     data = np.array(df.iloc[:100, [0,1,-1]])
102 |     # print(data)
103 |     return data[:,:2], data[:,-1]
104 |
105 |
106 | # In[3]:
107 |
108 |
109 | X, y = create_data()
110 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
111 |
112 |
113 | # In[4]:
114 |
115 |
116 |
117 | class LogisticRegressionClassifier:
118 |     def __init__(self, max_iter=200, learning_rate=0.01):
119 |         self.max_iter = max_iter
120 |         self.learning_rate = learning_rate
121 | 
122 |     def sigmoid(self, x):
123 |         return 1 / (1 + exp(-x))
124 | 
125 |     def data_matrix(self, X):
126 |         data_mat = []
127 |         for d in X:
128 |             data_mat.append([1.0, *d])
129 |         return data_mat
130 | 
131 |     def fit(self, X, y):
132 |         # label = np.mat(y)
133 |         data_mat = self.data_matrix(X)  # m*n
134 |         self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)
135 | 
136 |         for iter_ in range(self.max_iter):
137 |             for i in range(len(X)):
138 |                 result = self.sigmoid(np.dot(data_mat[i], self.weights))
139 |                 error = y[i] - result
140 |                 self.weights += self.learning_rate * error * np.transpose(
141 |                     [data_mat[i]])
142 |         print('LogisticRegression Model(learning_rate={},max_iter={})'.format(
143 |             self.learning_rate, self.max_iter))
144 | 
145 |     # def f(self, x):
146 |     #     return -(self.weights[0] + self.weights[1] * x) / self.weights[2]
147 | 
148 |     def score(self, X_test, y_test):
149 |         right = 0
150 |         X_test = self.data_matrix(X_test)
151 |         for x, y in zip(X_test, y_test):
152 |             result = np.dot(x, self.weights)
153 |             if (result > 0 and y == 1) or (result < 0 and y == 0):
154 |                 right += 1
155 |         return right / len(X_test)
156 |
157 |
158 | # In[5]:
159 |
160 |
161 | lr_clf = LogisticRegressionClassifier()
162 | lr_clf.fit(X_train, y_train)
163 | lr_clf.score(X_test, y_test)
164 |
165 |
166 | # In[6]:
167 |
168 |
169 | x_points = np.arange(4, 8)
170 | y_ = -(lr_clf.weights[1]*x_points + lr_clf.weights[0])/lr_clf.weights[2]
171 | plt.plot(x_points, y_)
172 |
173 | #lr_clf.show_graph()
174 | plt.scatter(X[:50,0],X[:50,1], label='0')
175 | plt.scatter(X[50:,0],X[50:,1], label='1')
176 | plt.legend()
177 |
178 |
179 | # scikit-learn example
180 | # sklearn.linear_model.LogisticRegression
181 | # The solver parameter selects the optimization method for the logistic loss; four options are available:
182 | #
183 | # a) liblinear: uses the open-source liblinear library, which optimizes the loss by coordinate descent.
184 | # b) lbfgs: a quasi-Newton method that uses the Hessian (second-derivative matrix) of the loss to iterate.
185 | # c) newton-cg: another member of the Newton family, also iterating with the Hessian of the loss.
186 | # d) sag: stochastic average gradient descent, a gradient-descent variant that uses only a subset of the samples per iteration, suited to large datasets.
187 |
188 | # In[7]:
189 |
190 |
191 | from sklearn.linear_model import LogisticRegression
192 | clf = LogisticRegression(max_iter=200)
193 |
194 |
195 | # In[8]:
196 |
197 |
198 | clf.fit(X_train, y_train)
199 |
200 |
201 | # In[9]:
202 |
203 |
204 | print(clf.coef_, clf.intercept_)
205 |
206 |
207 | # In[10]:
208 |
209 |
210 | x_points = np.arange(4, 8)
211 | y_ = -(clf.coef_[0][0]*x_points + clf.intercept_)/clf.coef_[0][1]
212 | plt.plot(x_points, y_)
213 | 
214 | plt.plot(X[:50, 0], X[:50, 1], 'o', color='blue', label='0')
215 | plt.plot(X[50:, 0], X[50:, 1], 'o', color='orange', label='1')
216 | plt.xlabel('sepal length')
217 | plt.ylabel('sepal width')
218 | plt.legend()
219 |
220 |
221 | # In[ ]:
222 |
223 |
224 |
225 |
226 |
--------------------------------------------------------------------------------
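
To make the solver comparison in the comments concrete — a minimal sketch, assuming the X_train/y_train split from above; on this small two-feature problem all four solvers should converge to similar coefficients:

    from sklearn.linear_model import LogisticRegression

    for solver in ('liblinear', 'lbfgs', 'newton-cg', 'sag'):
        clf = LogisticRegression(solver=solver, max_iter=1000)
        clf.fit(X_train, y_train)
        print(solver, clf.coef_[0], clf.intercept_)
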
/Part3/Naive_bayes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Naive bayes\n",
10 | "# 朴素贝叶斯是基于贝叶斯定理与特征条件的独立假设的分类方法,对于给定的训练数据集,首先基于特征条件独立假设学习输入\\输出的联合概率分布,然后基于此模型,对给定的输入x,利用贝叶斯定理求后验概率最大的输出y。"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import numpy as np\n",
20 | "import pandas as pd\n",
21 | "import matplotlib.pyplot as plt\n",
22 | "%matplotlib inline\n",
23 | "\n",
24 | "from sklearn.datasets import load_iris\n",
25 | "from sklearn.model_selection import train_test_split\n",
26 | "\n",
27 | "from collections import Counter\n",
28 | "import math"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# data\n",
38 | "def create_data():\n",
39 | " iris = load_iris()\n",
40 | " df = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
41 | " df['label'] = iris.target\n",
42 | " df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']\n",
43 | " data = np.array(df.iloc[:100, :])\n",
44 | " # print(data)\n",
45 | " return data[:,:-1], data[:,-1]"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "(array([5. , 3.5, 1.6, 0.6]), 0.0)"
57 | ]
58 | },
59 | "execution_count": 3,
60 | "metadata": {},
61 | "output_type": "execute_result"
62 | }
63 | ],
64 | "source": [
65 | "X, y = create_data()\n",
66 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
67 | "X_test[0], y_test[0]"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "GaussianNB 高斯朴素贝叶斯\n",
75 | "特征的可能性被假设为高斯"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 4,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "class NaiveBayes:\n",
85 | " def __init__(self):\n",
86 | " self.model = None\n",
87 | "\n",
88 | " # 数学期望\n",
89 | " @staticmethod\n",
90 | " def mean(X):\n",
91 | " return sum(X) / float(len(X))\n",
92 | "\n",
93 | " # 标准差(方差)\n",
94 | " def stdev(self, X):\n",
95 | " avg = self.mean(X)\n",
96 | " return math.sqrt(sum([pow(x - avg, 2) for x in X]) / float(len(X)))\n",
97 | "\n",
98 | " # 概率密度函数\n",
99 | " def gaussian_probability(self, x, mean, stdev):\n",
100 | " exponent = math.exp(-(math.pow(x - mean, 2) /\n",
101 | " (2 * math.pow(stdev, 2))))\n",
102 | " return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent\n",
103 | "\n",
104 | " # 处理X_train\n",
105 | " def summarize(self, train_data):\n",
106 | " summaries = [(self.mean(i), self.stdev(i)) for i in zip(*train_data)]\n",
107 | " return summaries\n",
108 | "\n",
109 | " # 分类别求出数学期望和标准差\n",
110 | " def fit(self, X, y):\n",
111 | " labels = list(set(y))\n",
112 | " data = {label: [] for label in labels}\n",
113 | " for f, label in zip(X, y):\n",
114 | " data[label].append(f)\n",
115 | " self.model = {\n",
116 | " label: self.summarize(value)\n",
117 | " for label, value in data.items()\n",
118 | " }\n",
119 | " return 'gaussianNB train done!'\n",
120 | "\n",
121 | " # 计算概率\n",
122 | " def calculate_probabilities(self, input_data):\n",
123 | " # summaries:{0.0: [(5.0, 0.37),(3.42, 0.40)], 1.0: [(5.8, 0.449),(2.7, 0.27)]}\n",
124 | " # input_data:[1.1, 2.2]\n",
125 | " probabilities = {}\n",
126 | " for label, value in self.model.items():\n",
127 | " probabilities[label] = 1\n",
128 | " for i in range(len(value)):\n",
129 | " mean, stdev = value[i]\n",
130 | " probabilities[label] *= self.gaussian_probability(\n",
131 | " input_data[i], mean, stdev)\n",
132 | " return probabilities\n",
133 | "\n",
134 | " # 类别\n",
135 | " def predict(self, X_test):\n",
136 | " # {0.0: 2.9680340789325763e-27, 1.0: 3.5749783019849535e-26}\n",
137 | " label = sorted(\n",
138 | " self.calculate_probabilities(X_test).items(),\n",
139 | " key=lambda x: x[-1])[-1][0]\n",
140 | " return label\n",
141 | "\n",
142 | " def score(self, X_test, y_test):\n",
143 | " right = 0\n",
144 | " for X, y in zip(X_test, y_test):\n",
145 | " label = self.predict(X)\n",
146 | " if label == y:\n",
147 | " right += 1\n",
148 | "\n",
149 | " return right / float(len(X_test))"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 5,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "data": {
159 | "text/plain": [
160 | "'gaussianNB train done!'"
161 | ]
162 | },
163 | "execution_count": 5,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "model = NaiveBayes()\n",
170 | "model.fit(X_train, y_train)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 6,
176 | "metadata": {},
177 | "outputs": [
178 | {
179 | "name": "stdout",
180 | "output_type": "stream",
181 | "text": [
182 | "0.0\n"
183 | ]
184 | }
185 | ],
186 | "source": [
187 | "print(model.predict([4.4, 3.2, 1.3, 0.2]))"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 7,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "data": {
197 | "text/plain": [
198 | "1.0"
199 | ]
200 | },
201 | "execution_count": 7,
202 | "metadata": {},
203 | "output_type": "execute_result"
204 | }
205 | ],
206 | "source": [
207 | "model.score(X_test, y_test)"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 8,
213 | "metadata": {},
214 | "outputs": [
215 | {
216 | "data": {
217 | "text/plain": [
218 | "1.0"
219 | ]
220 | },
221 | "execution_count": 8,
222 | "metadata": {},
223 | "output_type": "execute_result"
224 | }
225 | ],
226 | "source": [
227 | "#使用sklearn\n",
228 | "from sklearn.naive_bayes import GaussianNB\n",
229 | "\n",
230 | "clf = GaussianNB()\n",
231 | "clf.fit(X_train, y_train)\n",
232 | "clf.score(X_test, y_test)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 10,
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "name": "stdout",
242 | "output_type": "stream",
243 | "text": [
244 | "[0.]\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "print(clf.predict([[4.4, 3.2, 1.3, 0.2]]))"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": []
258 | }
259 | ],
260 | "metadata": {
261 | "kernelspec": {
262 | "display_name": "Python 3",
263 | "language": "python",
264 | "name": "python3"
265 | },
266 | "language_info": {
267 | "codemirror_mode": {
268 | "name": "ipython",
269 | "version": 3
270 | },
271 | "file_extension": ".py",
272 | "mimetype": "text/x-python",
273 | "name": "python",
274 | "nbconvert_exporter": "python",
275 | "pygments_lexer": "ipython3",
276 | "version": "3.6.4"
277 | }
278 | },
279 | "nbformat": 4,
280 | "nbformat_minor": 2
281 | }
282 |
--------------------------------------------------------------------------------
/Part3/Naive_bayes.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[ ]:
5 |
6 |
7 | # Naive Bayes
8 | # Naive Bayes is a classification method based on Bayes' theorem and the assumption of conditional independence between features. Given a training set, it first learns the joint input/output distribution under that independence assumption; then, for a given input x, it applies Bayes' theorem and outputs the label y with the largest posterior probability.
9 |
10 |
11 | # In[1]:
12 |
13 |
14 | import numpy as np
15 | import pandas as pd
16 | import matplotlib.pyplot as plt
17 | get_ipython().run_line_magic('matplotlib', 'inline')
18 |
19 | from sklearn.datasets import load_iris
20 | from sklearn.model_selection import train_test_split
21 |
22 | from collections import Counter
23 | import math
24 |
25 |
26 | # In[2]:
27 |
28 |
29 | # data
30 | def create_data():
31 |     iris = load_iris()
32 |     df = pd.DataFrame(iris.data, columns=iris.feature_names)
33 |     df['label'] = iris.target
34 |     df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
35 |     data = np.array(df.iloc[:100, :])
36 |     # print(data)
37 |     return data[:,:-1], data[:,-1]
38 |
39 |
40 | # In[3]:
41 |
42 |
43 | X, y = create_data()
44 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
45 | X_test[0], y_test[0]
46 |
47 |
48 | # GaussianNB (Gaussian Naive Bayes)
49 | # Each feature's class-conditional likelihood is assumed to be Gaussian
50 |
51 | # In[4]:
52 |
53 |
54 | class NaiveBayes:
55 | def __init__(self):
56 | self.model = None
57 |
58 |     # Mean (expected value)
59 | @staticmethod
60 | def mean(X):
61 | return sum(X) / float(len(X))
62 |
63 |     # Standard deviation
64 | def stdev(self, X):
65 | avg = self.mean(X)
66 | return math.sqrt(sum([pow(x - avg, 2) for x in X]) / float(len(X)))
67 |
68 |     # Gaussian probability density function
69 | def gaussian_probability(self, x, mean, stdev):
70 | exponent = math.exp(-(math.pow(x - mean, 2) /
71 | (2 * math.pow(stdev, 2))))
72 | return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
73 |
74 |     # Summarize the training data into per-feature (mean, stdev) pairs
75 | def summarize(self, train_data):
76 | summaries = [(self.mean(i), self.stdev(i)) for i in zip(*train_data)]
77 | return summaries
78 |
79 |     # Compute the mean and stdev of each feature, per class
80 | def fit(self, X, y):
81 | labels = list(set(y))
82 | data = {label: [] for label in labels}
83 | for f, label in zip(X, y):
84 | data[label].append(f)
85 | self.model = {
86 | label: self.summarize(value)
87 | for label, value in data.items()
88 | }
89 | return 'gaussianNB train done!'
90 |
91 |     # Multiply the per-feature Gaussian likelihoods under each class
92 | def calculate_probabilities(self, input_data):
93 | # summaries:{0.0: [(5.0, 0.37),(3.42, 0.40)], 1.0: [(5.8, 0.449),(2.7, 0.27)]}
94 | # input_data:[1.1, 2.2]
95 | probabilities = {}
96 | for label, value in self.model.items():
97 | probabilities[label] = 1
98 | for i in range(len(value)):
99 | mean, stdev = value[i]
100 | probabilities[label] *= self.gaussian_probability(
101 | input_data[i], mean, stdev)
102 | return probabilities
103 |
104 |     # Predicted class
105 | def predict(self, X_test):
106 | # {0.0: 2.9680340789325763e-27, 1.0: 3.5749783019849535e-26}
107 | label = sorted(
108 | self.calculate_probabilities(X_test).items(),
109 | key=lambda x: x[-1])[-1][0]
110 | return label
111 |
112 | def score(self, X_test, y_test):
113 | right = 0
114 | for X, y in zip(X_test, y_test):
115 | label = self.predict(X)
116 | if label == y:
117 | right += 1
118 |
119 | return right / float(len(X_test))
120 |
121 |
122 | # In[5]:
123 |
124 |
125 | model = NaiveBayes()
126 | model.fit(X_train, y_train)
127 |
128 |
129 | # In[6]:
130 |
131 |
132 | print(model.predict([4.4, 3.2, 1.3, 0.2]))
133 |
134 |
135 | # In[7]:
136 |
137 |
138 | model.score(X_test, y_test)
139 |
140 |
141 | # In[8]:
142 |
143 |
144 | # Using sklearn
145 | from sklearn.naive_bayes import GaussianNB
146 |
147 | clf = GaussianNB()
148 | clf.fit(X_train, y_train)
149 | clf.score(X_test, y_test)
150 |
151 |
152 | # In[10]:
153 |
154 |
155 | print(clf.predict([[4.4, 3.2, 1.3, 0.2]]))
156 |
157 |
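158 | 
159 | # In[ ]:
160 | 
161 | 
162 | # A minimal log-space sketch: calculate_probabilities above multiplies raw
163 | # Gaussian likelihoods and leaves out the class prior P(y). On this balanced
164 | # iris subset the prior cancels, but with skewed classes it matters, and a
165 | # product of many small densities can underflow. `priors` is a hypothetical
166 | # {label: prior probability} dict, not an attribute of the class above.
167 | def log_posterior_predict(model, priors, input_data):
168 |     scores = {}
169 |     for label, summaries in model.model.items():
170 |         score = math.log(priors[label])
171 |         for i, (mean, stdev) in enumerate(summaries):
172 |             score += math.log(model.gaussian_probability(input_data[i], mean, stdev))
173 |         scores[label] = score
174 |     # the largest log-posterior wins, as in predict()
175 |     return max(scores, key=scores.get)
176 | 
177 | # With equal priors this agrees with model.predict, e.g.:
178 | # log_posterior_predict(model, {0.0: 0.5, 1.0: 0.5}, [4.4, 3.2, 1.3, 0.2])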
--------------------------------------------------------------------------------
/Part4/DecionTree.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "%matplotlib inline\n",
13 | "\n",
14 | "from sklearn.datasets import load_iris\n",
15 | "from sklearn.model_selection import train_test_split\n",
16 | "from collections import Counter\n",
17 | "import math\n",
18 | "from math import log\n",
19 | "import pprint"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "def create_data():\n",
29 | " datasets = [['青年', '否', '否', '一般', '否'],\n",
30 | " ['青年', '否', '否', '好', '否'],\n",
31 | " ['青年', '是', '否', '好', '是'],\n",
32 | " ['青年', '是', '是', '一般', '是'],\n",
33 | " ['青年', '否', '否', '一般', '否'],\n",
34 | " ['中年', '否', '否', '一般', '否'],\n",
35 | " ['中年', '否', '否', '好', '否'],\n",
36 | " ['中年', '是', '是', '好', '是'],\n",
37 | " ['中年', '否', '是', '非常好', '是'],\n",
38 | " ['中年', '否', '是', '非常好', '是'],\n",
39 | " ['老年', '否', '是', '非常好', '是'],\n",
40 | " ['老年', '否', '是', '好', '是'],\n",
41 | " ['老年', '是', '否', '好', '是'],\n",
42 | " ['老年', '是', '否', '非常好', '是'],\n",
43 | " ['老年', '否', '否', '一般', '否'],\n",
44 | " ]\n",
45 | " labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']\n",
46 |     "    # Return the dataset and the name of each column\n",
47 | " return datasets, labels"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "data": {
208 | "text/plain": [
209 | " 年龄 有工作 有自己的房子 信贷情况 类别\n",
210 | "0 青年 否 否 一般 否\n",
211 | "1 青年 否 否 好 否\n",
212 | "2 青年 是 否 好 是\n",
213 | "3 青年 是 是 一般 是\n",
214 | "4 青年 否 否 一般 否\n",
215 | "5 中年 否 否 一般 否\n",
216 | "6 中年 否 否 好 否\n",
217 | "7 中年 是 是 好 是\n",
218 | "8 中年 否 是 非常好 是\n",
219 | "9 中年 否 是 非常好 是\n",
220 | "10 老年 否 是 非常好 是\n",
221 | "11 老年 否 是 好 是\n",
222 | "12 老年 是 否 好 是\n",
223 | "13 老年 是 否 非常好 是\n",
224 | "14 老年 否 否 一般 否"
225 | ]
226 | },
227 | "execution_count": 3,
228 | "metadata": {},
229 | "output_type": "execute_result"
230 | }
231 | ],
232 | "source": [
233 | "datasets, labels = create_data()\n",
234 | "train_data = pd.DataFrame(datasets, columns=labels)\n",
235 | "train_data"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 4,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 |     "# Entropy\n",
245 | "def calc_ent(datasets):\n",
246 | " data_length = len(datasets)\n",
247 | " label_count = {}\n",
248 | " for i in range(data_length):\n",
249 | " label = datasets[i][-1]\n",
250 | " if label not in label_count:\n",
251 | " label_count[label] = 0\n",
252 | " label_count[label] += 1\n",
253 | " ent = -sum([(p / data_length) * log(p / data_length, 2)\n",
254 | " for p in label_count.values()])\n",
255 | " return ent\n",
256 | "\n",
257 | "\n",
258 |     "# Empirical conditional entropy\n",
259 | "def cond_ent(datasets, axis=0):\n",
260 | " data_length = len(datasets)\n",
261 | " feature_sets = {}\n",
262 | " for i in range(data_length):\n",
263 | " feature = datasets[i][axis]\n",
264 | " if feature not in feature_sets:\n",
265 | " feature_sets[feature] = []\n",
266 | " feature_sets[feature].append(datasets[i])\n",
267 | " cond_ent = sum(\n",
268 | " [(len(p) / data_length) * calc_ent(p) for p in feature_sets.values()])\n",
269 | " return cond_ent\n",
270 | "\n",
271 | "\n",
272 |     "# Information gain\n",
273 | "def info_gain(ent, cond_ent):\n",
274 | " return ent - cond_ent\n",
275 | "\n",
276 | "\n",
277 | "def info_gain_train(datasets):\n",
278 | " count = len(datasets[0]) - 1\n",
279 | " ent = calc_ent(datasets)\n",
280 | "# ent = entropy(datasets)\n",
281 | " best_feature = []\n",
282 | " for c in range(count):\n",
283 | " c_info_gain = info_gain(ent, cond_ent(datasets, axis=c))\n",
284 | " best_feature.append((c, c_info_gain))\n",
285 |     "        print('feature({}) - info_gain - {:.3f}'.format(labels[c], c_info_gain))\n",
286 |     "    # Pick the feature with the largest gain\n",
287 |     "    best_ = max(best_feature, key=lambda x: x[-1])\n",
288 |     "    return 'feature ({}) has the largest information gain and is chosen as the root feature'.format(labels[best_[0]])"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 5,
294 | "metadata": {},
295 | "outputs": [
296 | {
297 | "name": "stdout",
298 | "output_type": "stream",
299 | "text": [
300 |       "feature(年龄) - info_gain - 0.083\n",
301 |       "feature(有工作) - info_gain - 0.324\n",
302 |       "feature(有自己的房子) - info_gain - 0.420\n",
303 |       "feature(信贷情况) - info_gain - 0.363\n"
304 | ]
305 | },
306 | {
307 | "data": {
308 | "text/plain": [
309 |        "'feature (有自己的房子) has the largest information gain and is chosen as the root feature'"
310 | ]
311 | },
312 | "execution_count": 5,
313 | "metadata": {},
314 | "output_type": "execute_result"
315 | }
316 | ],
317 | "source": [
318 | "info_gain_train(np.array(datasets))"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 6,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 |     "# Decision-tree node class\n",
328 | "class Node:\n",
329 | " def __init__(self, root=True, label=None, feature_name=None, feature=None):\n",
330 | " self.root = root\n",
331 | " self.label = label\n",
332 | " self.feature_name = feature_name\n",
333 | " self.feature = feature\n",
334 | " self.tree = {}\n",
335 | " self.result = {\n",
336 | " 'label:': self.label,\n",
337 | " 'feature': self.feature,\n",
338 | " 'tree': self.tree\n",
339 | " }\n",
340 | "\n",
341 | " def __repr__(self):\n",
342 | " return '{}'.format(self.result)\n",
343 | "\n",
344 | " def add_node(self, val, node):\n",
345 | " self.tree[val] = node\n",
346 | "\n",
347 | " def predict(self, features):\n",
348 | " if self.root is True:\n",
349 | " return self.label\n",
350 | " return self.tree[features[self.feature]].predict(features)\n",
351 | "\n",
352 | "\n",
353 | "class DTree:\n",
354 | " def __init__(self, epsilon=0.1):\n",
355 | " self.epsilon = epsilon\n",
356 | " self._tree = {}\n",
357 | "\n",
358 |     "    # Entropy\n",
359 | " @staticmethod\n",
360 | " def calc_ent(datasets):\n",
361 | " data_length = len(datasets)\n",
362 | " label_count = {}\n",
363 | " for i in range(data_length):\n",
364 | " label = datasets[i][-1]\n",
365 | " if label not in label_count:\n",
366 | " label_count[label] = 0\n",
367 | " label_count[label] += 1\n",
368 | " ent = -sum([(p / data_length) * log(p / data_length, 2)\n",
369 | " for p in label_count.values()])\n",
370 | " return ent\n",
371 | "\n",
372 |     "    # Empirical conditional entropy\n",
373 | " def cond_ent(self, datasets, axis=0):\n",
374 | " data_length = len(datasets)\n",
375 | " feature_sets = {}\n",
376 | " for i in range(data_length):\n",
377 | " feature = datasets[i][axis]\n",
378 | " if feature not in feature_sets:\n",
379 | " feature_sets[feature] = []\n",
380 | " feature_sets[feature].append(datasets[i])\n",
381 | " cond_ent = sum([(len(p) / data_length) * self.calc_ent(p)\n",
382 | " for p in feature_sets.values()])\n",
383 | " return cond_ent\n",
384 | "\n",
385 |     "    # Information gain\n",
386 | " @staticmethod\n",
387 | " def info_gain(ent, cond_ent):\n",
388 | " return ent - cond_ent\n",
389 | "\n",
390 | " def info_gain_train(self, datasets):\n",
391 | " count = len(datasets[0]) - 1\n",
392 | " ent = self.calc_ent(datasets)\n",
393 | " best_feature = []\n",
394 | " for c in range(count):\n",
395 | " c_info_gain = self.info_gain(ent, self.cond_ent(datasets, axis=c))\n",
396 | " best_feature.append((c, c_info_gain))\n",
397 |     "        # Pick the feature with the largest gain\n",
398 | " best_ = max(best_feature, key=lambda x: x[-1])\n",
399 | " return best_\n",
400 | "\n",
401 | " def train(self, train_data):\n",
402 | " \"\"\"\n",
403 |     "        input: dataset D (a DataFrame), feature set A, threshold eta\n",
404 |     "        output: decision tree T\n",
405 | " \"\"\"\n",
406 |     "        _, y_train, features = (train_data.iloc[:, :-1],\n",
407 |     "                                train_data.iloc[:, -1],\n",
408 |     "                                train_data.columns[:-1])\n",
409 |     "\n",
410 |     "        # 1. If every instance in D belongs to the same class Ck, T is a single-node tree labeled Ck; return T\n",
411 | " if len(y_train.value_counts()) == 1:\n",
412 | " return Node(root=True, label=y_train.iloc[0])\n",
413 | "\n",
414 |     "        # 2. If the feature set A is empty, T is a single-node tree labeled with the majority class Ck of D; return T\n",
415 | " if len(features) == 0:\n",
416 | " return Node(\n",
417 | " root=True,\n",
418 | " label=y_train.value_counts().sort_values(\n",
419 | " ascending=False).index[0])\n",
420 | "\n",
421 |     "        # 3. As in section 5.1, compute the information gains; Ag is the feature with the largest gain\n",
422 | " max_feature, max_info_gain = self.info_gain_train(np.array(train_data))\n",
423 | " max_feature_name = features[max_feature]\n",
424 | "\n",
425 |     "        # 4. If Ag's gain is below the threshold eta, T is a single-node tree labeled with the majority class Ck of D; return T\n",
426 | " if max_info_gain < self.epsilon:\n",
427 | " return Node(\n",
428 | " root=True,\n",
429 | " label=y_train.value_counts().sort_values(\n",
430 | " ascending=False).index[0])\n",
431 | "\n",
432 |     "        # 5. Split D into subsets by the values of Ag\n",
433 | " node_tree = Node(\n",
434 | " root=False, feature_name=max_feature_name, feature=max_feature)\n",
435 | "\n",
436 | " feature_list = train_data[max_feature_name].value_counts().index\n",
437 | " for f in feature_list:\n",
438 | " sub_train_df = train_data.loc[train_data[max_feature_name] ==\n",
439 | " f].drop([max_feature_name], axis=1)\n",
440 | "\n",
441 |     "            # 6. Recursively build the subtree\n",
442 | " sub_tree = self.train(sub_train_df)\n",
443 | " node_tree.add_node(f, sub_tree)\n",
444 | "\n",
445 | " # pprint.pprint(node_tree.tree)\n",
446 | " return node_tree\n",
447 | "\n",
448 | " def fit(self, train_data):\n",
449 | " self._tree = self.train(train_data)\n",
450 | " return self._tree\n",
451 | "\n",
452 | " def predict(self, X_test):\n",
453 | " return self._tree.predict(X_test)"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": 7,
459 | "metadata": {},
460 | "outputs": [
461 | {
462 | "data": {
463 | "text/plain": [
464 | "{'label:': None, 'feature': 2, 'tree': {'否': {'label:': None, 'feature': 1, 'tree': {'否': {'label:': '否', 'feature': None, 'tree': {}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}"
465 | ]
466 | },
467 | "execution_count": 7,
468 | "metadata": {},
469 | "output_type": "execute_result"
470 | }
471 | ],
472 | "source": [
473 | "datasets, labels = create_data()\n",
474 | "data_df = pd.DataFrame(datasets, columns=labels)\n",
475 | "dt = DTree()\n",
476 | "tree = dt.fit(data_df)\n",
477 | "tree"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 8,
483 | "metadata": {},
484 | "outputs": [
485 | {
486 | "data": {
487 | "text/plain": [
488 | "'否'"
489 | ]
490 | },
491 | "execution_count": 8,
492 | "metadata": {},
493 | "output_type": "execute_result"
494 | }
495 | ],
496 |    "source": ["dt.predict(['老年', '否', '否', '一般'])"]
497 | },
498 | {
499 | "cell_type": "markdown",
500 | "metadata": {},
501 | "source": [
502 |     "Using sklearn"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 10,
508 | "metadata": {},
509 | "outputs": [
510 | {
511 | "data": {
512 | "text/plain": [
513 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
514 | " max_features=None, max_leaf_nodes=None,\n",
515 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
516 | " min_samples_leaf=1, min_samples_split=2,\n",
517 | " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n",
518 | " splitter='best')"
519 | ]
520 | },
521 | "execution_count": 10,
522 | "metadata": {},
523 | "output_type": "execute_result"
524 | }
525 | ],
526 | "source": [
527 | "\n",
528 | "# data\n",
529 | "def create_data():\n",
530 | " iris = load_iris()\n",
531 | " df = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
532 | " df['label'] = iris.target\n",
533 | " df.columns = [\n",
534 | " 'sepal length', 'sepal width', 'petal length', 'petal width', 'label'\n",
535 | " ]\n",
536 | " data = np.array(df.iloc[:100, [0, 1, -1]])\n",
537 | " # print(data)\n",
538 | " return data[:, :2], data[:, -1]\n",
539 | "\n",
540 | "\n",
541 | "X, y = create_data()\n",
542 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
543 | "\n",
544 | "from sklearn.tree import DecisionTreeClassifier\n",
545 | "from sklearn.tree import export_graphviz\n",
546 | "import graphviz\n",
547 | "\n",
548 | "\n",
549 | "clf = DecisionTreeClassifier()\n",
550 | "clf.fit(X_train, y_train,)"
551 | ]
552 | },
553 | {
554 | "cell_type": "code",
555 | "execution_count": null,
556 | "metadata": {},
557 | "outputs": [],
558 | "source": []
559 | }
560 | ],
561 | "metadata": {
562 | "kernelspec": {
563 | "display_name": "Python 3",
564 | "language": "python",
565 | "name": "python3"
566 | },
567 | "language_info": {
568 | "codemirror_mode": {
569 | "name": "ipython",
570 | "version": 3
571 | },
572 | "file_extension": ".py",
573 | "mimetype": "text/x-python",
574 | "name": "python",
575 | "nbconvert_exporter": "python",
576 | "pygments_lexer": "ipython3",
577 | "version": "3.7.3"
578 | }
579 | },
580 | "nbformat": 4,
581 | "nbformat_minor": 2
582 | }
583 |
--------------------------------------------------------------------------------
/Part4/DecionTree.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 | get_ipython().run_line_magic('matplotlib', 'inline')
11 |
12 | from sklearn.datasets import load_iris
13 | from sklearn.model_selection import train_test_split
14 | from collections import Counter
15 | import math
16 | from math import log
17 | import pprint
18 |
19 |
20 | # In[2]:
21 |
22 |
23 | def create_data():
24 | datasets = [['青年', '否', '否', '一般', '否'],
25 | ['青年', '否', '否', '好', '否'],
26 | ['青年', '是', '否', '好', '是'],
27 | ['青年', '是', '是', '一般', '是'],
28 | ['青年', '否', '否', '一般', '否'],
29 | ['中年', '否', '否', '一般', '否'],
30 | ['中年', '否', '否', '好', '否'],
31 | ['中年', '是', '是', '好', '是'],
32 | ['中年', '否', '是', '非常好', '是'],
33 | ['中年', '否', '是', '非常好', '是'],
34 | ['老年', '否', '是', '非常好', '是'],
35 | ['老年', '否', '是', '好', '是'],
36 | ['老年', '是', '否', '好', '是'],
37 | ['老年', '是', '否', '非常好', '是'],
38 | ['老年', '否', '否', '一般', '否'],
39 | ]
40 | labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']
41 |     # Return the dataset and the name of each column
42 | return datasets, labels
43 |
44 |
45 | # In[3]:
46 |
47 |
48 | datasets, labels = create_data()
49 | train_data = pd.DataFrame(datasets, columns=labels)
50 | train_data
51 |
52 |
53 | # In[4]:
54 |
55 |
56 | # Entropy
57 | def calc_ent(datasets):
58 | data_length = len(datasets)
59 | label_count = {}
60 | for i in range(data_length):
61 | label = datasets[i][-1]
62 | if label not in label_count:
63 | label_count[label] = 0
64 | label_count[label] += 1
65 | ent = -sum([(p / data_length) * log(p / data_length, 2)
66 | for p in label_count.values()])
67 | return ent
68 |
69 |
70 | # Empirical conditional entropy
71 | def cond_ent(datasets, axis=0):
72 | data_length = len(datasets)
73 | feature_sets = {}
74 | for i in range(data_length):
75 | feature = datasets[i][axis]
76 | if feature not in feature_sets:
77 | feature_sets[feature] = []
78 | feature_sets[feature].append(datasets[i])
79 | cond_ent = sum(
80 | [(len(p) / data_length) * calc_ent(p) for p in feature_sets.values()])
81 | return cond_ent
82 |
83 |
84 | # Information gain
85 | def info_gain(ent, cond_ent):
86 | return ent - cond_ent
87 |
88 |
89 | def info_gain_train(datasets):
90 | count = len(datasets[0]) - 1
91 | ent = calc_ent(datasets)
92 | # ent = entropy(datasets)
93 | best_feature = []
94 | for c in range(count):
95 | c_info_gain = info_gain(ent, cond_ent(datasets, axis=c))
96 | best_feature.append((c, c_info_gain))
97 |         print('feature({}) - info_gain - {:.3f}'.format(labels[c], c_info_gain))
98 |     # Pick the feature with the largest gain
99 |     best_ = max(best_feature, key=lambda x: x[-1])
100 |     return 'feature ({}) has the largest information gain and is chosen as the root feature'.format(labels[best_[0]])
101 |
102 |
103 | # In[5]:
104 |
105 |
106 | info_gain_train(np.array(datasets))
107 |
108 |
109 | # In[6]:
110 |
111 |
112 | # Decision-tree node class
113 | class Node:
114 | def __init__(self, root=True, label=None, feature_name=None, feature=None):
115 | self.root = root
116 | self.label = label
117 | self.feature_name = feature_name
118 | self.feature = feature
119 | self.tree = {}
120 | self.result = {
121 | 'label:': self.label,
122 | 'feature': self.feature,
123 | 'tree': self.tree
124 | }
125 |
126 | def __repr__(self):
127 | return '{}'.format(self.result)
128 |
129 | def add_node(self, val, node):
130 | self.tree[val] = node
131 |
132 | def predict(self, features):
133 | if self.root is True:
134 | return self.label
135 | return self.tree[features[self.feature]].predict(features)
136 |
137 |
138 | class DTree:
139 | def __init__(self, epsilon=0.1):
140 | self.epsilon = epsilon
141 | self._tree = {}
142 |
143 |     # Entropy
144 | @staticmethod
145 | def calc_ent(datasets):
146 | data_length = len(datasets)
147 | label_count = {}
148 | for i in range(data_length):
149 | label = datasets[i][-1]
150 | if label not in label_count:
151 | label_count[label] = 0
152 | label_count[label] += 1
153 | ent = -sum([(p / data_length) * log(p / data_length, 2)
154 | for p in label_count.values()])
155 | return ent
156 |
157 |     # Empirical conditional entropy
158 | def cond_ent(self, datasets, axis=0):
159 | data_length = len(datasets)
160 | feature_sets = {}
161 | for i in range(data_length):
162 | feature = datasets[i][axis]
163 | if feature not in feature_sets:
164 | feature_sets[feature] = []
165 | feature_sets[feature].append(datasets[i])
166 | cond_ent = sum([(len(p) / data_length) * self.calc_ent(p)
167 | for p in feature_sets.values()])
168 | return cond_ent
169 |
170 |     # Information gain
171 | @staticmethod
172 | def info_gain(ent, cond_ent):
173 | return ent - cond_ent
174 |
175 | def info_gain_train(self, datasets):
176 | count = len(datasets[0]) - 1
177 | ent = self.calc_ent(datasets)
178 | best_feature = []
179 | for c in range(count):
180 | c_info_gain = self.info_gain(ent, self.cond_ent(datasets, axis=c))
181 | best_feature.append((c, c_info_gain))
182 |         # Pick the feature with the largest gain
183 | best_ = max(best_feature, key=lambda x: x[-1])
184 | return best_
185 |
186 | def train(self, train_data):
187 | """
188 |         input: dataset D (a DataFrame), feature set A, threshold eta
189 |         output: decision tree T
190 | """
191 |         _, y_train, features = (train_data.iloc[:, :-1],
192 |                                 train_data.iloc[:, -1],
193 |                                 train_data.columns[:-1])
194 | 
195 |         # 1. If every instance in D belongs to the same class Ck, T is a single-node tree labeled Ck; return T
196 | if len(y_train.value_counts()) == 1:
197 | return Node(root=True, label=y_train.iloc[0])
198 |
199 |         # 2. If the feature set A is empty, T is a single-node tree labeled with the majority class Ck of D; return T
200 | if len(features) == 0:
201 | return Node(
202 | root=True,
203 | label=y_train.value_counts().sort_values(
204 | ascending=False).index[0])
205 |
206 |         # 3. As in section 5.1, compute the information gains; Ag is the feature with the largest gain
207 | max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
208 | max_feature_name = features[max_feature]
209 |
210 |         # 4. If Ag's gain is below the threshold eta, T is a single-node tree labeled with the majority class Ck of D; return T
211 | if max_info_gain < self.epsilon:
212 | return Node(
213 | root=True,
214 | label=y_train.value_counts().sort_values(
215 | ascending=False).index[0])
216 |
217 |         # 5. Split D into subsets by the values of Ag
218 | node_tree = Node(
219 | root=False, feature_name=max_feature_name, feature=max_feature)
220 |
221 | feature_list = train_data[max_feature_name].value_counts().index
222 | for f in feature_list:
223 | sub_train_df = train_data.loc[train_data[max_feature_name] ==
224 | f].drop([max_feature_name], axis=1)
225 |
226 |             # 6. Recursively build the subtree
227 | sub_tree = self.train(sub_train_df)
228 | node_tree.add_node(f, sub_tree)
229 |
230 | # pprint.pprint(node_tree.tree)
231 | return node_tree
232 |
233 | def fit(self, train_data):
234 | self._tree = self.train(train_data)
235 | return self._tree
236 |
237 | def predict(self, X_test):
238 | return self._tree.predict(X_test)
239 |
240 |
241 | # In[7]:
242 |
243 |
244 | datasets, labels = create_data()
245 | data_df = pd.DataFrame(datasets, columns=labels)
246 | dt = DTree()
247 | tree = dt.fit(data_df)
248 | tree
249 |
250 |
251 | # In[8]:
252 | 
253 | 
254 | dt.predict(['老年', '否', '否', '一般'])
255 | 
256 | 
257 | # Using sklearn
258 |
259 | # In[10]:
260 |
261 |
262 |
263 | # data
264 | def create_data():
265 | iris = load_iris()
266 | df = pd.DataFrame(iris.data, columns=iris.feature_names)
267 | df['label'] = iris.target
268 | df.columns = [
269 | 'sepal length', 'sepal width', 'petal length', 'petal width', 'label'
270 | ]
271 | data = np.array(df.iloc[:100, [0, 1, -1]])
272 | # print(data)
273 | return data[:, :2], data[:, -1]
274 |
275 |
276 | X, y = create_data()
277 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
278 |
279 | from sklearn.tree import DecisionTreeClassifier
280 | from sklearn.tree import export_graphviz
281 | import graphviz
282 |
283 |
284 | clf = DecisionTreeClassifier()
285 | clf.fit(X_train, y_train,)
286 |
287 |
288 | # In[ ]:
289 |
290 |
291 |
292 |
293 |
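294 | # A small worked check of the entropy code above (a sketch; `loan` is just a
295 | # local alias): D has 15 rows, 9 labeled '是' and 6 labeled '否', so
296 | #     H(D) = -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971
297 | # and g(D, A) = H(D) - H(D|A); for 有自己的房子 (axis=2) the gain is 0.420.
298 | loan = np.array(datasets)
299 | print(calc_ent(loan))                                     # ≈ 0.971
300 | print(info_gain(calc_ent(loan), cond_ent(loan, axis=2)))  # ≈ 0.420
301 | 
302 | # graphviz/export_graphviz are imported above but never used; a minimal sketch
303 | # of the intended visualization (dot_data/graph are illustrative names):
304 | dot_data = export_graphviz(clf, out_file=None)
305 | graph = graphviz.Source(dot_data)
306 | # graph.render('tree')  # writes tree.pdf next to the script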
--------------------------------------------------------------------------------
/Part5/BoostTree.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "from sklearn.datasets import load_iris\n",
12 | "from sklearn.model_selection import train_test_split\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "data": {
24 | "text/plain": [
25 | ""
26 | ]
27 | },
28 | "execution_count": 2,
29 | "metadata": {},
30 | "output_type": "execute_result"
31 | },
32 | {
33 | "data": {
34 |       "image/png": "<base64 PNG omitted: scatter plot of the two iris classes over sepal length vs sepal width>\n",
35 | "text/plain": [
36 | ""
37 | ]
38 | },
39 | "metadata": {
40 | "needs_background": "light"
41 | },
42 | "output_type": "display_data"
43 | }
44 | ],
45 | "source": [
46 | "# data\n",
47 | "def create_data():\n",
48 | " iris = load_iris()\n",
49 | " df = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
50 | " df['label'] = iris.target\n",
51 | " df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']\n",
52 | " data = np.array(df.iloc[:100, [0, 1, -1]])\n",
53 | " for i in range(len(data)):\n",
54 | " if data[i,-1] == 0:\n",
55 | " data[i,-1] = -1\n",
56 | " # print(data)\n",
57 | " return data[:,:2], data[:,-1]\n",
58 | "X, y = create_data()\n",
59 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
60 | "plt.scatter(X[:50,0],X[:50,1], label='0')\n",
61 | "plt.scatter(X[50:,0],X[50:,1], label='1')\n",
62 | "plt.legend()"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 3,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 |     "# AdaBoost in Python\n",
72 | "class AdaBoost:\n",
73 | " def __init__(self, n_estimators=50, learning_rate=1.0):\n",
74 | " self.clf_num = n_estimators\n",
75 | " self.learning_rate = learning_rate\n",
76 | "\n",
77 | " def init_args(self, datasets, labels):\n",
78 | "\n",
79 | " self.X = datasets\n",
80 | " self.Y = labels\n",
81 | " self.M, self.N = datasets.shape\n",
82 | "\n",
83 |     "        # Number of weak classifiers, and the collection of them\n",
84 | " self.clf_sets = []\n",
85 | "\n",
86 |     "        # Initialize the sample weights\n",
87 | " self.weights = [1.0 / self.M] * self.M\n",
88 | "\n",
89 |     "        # Coefficients alpha of each G(x)\n",
90 | " self.alpha = []\n",
91 | "\n",
92 | " def _G(self, features, labels, weights):\n",
93 | " m = len(features)\n",
94 |     "        error = 100000.0  # effectively infinity\n",
95 | " best_v = 0.0\n",
96 |     "        # features is a single column of X\n",
97 | " features_min = min(features)\n",
98 | " features_max = max(features)\n",
99 | " n_step = (features_max - features_min +\n",
100 | " self.learning_rate) // self.learning_rate\n",
101 | " direct, compare_array = None, None\n",
102 | " for i in range(1, int(n_step)):\n",
103 | " v = features_min + self.learning_rate * i\n",
104 | "\n",
105 | " if v not in features:\n",
106 |     "                # misclassification error at this threshold\n",
107 | " compare_array_positive = np.array(\n",
108 | " [1 if features[k] > v else -1 for k in range(m)])\n",
109 | " weight_error_positive = sum([\n",
110 | " weights[k] for k in range(m)\n",
111 | " if compare_array_positive[k] != labels[k]\n",
112 | " ])\n",
113 | "\n",
114 |     "                compare_array_negative = np.array(\n",
115 |     "                    [-1 if features[k] > v else 1 for k in range(m)])\n",
116 |     "                weight_error_negative = sum([\n",
117 |     "                    weights[k] for k in range(m)\n",
118 |     "                    if compare_array_negative[k] != labels[k]\n",
119 |     "                ])\n",
120 |     "\n",
121 |     "                if weight_error_positive < weight_error_negative:\n",
122 |     "                    weight_error = weight_error_positive\n",
123 |     "                    _compare_array = compare_array_positive\n",
124 |     "                    direct = 'positive'\n",
125 |     "                else:\n",
126 |     "                    weight_error = weight_error_negative\n",
127 |     "                    _compare_array = compare_array_negative\n",
128 |     "                    direct = 'negative'\n",
129 | "\n",
130 | " # print('v:{} error:{}'.format(v, weight_error))\n",
131 | " if weight_error < error:\n",
132 | " error = weight_error\n",
133 | " compare_array = _compare_array\n",
134 | " best_v = v\n",
135 | " return best_v, direct, error, compare_array\n",
136 | "\n",
137 |     "    # Compute alpha\n",
138 | " def _alpha(self, error):\n",
139 | " return 0.5 * np.log((1 - error) / error)\n",
140 | "\n",
141 |     "    # Normalization factor Z\n",
142 | " def _Z(self, weights, a, clf):\n",
143 | " return sum([\n",
144 | " weights[i] * np.exp(-1 * a * self.Y[i] * clf[i])\n",
145 | " for i in range(self.M)\n",
146 | " ])\n",
147 | "\n",
148 |     "    # Weight update\n",
149 | " def _w(self, a, clf, Z):\n",
150 | " for i in range(self.M):\n",
151 | " self.weights[i] = self.weights[i] * np.exp(\n",
152 | " -1 * a * self.Y[i] * clf[i]) / Z\n",
153 | "\n",
154 |     "    # Linear combination of the G(x)\n",
155 | " def _f(self, alpha, clf_sets):\n",
156 | " pass\n",
157 | "\n",
158 | " def G(self, x, v, direct):\n",
159 | " if direct == 'positive':\n",
160 | " return 1 if x > v else -1\n",
161 | " else:\n",
162 | " return -1 if x > v else 1\n",
163 | "\n",
164 | " def fit(self, X, y):\n",
165 | " self.init_args(X, y)\n",
166 | "\n",
167 | " for epoch in range(self.clf_num):\n",
168 | " best_clf_error, best_v, clf_result = 100000, None, None\n",
169 |     "            # Over the feature dimensions, pick the one with the smallest error\n",
170 | " for j in range(self.N):\n",
171 | " features = self.X[:, j]\n",
172 |     "                # threshold, classification error and predictions of this stump\n",
173 | " v, direct, error, compare_array = self._G(\n",
174 | " features, self.Y, self.weights)\n",
175 | "\n",
176 | " if error < best_clf_error:\n",
177 | " best_clf_error = error\n",
178 | " best_v = v\n",
179 | " final_direct = direct\n",
180 | " clf_result = compare_array\n",
181 | " axis = j\n",
182 | " if best_clf_error == 0:\n",
183 | " break\n",
184 | "\n",
185 |     "            # Compute the coefficient a of this G(x)\n",
186 | " a = self._alpha(best_clf_error)\n",
187 | " self.alpha.append(a)\n",
188 |     "            # Record the classifier\n",
189 | " self.clf_sets.append((axis, best_v, final_direct))\n",
190 |     "            # Normalization factor\n",
191 | " Z = self._Z(self.weights, a, clf_result)\n",
192 |     "            # Weight update\n",
193 | " self._w(a, clf_result, Z)\n",
194 | "\n",
195 | " def predict(self, feature):\n",
196 | " result = 0.0\n",
197 | " for i in range(len(self.clf_sets)):\n",
198 | " axis, clf_v, direct = self.clf_sets[i]\n",
199 | " f_input = feature[axis]\n",
200 | " result += self.alpha[i] * self.G(f_input, clf_v, direct)\n",
201 | " # sign\n",
202 | " return 1 if result > 0 else -1\n",
203 | "\n",
204 | " def score(self, X_test, y_test):\n",
205 | " right_count = 0\n",
206 | " for i in range(len(X_test)):\n",
207 | " feature = X_test[i]\n",
208 | " if self.predict(feature) == y_test[i]:\n",
209 | " right_count += 1\n",
210 | "\n",
211 | " return right_count / len(X_test)"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 4,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "X = np.arange(10).reshape(10, 1)\n",
221 | "y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])\n",
222 | "clf = AdaBoost(n_estimators=3, learning_rate=0.5)\n",
223 | "clf.fit(X, y)\n",
224 | "X, y = create_data()\n",
225 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)\n"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 5,
231 | "metadata": {},
232 | "outputs": [
233 | {
234 | "data": {
235 | "text/plain": [
236 | "0.5757575757575758"
237 | ]
238 | },
239 | "execution_count": 5,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | }
243 | ],
244 | "source": [
245 | "clf = AdaBoost(n_estimators=10, learning_rate=0.2)\n",
246 | "clf.fit(X_train, y_train)\n",
247 | "clf.score(X_test, y_test)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 6,
253 | "metadata": {},
254 | "outputs": [
255 | {
256 | "name": "stdout",
257 | "output_type": "stream",
258 | "text": [
259 | "average score:67.182%\n"
260 | ]
261 | }
262 | ],
263 | "source": [
264 |     "# Results over 100 runs\n",
265 | "result = []\n",
266 | "for i in range(1, 101):\n",
267 | " X, y = create_data()\n",
268 | " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)\n",
269 | " clf = AdaBoost(n_estimators=100, learning_rate=0.2)\n",
270 | " clf.fit(X_train, y_train)\n",
271 | " r = clf.score(X_test, y_test)\n",
272 | " result.append(r)\n",
273 | "\n",
274 |     "print('average score:{:.3f}%'.format(100 * sum(result) / len(result)))"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 7,
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "data": {
284 | "text/plain": [
285 | "0.9090909090909091"
286 | ]
287 | },
288 | "execution_count": 7,
289 | "metadata": {},
290 | "output_type": "execute_result"
291 | }
292 | ],
293 | "source": [
294 |     "# AdaBoost via sklearn\n",
295 | "from sklearn.ensemble import AdaBoostClassifier\n",
296 | "clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5)\n",
297 | "clf.fit(X_train, y_train)\n",
298 | "clf.score(X_test, y_test)"
299 | ]
300 | }
301 | ],
302 | "metadata": {
303 | "kernelspec": {
304 | "display_name": "Python 3",
305 | "language": "python",
306 | "name": "python3"
307 | },
308 | "language_info": {
309 | "codemirror_mode": {
310 | "name": "ipython",
311 | "version": 3
312 | },
313 | "file_extension": ".py",
314 | "mimetype": "text/x-python",
315 | "name": "python",
316 | "nbconvert_exporter": "python",
317 | "pygments_lexer": "ipython3",
318 | "version": "3.7.3"
319 | }
320 | },
321 | "nbformat": 4,
322 | "nbformat_minor": 2
323 | }
324 |
--------------------------------------------------------------------------------
/Part5/BoostTree.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from sklearn.datasets import load_iris
10 | from sklearn.model_selection import train_test_split
11 | import matplotlib.pyplot as plt
12 | get_ipython().run_line_magic('matplotlib', 'inline')
13 |
14 |
15 | # In[2]:
16 |
17 |
18 | # data
19 | def create_data():
20 | iris = load_iris()
21 | df = pd.DataFrame(iris.data, columns=iris.feature_names)
22 | df['label'] = iris.target
23 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
24 | data = np.array(df.iloc[:100, [0, 1, -1]])
25 | for i in range(len(data)):
26 | if data[i,-1] == 0:
27 | data[i,-1] = -1
28 | # print(data)
29 | return data[:,:2], data[:,-1]
30 | X, y = create_data()
31 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
32 | plt.scatter(X[:50,0],X[:50,1], label='0')
33 | plt.scatter(X[50:,0],X[50:,1], label='1')
34 | plt.legend()
35 |
36 |
37 | # In[3]:
38 |
39 |
40 | # AdaBoost in Python
41 | class AdaBoost:
42 | def __init__(self, n_estimators=50, learning_rate=1.0):
43 | self.clf_num = n_estimators
44 | self.learning_rate = learning_rate
45 |
46 | def init_args(self, datasets, labels):
47 |
48 | self.X = datasets
49 | self.Y = labels
50 | self.M, self.N = datasets.shape
51 |
52 |         # Number of weak classifiers, and the collection of them
53 | self.clf_sets = []
54 |
55 |         # Initialize the sample weights
56 | self.weights = [1.0 / self.M] * self.M
57 |
58 |         # Coefficients alpha of each G(x)
59 | self.alpha = []
60 |
61 | def _G(self, features, labels, weights):
62 | m = len(features)
63 |         error = 100000.0  # effectively infinity
64 | best_v = 0.0
65 |         # features is a single column of X
66 | features_min = min(features)
67 | features_max = max(features)
68 | n_step = (features_max - features_min +
69 | self.learning_rate) // self.learning_rate
70 | direct, compare_array = None, None
71 | for i in range(1, int(n_step)):
72 | v = features_min + self.learning_rate * i
73 |
74 | if v not in features:
75 |                 # misclassification error at this threshold
76 | compare_array_positive = np.array(
77 | [1 if features[k] > v else -1 for k in range(m)])
78 | weight_error_positive = sum([
79 | weights[k] for k in range(m)
80 | if compare_array_positive[k] != labels[k]
81 | ])
82 |
83 |                 compare_array_negative = np.array(
84 |                     [-1 if features[k] > v else 1 for k in range(m)])
85 |                 weight_error_negative = sum([
86 |                     weights[k] for k in range(m)
87 |                     if compare_array_negative[k] != labels[k]
88 |                 ])
89 | 
90 |                 if weight_error_positive < weight_error_negative:
91 |                     weight_error = weight_error_positive
92 |                     _compare_array = compare_array_positive
93 |                     direct = 'positive'
94 |                 else:
95 |                     weight_error = weight_error_negative
96 |                     _compare_array = compare_array_negative
97 |                     direct = 'negative'
98 |
99 | # print('v:{} error:{}'.format(v, weight_error))
100 | if weight_error < error:
101 | error = weight_error
102 | compare_array = _compare_array
103 | best_v = v
104 | return best_v, direct, error, compare_array
105 |
106 |     # Compute alpha
107 | def _alpha(self, error):
108 | return 0.5 * np.log((1 - error) / error)
109 |
110 |     # Normalization factor Z
111 | def _Z(self, weights, a, clf):
112 | return sum([
113 | weights[i] * np.exp(-1 * a * self.Y[i] * clf[i])
114 | for i in range(self.M)
115 | ])
116 |
117 |     # Weight update
118 | def _w(self, a, clf, Z):
119 | for i in range(self.M):
120 | self.weights[i] = self.weights[i] * np.exp(
121 | -1 * a * self.Y[i] * clf[i]) / Z
122 |
123 |     # Linear combination of the G(x)
124 | def _f(self, alpha, clf_sets):
125 | pass
126 |
127 | def G(self, x, v, direct):
128 | if direct == 'positive':
129 | return 1 if x > v else -1
130 | else:
131 | return -1 if x > v else 1
132 |
133 | def fit(self, X, y):
134 | self.init_args(X, y)
135 |
136 | for epoch in range(self.clf_num):
137 | best_clf_error, best_v, clf_result = 100000, None, None
138 |             # Over the feature dimensions, pick the one with the smallest error
139 | for j in range(self.N):
140 | features = self.X[:, j]
141 |                 # threshold, classification error and predictions of this stump
142 | v, direct, error, compare_array = self._G(
143 | features, self.Y, self.weights)
144 |
145 | if error < best_clf_error:
146 | best_clf_error = error
147 | best_v = v
148 | final_direct = direct
149 | clf_result = compare_array
150 | axis = j
151 | if best_clf_error == 0:
152 | break
153 |
154 |             # Compute the coefficient a of this G(x)
155 | a = self._alpha(best_clf_error)
156 | self.alpha.append(a)
157 |             # Record the classifier
158 | self.clf_sets.append((axis, best_v, final_direct))
159 |             # Normalization factor
160 | Z = self._Z(self.weights, a, clf_result)
161 |             # Weight update
162 | self._w(a, clf_result, Z)
163 |
164 | def predict(self, feature):
165 | result = 0.0
166 | for i in range(len(self.clf_sets)):
167 | axis, clf_v, direct = self.clf_sets[i]
168 | f_input = feature[axis]
169 | result += self.alpha[i] * self.G(f_input, clf_v, direct)
170 | # sign
171 | return 1 if result > 0 else -1
172 |
173 | def score(self, X_test, y_test):
174 | right_count = 0
175 | for i in range(len(X_test)):
176 | feature = X_test[i]
177 | if self.predict(feature) == y_test[i]:
178 | right_count += 1
179 |
180 | return right_count / len(X_test)
181 |
182 |
183 | # In[4]:
184 |
185 |
186 | X = np.arange(10).reshape(10, 1)
187 | y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])
188 | clf = AdaBoost(n_estimators=3, learning_rate=0.5)
189 | clf.fit(X, y)
190 | X, y = create_data()
191 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
192 |
193 |
194 | # In[5]:
195 |
196 |
197 | clf = AdaBoost(n_estimators=10, learning_rate=0.2)
198 | clf.fit(X_train, y_train)
199 | clf.score(X_test, y_test)
200 |
201 |
202 | # In[6]:
203 |
204 |
205 | # Results over 100 runs
206 | result = []
207 | for i in range(1, 101):
208 | X, y = create_data()
209 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
210 | clf = AdaBoost(n_estimators=100, learning_rate=0.2)
211 | clf.fit(X_train, y_train)
212 | r = clf.score(X_test, y_test)
213 | result.append(r)
214 |
215 | print('average score:{:.3f}%'.format(100 * sum(result) / len(result)))
216 |
217 |
218 | # In[7]:
219 |
220 |
221 | # AdaBoost via sklearn
222 | from sklearn.ensemble import AdaBoostClassifier
223 | clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5)
224 | clf.fit(X_train, y_train)
225 | clf.score(X_test, y_test)
226 |
227 |
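228 | 
229 | # In[ ]:
230 | 
231 | 
232 | # A quick numeric sketch of the update rules implemented above:
233 | #     alpha_m = 0.5 * ln((1 - e_m) / e_m)
234 | # and each weight is then rescaled by exp(-alpha_m * y_i * Gm(x_i)) / Z_m.
235 | # On the toy data in In[4], the best first stump misclassifies 3 of 10
236 | # uniformly weighted points, so:
237 | e = 0.3
238 | alpha = 0.5 * np.log((1 - e) / e)
239 | print(alpha)  # ≈ 0.4236; misclassified points get their weight scaled by exp(+alpha)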
--------------------------------------------------------------------------------
/Part6/GBDT.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import math\n",
11 |     "# Compute information entropy\n",
12 | "def calculate_entropy(y):\n",
13 | " log2 = math.log2\n",
14 | " unique_labels = np.unique(y)\n",
15 | " entropy = 0\n",
16 | " for label in unique_labels:\n",
17 | " count = len(y[y == label])\n",
18 | " p = count / len(y)\n",
19 | " entropy += -p * log2(p)\n",
20 | " return entropy\n",
21 |     "# Tree node definition\n",
22 | "class DecisionNode():\n",
23 | " def __init__(self, feature_i=None, threshold=None,\n",
24 | " value=None, true_branch=None, false_branch=None):\n",
25 | " self.feature_i = feature_i \n",
26 | " self.threshold = threshold \n",
27 | " self.value = value \n",
28 | " self.true_branch = true_branch \n",
29 | " self.false_branch = false_branch\n",
30 | "def divide_on_feature(X, feature_i, threshold):\n",
31 | " split_func = None\n",
32 | " if isinstance(threshold, int) or isinstance(threshold, float):\n",
33 | " split_func = lambda sample: sample[feature_i] >= threshold\n",
34 | " else:\n",
35 | " split_func = lambda sample: sample[feature_i] == threshold\n",
36 | "\n",
37 | " X_1 = np.array([sample for sample in X if split_func(sample)])\n",
38 | " X_2 = np.array([sample for sample in X if not split_func(sample)])\n",
39 | "\n",
40 | " return np.array([X_1, X_2])\n",
41 |     "# Base class\n",
42 | "class DecisionTree(object):\n",
43 | " def __init__(self, min_samples_split=2, min_impurity=1e-7,\n",
44 | " max_depth=float(\"inf\"), loss=None):\n",
45 |     "        self.root = None  # root node\n",
46 | " self.min_samples_split = min_samples_split\n",
47 | " self.min_impurity = min_impurity\n",
48 | " self.max_depth = max_depth\n",
49 |     "        # Impurity criterion: information gain for classification, variance reduction for regression\n",
50 | " self._impurity_calculation = None\n",
51 |     "        self._leaf_value_calculation = None  # computes the leaf value\n",
52 | " self.one_dim = None\n",
53 | " self.loss = loss\n",
54 | "\n",
55 | " def fit(self, X, y, loss=None):\n",
56 | " self.one_dim = len(np.shape(y)) == 1\n",
57 | " self.root = self._build_tree(X, y)\n",
58 | " self.loss=None\n",
59 | "\n",
60 | " def _build_tree(self, X, y, current_depth=0):\n",
61 | " \"\"\"\n",
62 |     "        Recursively build the tree\n",
63 | " \"\"\"\n",
64 | "\n",
65 | " largest_impurity = 0\n",
66 | " best_criteria = None\n",
67 | " best_sets = None\n",
68 | " \n",
69 | " if len(np.shape(y)) == 1:\n",
70 | " y = np.expand_dims(y, axis=1)\n",
71 | "\n",
72 | " Xy = np.concatenate((X, y), axis=1)\n",
73 | "\n",
74 | " n_samples, n_features = np.shape(X)\n",
75 | "\n",
76 | " if n_samples >= self.min_samples_split and current_depth <= self.max_depth:\n",
77 |     "            # Compute the impurity gain of every feature\n",
78 | " for feature_i in range(n_features):\n",
79 | " feature_values = np.expand_dims(X[:, feature_i], axis=1)\n",
80 | " unique_values = np.unique(feature_values)\n",
81 | "\n",
82 | " for threshold in unique_values:\n",
83 | " Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)\n",
84 | " \n",
85 | " if len(Xy1) > 0 and len(Xy2) > 0:\n",
86 | " y1 = Xy1[:, n_features:]\n",
87 | " y2 = Xy2[:, n_features:]\n",
88 | "\n",
89 |     "                        # Compute the gain\n",
90 | " impurity = self._impurity_calculation(y, y1, y2)\n",
91 | "\n",
92 | " if impurity > largest_impurity:\n",
93 | " largest_impurity = impurity\n",
94 | " best_criteria = {\"feature_i\": feature_i, \"threshold\": threshold}\n",
95 | " best_sets = {\n",
96 | " \"leftX\": Xy1[:, :n_features], \n",
97 | " \"lefty\": Xy1[:, n_features:], \n",
98 | " \"rightX\": Xy2[:, :n_features], \n",
99 | " \"righty\": Xy2[:, n_features:] \n",
100 | " }\n",
101 | "\n",
102 | " if largest_impurity > self.min_impurity:\n",
103 | " true_branch = self._build_tree(best_sets[\"leftX\"], best_sets[\"lefty\"], current_depth + 1)\n",
104 | " false_branch = self._build_tree(best_sets[\"rightX\"], best_sets[\"righty\"], current_depth + 1)\n",
105 | " return DecisionNode(feature_i=best_criteria[\"feature_i\"], threshold=best_criteria[\n",
106 | " \"threshold\"], true_branch=true_branch, false_branch=false_branch)\n",
107 | " \n",
108 |     "        # Compute the target value at this leaf node\n",
109 | " leaf_value = self._leaf_value_calculation(y)\n",
110 | " \n",
111 | " \n",
112 | " return DecisionNode(value=leaf_value)\n",
113 | "\n",
114 | "\n",
115 | " def predict_value(self, x, tree=None):\n",
116 | " \"\"\"\n",
117 |     "        Predict\n",
118 | " \"\"\"\n",
119 | "\n",
120 | " if tree is None:\n",
121 | " tree = self.root\n",
122 | "\n",
123 | " if tree.value is not None:\n",
124 | " return tree.value\n",
125 | "\n",
126 | " feature_value = x[tree.feature_i]\n",
127 | "\n",
128 | " branch = tree.false_branch\n",
129 | " if isinstance(feature_value, int) or isinstance(feature_value, float):\n",
130 | " if feature_value >= tree.threshold:\n",
131 | " branch = tree.true_branch\n",
132 | " elif feature_value == tree.threshold:\n",
133 | " branch = tree.true_branch\n",
134 | "\n",
135 | " return self.predict_value(x, branch)\n",
136 | "\n",
137 | " def predict(self, X):\n",
138 | " y_pred = []\n",
139 | " for x in X:\n",
140 | " y_pred.append(self.predict_value(x))\n",
141 | " return y_pred\n",
142 | "def calculate_variance(X):\n",
143 | " \"\"\" Return the variance of the features in dataset X \"\"\"\n",
144 | " mean = np.ones(np.shape(X)) * X.mean(0)\n",
145 | " n_samples = np.shape(X)[0]\n",
146 | " variance = (1 / n_samples) * np.diag((X - mean).T.dot(X - mean))\n",
147 | " \n",
148 | " return variance\n",
149 | "class RegressionTree(DecisionTree):\n",
150 | " def _calculate_variance_reduction(self, y, y1, y2):\n",
151 | " var_tot = calculate_variance(y)\n",
152 | " var_1 = calculate_variance(y1)\n",
153 | " var_2 = calculate_variance(y2)\n",
154 | " frac_1 = len(y1) / len(y)\n",
155 | " frac_2 = len(y2) / len(y)\n",
156 | "\n",
157 |     "        # Variance reduction\n",
158 | " variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)\n",
159 | "\n",
160 | " return sum(variance_reduction)\n",
161 | "\n",
162 | " def _mean_of_y(self, y):\n",
163 | " value = np.mean(y, axis=0)\n",
164 | " return value if len(value) > 1 else value[0]\n",
165 | "\n",
166 | " def fit(self, X, y):\n",
167 | " self._impurity_calculation = self._calculate_variance_reduction\n",
168 | " self._leaf_value_calculation = self._mean_of_y\n",
169 | " super(RegressionTree, self).fit(X, y)\n",
170 | "\n",
171 | "class GradientBoosting(object):\n",
172 | " def __init__(self, n_estimators, learning_rate, min_samples_split,\n",
173 | " min_impurity, max_depth, regression):\n",
174 | " self.n_estimators = n_estimators\n",
175 | " self.learning_rate = learning_rate\n",
176 | " self.min_samples_split = min_samples_split\n",
177 | " self.min_impurity = min_impurity\n",
178 | " self.max_depth = max_depth\n",
179 | " self.regression = regression\n",
180 | " \n",
181 | " self.loss = SquareLoss()\n",
182 | " if not self.regression:\n",
183 | " self.loss = CrossEntropy()\n",
184 | "\n",
185 | " self.trees = []\n",
186 | " for _ in range(n_estimators):\n",
187 | " tree = RegressionTree(\n",
188 | " min_samples_split=self.min_samples_split,\n",
189 | " min_impurity=min_impurity,\n",
190 | " max_depth=self.max_depth)\n",
191 | " self.trees.append(tree)\n",
192 | "\n",
193 | "\n",
194 | " def fit(self, X, y):\n",
195 | " y_pred = np.full(np.shape(y), np.mean(y, axis=0))\n",
196 | " for i in range(self.n_estimators):\n",
197 | " gradient = self.loss.gradient(y, y_pred)\n",
198 | " self.trees[i].fit(X, gradient)\n",
199 | " update = self.trees[i].predict(X)\n",
200 | " # Update y prediction\n",
201 | " y_pred -= np.multiply(self.learning_rate, update)\n",
202 | "\n",
203 | "\n",
204 | " def predict(self, X):\n",
205 | " y_pred = np.array([])\n",
206 | " for tree in self.trees:\n",
207 | " update = tree.predict(X)\n",
208 | " update = np.multiply(self.learning_rate, update)\n",
209 | " y_pred = -update if not y_pred.any() else y_pred - update\n",
210 | "\n",
211 | " if not self.regression:\n",
212 | " y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)\n",
213 | " y_pred = np.argmax(y_pred, axis=1)\n",
214 | " return y_pred"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 2,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "class Loss(object):\n",
224 | " def loss(self, y_true, y_pred):\n",
225 | " return NotImplementedError()\n",
226 | "\n",
227 | " def gradient(self, y, y_pred):\n",
228 | " raise NotImplementedError()\n",
229 | "\n",
230 | " def acc(self, y, y_pred):\n",
231 | " return 0\n",
232 | "# 如果是回归模型\n",
233 | "class SquareLoss(Loss):\n",
234 | " def __init__(self): pass\n",
235 | "\n",
236 | " def loss(self, y, y_pred):\n",
237 | " return 0.5 * np.power((y - y_pred), 2)\n",
238 | "\n",
239 | " def gradient(self, y, y_pred):\n",
240 | " return -(y - y_pred)\n",
241 | "# 如果是分类模型\n",
242 | "class CrossEntropy(Loss):\n",
243 | " def __init__(self): pass\n",
244 | "\n",
245 | " def loss(self, y, p):\n",
246 | " # Avoid division by zero\n",
247 | " p = np.clip(p, 1e-15, 1 - 1e-15)\n",
248 | " return - y * np.log(p) - (1 - y) * np.log(1 - p)\n",
249 | "\n",
250 | " def acc(self, y, p):\n",
251 | " return accuracy_score(np.argmax(y, axis=1), np.argmax(p, axis=1))\n",
252 | "\n",
253 | " def gradient(self, y, p):\n",
254 | " # Avoid division by zero\n",
255 | " p = np.clip(p, 1e-15, 1 - 1e-15)\n",
256 | " return - (y / p) + (1 - y) / (1 - p)\n",
257 | " \n",
258 | "class GradientBoostingRegressor(GradientBoosting):\n",
259 | " def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,\n",
260 | " min_var_red=1e-7, max_depth=4, debug=False):\n",
261 | " super(GradientBoostingRegressor, self).__init__(n_estimators=n_estimators, \n",
262 | " learning_rate=learning_rate, \n",
263 | " min_samples_split=min_samples_split, \n",
264 | " min_impurity=min_var_red,\n",
265 | " max_depth=max_depth,\n",
266 | " regression=True)"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 6,
272 | "metadata": {},
273 | "outputs": [
274 | {
275 | "data": {
276 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAGihJREFUeJzt3X+MXWWdx/H3d4dZOiowoYwrzJQtP0yjQNfCCJImxAV3q7UWgiyU4I8qC7sGFwwuRgxBbUzAkOCPJdEUyALCFrsVS2H5sQhLVAI1U8B2bSWCoJ2BXYZii6wFyvDdP+6ddubOnbn3ufeeuc/z3M8raTr33Ken3+cc/XJ7zuc819wdERHJy5+1uwAREWk9NXcRkQypuYuIZEjNXUQkQ2ruIiIZUnMXEcmQmruISIbU3EVEMqTmLiKSof3qHWhmXcAQMOLuyyreWwlcA4yUN13n7jfMtL9DDjnE58+fH1SsiEin27Rp00vu3ldrXN3NHbgE2AYcOM37P3T3z9e7s/nz5zM0NBTw14uIiJn9rp5xdV2WMbMB4KPAjJ/GRUQkDvVec/828CXgrRnGfNzMNpvZOjObV22AmV1oZkNmNjQ6Ohpaq4iI1KlmczezZcCL7r5phmF3AfPdfSHwE+DmaoPcfbW7D7r7YF9fzUtGIiLSoHquuS8GlpvZUmAOcKCZ3erunxgf4O47Joy/Hvhma8sUEWmdPXv2MDw8zGuvvdbuUqY1Z84cBgYG6O7ubujP12zu7n45cDmAmX0Q+OeJjb28/VB3f6H8cjmlG68iIlEaHh7mgAMOYP78+ZhZu8uZwt3ZsWMHw8PDHHHEEQ3to+Gcu5mtMrPl5ZcXm9mvzOyXwMXAykb3KyJStNdee425c+dG2dgBzIy5c+c29S+LkCgk7v4w8HD55ysnbN/76V4kN+ufGOGa+5/i+Z27Oay3h8uWLOCMRf3tLkuaFGtjH9dsfUHNXaTTrH9ihMvv2MLuPWMAjOzczeV3bAFQg5eoafkBkRlcc/9Texv7uN17xrjm/qfaVJHk4r777mPBggUcffTRXH311S3fv5q7yAye37k7aLtIPcbGxrjooou499572bp1K2vWrGHr1q0t/Tt0WUZkBof19jBSpZEf1tvThmqkXVp93+UXv/gFRx99NEceeSQAK1as4M477+S9731vq0rWJ3eRmVy2ZAE93V2TtvV0d3HZkgVtqkhm2/h9l5Gdu3H23XdZ/8RIzT87nZGREebN2/cg/8DAACMjje+vGjV3kRmcsaifq848jv7eHgzo7+3hqjOP083UDlLEfRd3n7Kt1ekdXZYRqeGMRf1q5h2siPsuAwMDbN++fe/r4eFhDjvssIb3V40+uYuIzGC6+yvN3Hd5//vfz29+8xueffZZ3njjDW6//XaWL19e+w8GUHMXEZlBEfdd9ttvP6677jqWLFnCe97zHs4++2yOOeaYZkud/He0dG8iIpkZvyTX6qeUly5dytKlS1tRYlVq7iIiNaR430WXZUREMqTmLiKSITV3EZEMqbmLiGRIzV1EJENq7pKN9U+MsPjqhzjiy//B4qsfamrtD5Giffazn+Wd73wnxx57bCH7V3OXLBSxuJNIkVauXMl9991X2P7V3CUL+lINKdTmtfCtY+FrvaXfN69tepennHIKBx98cAuKq04PMUkW9KUaUpjNa+Gui2FP+X9Lu7aXXgMsPLt9ddWgT+6ShSIWdxIB4MFV+xr7uD27S9sjpuYuWdCXakhhdg2HbY+ELstIFopa3EmEgwZKl2KqbY+YmrtkI8XFnSQBp105+Zo7QHdPaXsTzj33XB5++GFeeuklBgYG+PrXv87555/fZLH7qLlL01r95cEiURm/afrgqtKlmIMGSo29yZupa9asaUFx01Nzl6aM58vHY4jj+XJADV7ysfDsqJMx1eiGqjRF+XKROKm5S1OUL5dUuXu7S5hRs/WpuUtTlC+XFM2ZM4cdO3ZE2+DdnR07djBnzpyG96Fr7tKUy5YsmHTNHZQvl/gNDAwwPDzM6Ohou0uZ1pw5cxgYaDxuqeYuTVG+XFLU3d3NEUcc0e4yClV3czezLmAIGHH3ZRXv7Q/cApwA7ADOcffnWlinREz5cpH4hHxyvwTYBhxY5b3zgT+4+9FmtgL4JnBOC+oTSYoy/xKLum6omtkA8FHghmmGnA7cXP55HXCamVnz5YmkQ2vKS0zqTct8G/gS8NY07/cD2wHc/U1gFzC36epEEqLMv8SkZnM3s2XAi+6+aaZhVbZNyRiZ2YVmNmRmQzHfpRZphDL/EpN6PrkvBpab2XPA7cCpZnZrxZhhYB6Ame0HHAS8XLkjd1/t7oPuPtjX19dU4SKxUeZfYlKzubv75e4+4O7zgRXAQ+7+iYphG4BPl38+qzwmzqcDRAqiNeUlJg3n3M1sFTDk7huAG4EfmNnTlD6xr2hRfSLJUOZfYmLt+oA9ODjoQ0NDbfm7RURSZWab3H2w1jg9oSrRumL9FtZs3M6YO11mnHvSPL5xxnHtLkskCWruEqUr1m/h1sd+v/f1mPve12rwIrVpVUiJ0pqNVb6zcobtIjKZmrtEaWyae0HTbReRydTcJUpd06xeMd12EZlMzV2idO5J84K2i8hkuqEqURq/aaq0jEhjlHMXEUmIcu7SlPOuf5RHntm3PNDiow7mtgtObmNF7aM12iVFuuYuU1Q2doBHnnmZ865/tE0VtY/WaJdUqbnLFJWNvdb2nGmNdkmVmrvIDLRGu6RKzV1kBlqjXVKl5i5TLD7q4KDtOdMa7ZIqNXeZ4rYLTp7SyDs1LXPGon6uOvM4+nt7MKC/t4erzjxOaRmJnnLuIiIJUc5dmlJUtjtkv8qXizROzV2mGM92j0cAx7PdQFPNNWS/RdUg0il0zV2mKCrbHbJf5ctFmqPmLlMUle0O2a/y5SLNUXOXKYrKdofsV/lykeaoucsURWW7Q/arfLlIc3RDVaYYv2HZ6qRKyH6LqkGkUyjnLiKSEOXcC5ZiBjvFmkWkMWruDUgxg51izSLSON1QbUCKGewUaxaRxqm5NyDFDHaKNYtI49TcG5BiBjvFmkWkcWruDUgxg51izSLSON1QbUCKGewUaxaRxtXMuZvZHOCnwP6U/mOwzt2/WjFmJXANMP6V8Ne5+w0z7Vc5dxGRcK3Mub8OnOrur5pZN/BzM7vX3R+rGPdDd/98I8XK7Lhi/RbWbNzOmDtdZpx70jy+ccZxTY+NJT8fSx0iMajZ3L300f7V8svu8q/2PNYqDbti/RZufez3e1+Pue99Xdm0Q8bGkp+PpQ6RWNR1Q9XMuszsSeBF4AF331hl2MfNbLOZrTOzeS2tUpq2ZuP2ureHjI0lPx9LHSKxqKu5u/uYu78PGABONLNjK4bcBcx394XAT4Cbq+3HzC40syEzGxodHW2mbgk0Ns29lWrbQ8bGkp+PpQ6RWARFId19J/Aw8OGK7Tvc/fXyy+uBE6b586vdfdDdB/v6+hooVxrVZVb39pCxseT
nY6lDJBY1m7uZ9ZlZb/nnHuBDwK8rxhw64eVyYFsri5TmnXtS9Stl1baHjI0lPx9LHSKxqCctcyhws5l1UfqPwVp3v9vMVgFD7r4BuNjMlgNvAi8DK4sqWBozfiO0ngRMyNhY8vOx1CESC63nLiKSEK3nXrCiMtUh+fIi9x0yvxSPRXI2r4UHV8GuYThoAE67Ehae3e6qJGJq7g0oKlMdki8vct8h80vxWCRn81q462LYU07+7Npeeg1q8DItLRzWgKIy1SH58iL3HTK/FI9Fch5cta+xj9uzu7RdZBpq7g0oKlMdki8vct8h80vxWCRn13DYdhHU3BtSVKY6JF9e5L5D5pfisUjOQQNh20VQc29IUZnqkHx5kfsOmV+KxyI5p10J3RX/sezuKW0XmYZuqDagqEx1SL68yH2HzC/FY5Gc8ZumSstIAOXcRUQSopy7TBFDdl0Sp7x9MtTcO0QM2XVJnPL2SdEN1Q4RQ3ZdEqe8fVLU3DtEDNl1SZzy9klRc+8QMWTXJXHK2ydFzb1DxJBdl8Qpb58U3VDtEDFk1yVxytsnRTl3EZGEKOdeVlReO2S/saxLrux6ZHLPjOc+vxBtOBZZN/ei8toh+41lXXJl1yOTe2Y89/mFaNOxyPqGalF57ZD9xrIuubLrkck9M577/EK06Vhk3dyLymuH7DeWdcmVXY9M7pnx3OcXok3HIuvmXlReO2S/saxLrux6ZHLPjOc+vxBtOhZZN/ei8toh+41lXXJl1yOTe2Y89/mFaNOxyPqGalF57ZD9xrIuubLrkck9M577/EK06Vgo5y4ikhDl3AsWQ37+vOsf5ZFnXt77evFRB3PbBSc3XYNIVu6+FDbdBD4G1gUnrIRl1za/38hz/Flfcy/KeGZ8ZOdunH2Z8fVPjMzafisbO8Ajz7zMedc/2lQNIlm5+1IYurHU2KH0+9CNpe3NGM+u79oO+L7s+ua1TZfcKmruDYghP1/Z2GttF+lIm24K216vBHL8au4NiCE/LyJ18LGw7fVKIMev5t6AGPLzIlIH6wrbXq8Ecvxq7g2IIT+/+KiDq+5juu0iHemElWHb65VAjl/NvQFnLOrnqjOPo7+3BwP6e3u46szjWpKfr3e/t11w8pRGrrSMSIVl18Lg+fs+qVtX6XWzaZmFZ8PHvgsHzQOs9PvHvhtVWkY5dxGRhLQs525mc4CfAvuXx69z969WjNkfuAU4AdgBnOPuzzVQd02h+fLU1jAPWfs992NRaI44JPtcVB1Fzi/yDHZTQueW87GYQT0PMb0OnOrur5pZN/BzM7vX3R+bMOZ84A/ufrSZrQC+CZzT6mJD1yRPbQ3zkLXfcz8Wha6BPZ59HjeefYapDb6oOoqcX85rqYfOLedjUUPNa+5e8mr5ZXf5V+W1nNOBm8s/rwNOM2v9soeh+fLU1jAPWfs992NRaI44JPtcVB1Fzi+BDHbDQueW87Gooa4bqmbWZWZPAi8CD7j7xooh/cB2AHd/E9gFzK2ynwvNbMjMhkZHR4OLDc2Bp5YbD1n7PfdjUWiOOCT7XFQdRc4vgQx2w0LnlvOxqKGu5u7uY+7+PmAAONHMjq0YUu1T+pSO5O6r3X3Q3Qf7+vqCiw3NgaeWGw9Z+z33Y1Fojjgk+1xUHUXOL4EMdsNC55bzsaghKArp7juBh4EPV7w1DMwDMLP9gIOAlj8HH5ovT20N85C133M/FoXmiEOyz0XVUeT8EshgNyx0bjkfixrqScv0AXvcfaeZ9QAfonTDdKINwKeBR4GzgIe8gIxl6Jrkqa1hHrL2e+7HotA1sMdvmtaTlimqjiLnl/Na6qFzy/lY1FAz525mCyndLO2i9El/rbuvMrNVwJC7byjHJX8ALKL0iX2Fu/92pv0q5y4iEq5lOXd330ypaVduv3LCz68BfxdapIiIFCP7L+tI7sEdmR0hD7bE8BBMkQ/upPaQVgznIwFZN/fkHtyR2RHyYEsMD8EU+eBOag9pxXA+EpH1wmHJPbgjsyPkwZYYHoIp8sGd1B7SiuF8JCLr5p7cgzsyO0IebInhIZgiH9xJ7SGtGM5HIrJu7sk9uCOzI+TBlhgeginywZ3UHtKK4XwkIuvmntyDOzI7Qh5sieEhmCIf3EntIa0Yzkcism7uRX2phiQu5IsWYvhShtAaYphfavvNkL6sQ0QkIS17iEmk44V8sUcsUqs5lux6LHW0gJq7yExCvtgjFqnVHEt2PZY6WiTra+4iTQv5Yo9YpFZzLNn1WOpoETV3kZmEfLFHLFKrOZbseix1tIiau8hMQr7YIxap1RxLdj2WOlpEzV1kJiFf7BGL1GqOJbseSx0touYuMpNl18Lg+fs+9VpX6XWMNybHpVZzLNn1WOpoEeXcRUQSopy7zJ4Us8FF1VxUvjzFYyxtpeYuzUkxG1xUzUXly1M8xtJ2uuYuzUkxG1xUzUXly1M8xtJ2au7SnBSzwUXVXFS+PMVjLG2n5i7NSTEbXFTNReXLUzzG0nZq7tKcFLPBRdVcVL48xWMsbafmLs1JMRtcVM1F5ctTPMbSdsq5i4gkpN6cuz65Sz42r4VvHQtf6y39vnnt7O+3qBpEAinnLnkoKgsesl/l0SUi+uQueSgqCx6yX+XRJSJq7pKHorLgIftVHl0iouYueSgqCx6yX+XRJSJq7pKHorLgIftVHl0iouYueSgqCx6yX+XRJSI1c+5mNg+4BXgX8Baw2t2/UzHmg8CdwLPlTXe4+4x3kZRzFxEJ18r13N8Evujuj5vZAcAmM3vA3bdWjPuZuy9rpFiJUIrrh4fUnOL8YqDjloyazd3dXwBeKP/8RzPbBvQDlc1dcpFiXlt59OLpuCUl6Jq7mc0HFgEbq7x9spn90szuNbNjWlCbtEuKeW3l0Yun45aUup9QNbN3AD8CvuDur1S8/Tjwl+7+qpktBdYD766yjwuBCwEOP/zwhouWgqWY11YevXg6bkmp65O7mXVTauy3ufsdle+7+yvu/mr553uAbjM7pMq41e4+6O6DfX19TZYuhUkxr608evF03JJSs7mbmQE3AtvcverapWb2rvI4zOzE8n53tLJQmUUp5rWVRy+ejltS6rkssxj4JLDFzJ4sb/sKcDiAu38fOAv4nJm9CewGVni71hKW5o3fHEspFRFSc4rzi4GOW1K0nruISEJamXOXWClzPNndl8Kmm0pfSG1dpa+3a/ZbkEQSpeaeKmWOJ7v7Uhi6cd9rH9v3Wg1eOpDWlkmVMseTbbopbLtI5tTcU6XM8WQ+FrZdJHNq7qlS5ngy6wrbLpI5NfdUKXM82Qkrw7aLZE7NPVVaO3yyZdfC4Pn7PqlbV+m1bqZKh1LOXUQkIcq5N2D9EyNcc/9TPL9zN4f19nDZkgWcsai/3WW1Tu65+NznFwMd42SouZetf2KEy+/Ywu49pXTFyM7dXH7HFoA8Gnzuufjc5xcDHeOk6Jp72TX3P7W3sY/bvWeMa+5/qk0VtVjuufjc5xcDHeOkqLmXPb9zd9D25OSei899fjHQMU6KmnvZYb09QduTk3suPvf5xUDHOC
lq7mWXLVlAT/fkB156uru4bMmCNlXUYrnn4nOfXwx0jJOiG6pl4zdNs03L5L4Wd+7zi4GOcVKUcxcRSUi9OXddlhFJwea18K1j4Wu9pd83r01j39I2uiwjErsi8+XKrmdLn9xFYldkvlzZ9WypuYvErsh8ubLr2VJzF4ldkflyZdezpeYuErsi8+XKrmdLzV0kdkWu3a/vBciWcu4iIglRzl1EpIOpuYuIZEjNXUQkQ2ruIiIZUnMXEcmQmruISIbU3EVEMqTmLiKSoZrN3czmmdl/mdk2M/uVmV1SZYyZ2XfN7Gkz22xmxxdTrjRF63aLdIx61nN/E/iiuz9uZgcAm8zsAXffOmHMR4B3l3+dBHyv/LvEQut2i3SUmp/c3f0Fd3+8/PMfgW1A5ReLng7c4iWPAb1mdmjLq5XGad1ukY4SdM3dzOYDi4CNFW/1A9snvB5m6n8AMLMLzWzIzIZGR0fDKpXmaN1ukY5Sd3M3s3cAPwK+4O6vVL5d5Y9MWZHM3Ve7+6C7D/b19YVVKs3Rut0iHaWu5m5m3ZQa+23ufkeVIcPAvAmvB4Dnmy9PWkbrdot0lHrSMgbcCGxz92unGbYB+FQ5NfMBYJe7v9DCOqVZWrdbpKPUk5ZZDHwS2GJmT5a3fQU4HMDdvw/cAywFngb+BHym9aVK0xaerWYu0iFqNnd3/znVr6lPHOPARa0qSkREmqMnVEVEMqTmLiKSITV3EZEMqbmLiGRIzV1EJENq7iIiGVJzFxHJkJUi6m34i81Ggd+15S+v7RDgpXYXUSDNL105zw00v3r8pbvXXJyrbc09ZmY25O6D7a6jKJpfunKeG2h+raTLMiIiGVJzFxHJkJp7davbXUDBNL905Tw30PxaRtfcRUQypE/uIiIZ6ujmbmZdZvaEmd1d5b2VZjZqZk+Wf/19O2pshpk9Z2ZbyvUPVXnfzOy7Zva0mW02s+PbUWcj6pjbB81s14Tzl9RXTplZr5mtM7Nfm9k2Mzu54v1kzx3UNb9kz5+ZLZhQ95Nm9oqZfaFiTOHnr54v68jZJcA24MBp3v+hu39+Fuspwl+7+3S52o8A7y7/Ogn4Xvn3VMw0N4CfufuyWaumtb4D3OfuZ5nZnwNvq3g/9XNXa36Q6Plz96eA90HpAyQwAvy4Yljh569jP7mb2QDwUeCGdtfSRqcDt3jJY0CvmR3a7qI6nZkdCJxC6estcfc33H1nxbBkz12d88vFacAz7l75wGbh569jmzvwbeBLwFszjPl4+Z9M68xs3gzjYuXAf5rZJjO7sMr7/cD2Ca+Hy9tSUGtuACeb2S/N7F4zO2Y2i2vSkcAo8K/ly4Y3mNnbK8akfO7qmR+ke/4mWgGsqbK98PPXkc3dzJYBL7r7phmG3QXMd/eFwE+Am2eluNZa7O7HU/on4EVmdkrF+9W+PjGV+FStuT1O6THtvwL+BVg/2wU2YT/geOB77r4I+D/gyxVjUj539cwv5fMHQPly03Lg36u9XWVbS89fRzZ3Sl/6vdzMngNuB041s1snDnD3He7+evnl9cAJs1ti89z9+fLvL1K65ndixZBhYOK/SAaA52enuubUmpu7v+Lur5Z/vgfoNrNDZr3QxgwDw+6+sfx6HaVmWDkmyXNHHfNL/PyN+wjwuLv/b5X3Cj9/Hdnc3f1ydx9w9/mU/tn0kLt/YuKYiutfyyndeE2Gmb3dzA4Y/xn4W+C/K4ZtAD5VvnP/AWCXu78wy6UGq2duZvYuM7PyzydS+t/6jtmutRHu/j/AdjNbUN50GrC1YliS5w7qm1/K52+Cc6l+SQZm4fx1elpmEjNbBQy5+wbgYjNbDrwJvAysbGdtDfgL4Mfl/3/sB/ybu99nZv8I4O7fB+4BlgJPA38CPtOmWkPVM7ezgM+Z2ZvAbmCFp/XE3j8Bt5X/af9b4DOZnLtxteaX9Pkzs7cBfwP8w4Rts3r+9ISqiEiGOvKyjIhI7tTcRUQypOYuIpIhNXcRkQypuYuIZEjNXUQkQ2ruIiIZUnMXEcnQ/wPmMFqpaGCFHwAAAABJRU5ErkJggg==\n",
277 | "text/plain": [
278 | ""
279 | ]
280 | },
281 | "metadata": {},
282 | "output_type": "display_data"
283 | }
284 | ],
285 | "source": [
286 | "import pandas as pd\n",
287 | "from sklearn.cross_validation import train_test_split\n",
288 | "from sklearn.datasets import load_iris\n",
289 | "import matplotlib.pyplot as plt\n",
290 | "# data\n",
291 | "def create_data():\n",
292 | " iris = load_iris()\n",
293 | " df = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
294 | " df['label'] = iris.target\n",
295 | " df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']\n",
296 | " data = np.array(df.iloc[:100, [0, 1, -1]])\n",
297 | " for i in range(len(data)):\n",
298 | " if data[i,-1] == 0:\n",
299 | " data[i,-1] = -1\n",
300 | " # print(data)\n",
301 | " return data[:,:2], data[:,-1]\n",
302 | "X, y = create_data()\n",
303 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
304 | "plt.scatter(X[:50,0],X[:50,1], label='0')\n",
305 | "plt.scatter(X[50:,0],X[50:,1], label='1')\n",
306 | "plt.legend()\n",
307 | "plt.show()"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 7,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "X = np.arange(10).reshape(10, 1)\n",
317 | "y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])\n",
318 | "X, y = create_data()\n",
319 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 8,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "class Loss(object):\n",
329 | " def loss(self, y_true, y_pred):\n",
330 | " return NotImplementedError()\n",
331 | "\n",
332 | " def gradient(self, y, y_pred):\n",
333 | " raise NotImplementedError()\n",
334 | "\n",
335 | " def acc(self, y, y_pred):\n",
336 | " return 0\n",
337 | "class SquareLoss(Loss):\n",
338 | " def __init__(self): pass\n",
339 | "\n",
340 | " def loss(self, y, y_pred):\n",
341 | " return 0.5 * np.power((y - y_pred), 2)\n",
342 | "\n",
343 | " def gradient(self, y, y_pred):\n",
344 | " return -(y - y_pred)\n",
345 | "\n",
346 | "model = GradientBoostingRegressor()\n",
347 | "model.fit(X_train, y_train)\n",
348 | "y_pred = model.predict(X_test)\n",
349 | "\n",
350 | "y_pred_line = model.predict(X)"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 10,
356 | "metadata": {},
357 | "outputs": [
358 | {
359 | "data": {
360 | "text/plain": [
361 | "0.45411547657534923"
362 | ]
363 | },
364 | "execution_count": 10,
365 | "metadata": {},
366 | "output_type": "execute_result"
367 | }
368 | ],
369 | "source": [
370 | "from sklearn.metrics import mean_squared_error\n",
371 | "mse = mean_squared_error(y_test, y_pred)\n",
372 | "mse"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": 11,
378 | "metadata": {},
379 | "outputs": [
380 | {
381 | "data": {
382 | "text/plain": [
383 | "0.9333333333333333"
384 | ]
385 | },
386 | "execution_count": 11,
387 | "metadata": {},
388 | "output_type": "execute_result"
389 | }
390 | ],
391 | "source": [
392 | "#分类模型\n",
393 | "class GradientBoostingClassifier(GradientBoosting):\n",
394 | " def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,\n",
395 | " min_info_gain=1e-7, max_depth=2, debug=False):\n",
396 | " super(GradientBoostingClassifier, self).__init__(n_estimators=n_estimators, \n",
397 | " learning_rate=learning_rate, \n",
398 | " min_samples_split=min_samples_split, \n",
399 | " min_impurity=min_info_gain,\n",
400 | " max_depth=max_depth,\n",
401 | " regression=False)\n",
402 | "\n",
403 | " def fit(self, X, y):\n",
404 | " y = to_categorical(y)\n",
405 | " super(GradientBoostingClassifier, self).fit(X, y)\n",
406 | " \n",
407 | "def to_categorical(x, n_col=None):\n",
408 | " \"\"\" One-hot encoding of nominal values \"\"\"\n",
409 | " if not n_col:\n",
410 | " n_col = np.amax(x) + 1\n",
411 | " one_hot = np.zeros((x.shape[0], n_col))\n",
412 | " one_hot[np.arange(x.shape[0]), x] = 1\n",
413 | " return one_hot\n",
414 | "\n",
415 | "from sklearn import datasets\n",
416 | "from sklearn.metrics import accuracy_score\n",
417 | "data = datasets.load_iris()\n",
418 | "X = data.data\n",
419 | "y = data.target\n",
420 | "\n",
421 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)\n",
422 | "\n",
423 | "clf = GradientBoostingClassifier()\n",
424 | "clf.fit(X_train, y_train)\n",
425 | "y_pred = clf.predict(X_test)\n",
426 | "accuracy = accuracy_score(y_test, y_pred)\n",
427 | "accuracy"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": null,
433 | "metadata": {},
434 | "outputs": [],
435 | "source": []
436 | }
437 | ],
438 | "metadata": {
439 | "kernelspec": {
440 | "display_name": "Python 3",
441 | "language": "python",
442 | "name": "python3"
443 | },
444 | "language_info": {
445 | "codemirror_mode": {
446 | "name": "ipython",
447 | "version": 3
448 | },
449 | "file_extension": ".py",
450 | "mimetype": "text/x-python",
451 | "name": "python",
452 | "nbconvert_exporter": "python",
453 | "pygments_lexer": "ipython3",
454 | "version": "3.6.4"
455 | }
456 | },
457 | "nbformat": 4,
458 | "nbformat_minor": 2
459 | }
460 |
--------------------------------------------------------------------------------
/Part6/GBDT.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import math
9 | # compute the information entropy of a label vector
10 | def calculate_entropy(y):
11 | log2 = math.log2
12 | unique_labels = np.unique(y)
13 | entropy = 0
14 | for label in unique_labels:
15 | count = len(y[y == label])
16 | p = count / len(y)
17 | entropy += -p * log2(p)
18 | return entropy
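19 | # Quick sanity check (editor's sketch, not part of the original notebook):
20 | # a perfectly balanced binary label vector carries exactly one bit of entropy.
21 | assert calculate_entropy(np.array([0, 0, 1, 1])) == 1.0
22 | 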
19 | # decision-tree node (internal decision node or leaf)
20 | class DecisionNode():
21 | def __init__(self, feature_i=None, threshold=None,
22 | value=None, true_branch=None, false_branch=None):
23 | self.feature_i = feature_i
24 | self.threshold = threshold
25 | self.value = value
26 | self.true_branch = true_branch
27 | self.false_branch = false_branch
28 | def divide_on_feature(X, feature_i, threshold):
29 | split_func = None
30 | if isinstance(threshold, int) or isinstance(threshold, float):
31 | split_func = lambda sample: sample[feature_i] >= threshold
32 | else:
33 | split_func = lambda sample: sample[feature_i] == threshold
34 |
35 | X_1 = np.array([sample for sample in X if split_func(sample)])
36 | X_2 = np.array([sample for sample in X if not split_func(sample)])
37 |
38 |     return X_1, X_2  # return a tuple; stacking the ragged halves into one array fails on recent NumPy
39 | # super class of the classification and regression trees
40 | class DecisionTree(object):
41 | def __init__(self, min_samples_split=2, min_impurity=1e-7,
42 | max_depth=float("inf"), loss=None):
43 |         self.root = None  # root node
44 | self.min_samples_split = min_samples_split
45 | self.min_impurity = min_impurity
46 | self.max_depth = max_depth
47 |         # impurity measure: information gain for classification, variance reduction for regression
48 | self._impurity_calculation = None
49 |         self._leaf_value_calculation = None  # computes the value of a leaf
50 | self.one_dim = None
51 | self.loss = loss
52 |
53 | def fit(self, X, y, loss=None):
54 | self.one_dim = len(np.shape(y)) == 1
55 | self.root = self._build_tree(X, y)
56 | self.loss=None
57 |
58 | def _build_tree(self, X, y, current_depth=0):
59 | """
60 |         Recursively build the tree
61 | """
62 |
63 | largest_impurity = 0
64 | best_criteria = None
65 | best_sets = None
66 |
67 | if len(np.shape(y)) == 1:
68 | y = np.expand_dims(y, axis=1)
69 |
70 | Xy = np.concatenate((X, y), axis=1)
71 |
72 | n_samples, n_features = np.shape(X)
73 |
74 | if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
75 |             # compute the gain for every feature
76 | for feature_i in range(n_features):
77 | feature_values = np.expand_dims(X[:, feature_i], axis=1)
78 | unique_values = np.unique(feature_values)
79 |
80 | for threshold in unique_values:
81 | Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)
82 |
83 | if len(Xy1) > 0 and len(Xy2) > 0:
84 | y1 = Xy1[:, n_features:]
85 | y2 = Xy2[:, n_features:]
86 |
87 |                     # compute the gain of this split
88 | impurity = self._impurity_calculation(y, y1, y2)
89 |
90 | if impurity > largest_impurity:
91 | largest_impurity = impurity
92 | best_criteria = {"feature_i": feature_i, "threshold": threshold}
93 | best_sets = {
94 | "leftX": Xy1[:, :n_features],
95 | "lefty": Xy1[:, n_features:],
96 | "rightX": Xy2[:, :n_features],
97 | "righty": Xy2[:, n_features:]
98 | }
99 |
100 | if largest_impurity > self.min_impurity:
101 | true_branch = self._build_tree(best_sets["leftX"], best_sets["lefty"], current_depth + 1)
102 | false_branch = self._build_tree(best_sets["rightX"], best_sets["righty"], current_depth + 1)
103 | return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[
104 | "threshold"], true_branch=true_branch, false_branch=false_branch)
105 |
106 |         # compute the leaf value for this node
107 | leaf_value = self._leaf_value_calculation(y)
108 |
109 |
110 | return DecisionNode(value=leaf_value)
111 |
112 |
113 | def predict_value(self, x, tree=None):
114 | """
115 | 预测
116 | """
117 |
118 | if tree is None:
119 | tree = self.root
120 |
121 | if tree.value is not None:
122 | return tree.value
123 |
124 | feature_value = x[tree.feature_i]
125 |
126 | branch = tree.false_branch
127 | if isinstance(feature_value, int) or isinstance(feature_value, float):
128 | if feature_value >= tree.threshold:
129 | branch = tree.true_branch
130 | elif feature_value == tree.threshold:
131 | branch = tree.true_branch
132 |
133 | return self.predict_value(x, branch)
134 |
135 | def predict(self, X):
136 | y_pred = []
137 | for x in X:
138 | y_pred.append(self.predict_value(x))
139 | return y_pred
140 | def calculate_variance(X):
141 | """ Return the variance of the features in dataset X """
142 | mean = np.ones(np.shape(X)) * X.mean(0)
143 | n_samples = np.shape(X)[0]
144 | variance = (1 / n_samples) * np.diag((X - mean).T.dot(X - mean))
145 |
146 | return variance
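147 | # Editor's check (hedged, not from the original notebook): calculate_variance
148 | # agrees with NumPy's per-column population variance.
149 | _demo = np.array([[1.0, 2.0], [3.0, 6.0]])
150 | assert np.allclose(calculate_variance(_demo), _demo.var(axis=0))
151 | 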
147 | class RegressionTree(DecisionTree):
148 | def _calculate_variance_reduction(self, y, y1, y2):
149 | var_tot = calculate_variance(y)
150 | var_1 = calculate_variance(y1)
151 | var_2 = calculate_variance(y2)
152 | frac_1 = len(y1) / len(y)
153 | frac_2 = len(y2) / len(y)
154 |
155 |         # variance reduction achieved by the split
156 | variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)
157 |
158 | return sum(variance_reduction)
159 |
160 | def _mean_of_y(self, y):
161 | value = np.mean(y, axis=0)
162 | return value if len(value) > 1 else value[0]
163 |
164 | def fit(self, X, y):
165 | self._impurity_calculation = self._calculate_variance_reduction
166 | self._leaf_value_calculation = self._mean_of_y
167 | super(RegressionTree, self).fit(X, y)
168 |
169 | class GradientBoosting(object):
170 | def __init__(self, n_estimators, learning_rate, min_samples_split,
171 | min_impurity, max_depth, regression):
172 | self.n_estimators = n_estimators
173 | self.learning_rate = learning_rate
174 | self.min_samples_split = min_samples_split
175 | self.min_impurity = min_impurity
176 | self.max_depth = max_depth
177 | self.regression = regression
178 |
179 | self.loss = SquareLoss()
180 | if not self.regression:
181 | self.loss = CrossEntropy()
182 |
183 | self.trees = []
184 | for _ in range(n_estimators):
185 | tree = RegressionTree(
186 | min_samples_split=self.min_samples_split,
187 | min_impurity=min_impurity,
188 | max_depth=self.max_depth)
189 | self.trees.append(tree)
190 |
191 |
192 | def fit(self, X, y):
193 | y_pred = np.full(np.shape(y), np.mean(y, axis=0))
194 | for i in range(self.n_estimators):
195 | gradient = self.loss.gradient(y, y_pred)
196 | self.trees[i].fit(X, gradient)
197 | update = self.trees[i].predict(X)
198 | # Update y prediction
199 | y_pred -= np.multiply(self.learning_rate, update)
200 |
201 |
202 | def predict(self, X):
203 | y_pred = np.array([])
204 | for tree in self.trees:
205 | update = tree.predict(X)
206 | update = np.multiply(self.learning_rate, update)
207 | y_pred = -update if not y_pred.any() else y_pred - update
208 |
209 | if not self.regression:
210 | y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
211 | y_pred = np.argmax(y_pred, axis=1)
212 | return y_pred
213 |
214 |
215 | # In[2]:
216 |
217 |
218 | class Loss(object):
219 | def loss(self, y_true, y_pred):
220 |         raise NotImplementedError()
221 |
222 | def gradient(self, y, y_pred):
223 | raise NotImplementedError()
224 |
225 | def acc(self, y, y_pred):
226 | return 0
227 | # loss for the regression setting
228 | class SquareLoss(Loss):
229 | def __init__(self): pass
230 |
231 | def loss(self, y, y_pred):
232 | return 0.5 * np.power((y - y_pred), 2)
233 |
234 | def gradient(self, y, y_pred):
235 | return -(y - y_pred)
236 | # loss for the classification setting
237 | class CrossEntropy(Loss):
238 | def __init__(self): pass
239 |
240 | def loss(self, y, p):
241 | # Avoid division by zero
242 | p = np.clip(p, 1e-15, 1 - 1e-15)
243 | return - y * np.log(p) - (1 - y) * np.log(1 - p)
244 |
245 | def acc(self, y, p):
246 | return accuracy_score(np.argmax(y, axis=1), np.argmax(p, axis=1))
247 |
248 | def gradient(self, y, p):
249 | # Avoid division by zero
250 | p = np.clip(p, 1e-15, 1 - 1e-15)
251 | return - (y / p) + (1 - y) / (1 - p)
252 |
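253 | # Worked check (editor's sketch, not from the original notebook): for SquareLoss
254 | # the negative gradient is exactly the residual y - y_pred, so each boosting
255 | # round above fits its regression tree to the residuals of the ensemble so far.
256 | _y, _pred = np.array([3.0, -1.0]), np.array([2.5, 0.0])
257 | assert np.allclose(-SquareLoss().gradient(_y, _pred), _y - _pred)
258 | 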
253 | class GradientBoostingRegressor(GradientBoosting):
254 | def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,
255 | min_var_red=1e-7, max_depth=4, debug=False):
256 | super(GradientBoostingRegressor, self).__init__(n_estimators=n_estimators,
257 | learning_rate=learning_rate,
258 | min_samples_split=min_samples_split,
259 | min_impurity=min_var_red,
260 | max_depth=max_depth,
261 | regression=True)
262 |
263 |
264 | # In[6]:
265 |
266 |
267 | import pandas as pd
268 | from sklearn.model_selection import train_test_split
269 | from sklearn.datasets import load_iris
270 | import matplotlib.pyplot as plt
271 | # data
272 | def create_data():
273 | iris = load_iris()
274 | df = pd.DataFrame(iris.data, columns=iris.feature_names)
275 | df['label'] = iris.target
276 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
277 | data = np.array(df.iloc[:100, [0, 1, -1]])
278 | for i in range(len(data)):
279 | if data[i,-1] == 0:
280 | data[i,-1] = -1
281 | # print(data)
282 | return data[:,:2], data[:,-1]
283 | X, y = create_data()
284 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
285 | plt.scatter(X[:50,0],X[:50,1], label='0')
286 | plt.scatter(X[50:,0],X[50:,1], label='1')
287 | plt.legend()
288 | plt.show()
289 |
290 |
291 | # In[7]:
292 |
293 |
294 | X = np.arange(10).reshape(10, 1)
295 | y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])
296 | X, y = create_data()
297 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
298 |
299 |
300 | # In[8]:
301 |
302 |
303 | class Loss(object):
304 | def loss(self, y_true, y_pred):
305 |         raise NotImplementedError()
306 |
307 | def gradient(self, y, y_pred):
308 | raise NotImplementedError()
309 |
310 | def acc(self, y, y_pred):
311 | return 0
312 | class SquareLoss(Loss):
313 | def __init__(self): pass
314 |
315 | def loss(self, y, y_pred):
316 | return 0.5 * np.power((y - y_pred), 2)
317 |
318 | def gradient(self, y, y_pred):
319 | return -(y - y_pred)
320 |
321 | model = GradientBoostingRegressor()
322 | model.fit(X_train, y_train)
323 | y_pred = model.predict(X_test)
324 |
325 | y_pred_line = model.predict(X)
326 |
327 |
328 | # In[10]:
329 |
330 |
331 | from sklearn.metrics import mean_squared_error
332 | mse = mean_squared_error(y_test, y_pred)
333 | mse
334 |
335 |
336 | # In[11]:
337 |
338 |
339 | # classification model
340 | class GradientBoostingClassifier(GradientBoosting):
341 | def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
342 | min_info_gain=1e-7, max_depth=2, debug=False):
343 | super(GradientBoostingClassifier, self).__init__(n_estimators=n_estimators,
344 | learning_rate=learning_rate,
345 | min_samples_split=min_samples_split,
346 | min_impurity=min_info_gain,
347 | max_depth=max_depth,
348 | regression=False)
349 |
350 | def fit(self, X, y):
351 | y = to_categorical(y)
352 | super(GradientBoostingClassifier, self).fit(X, y)
353 |
354 | def to_categorical(x, n_col=None):
355 | """ One-hot encoding of nominal values """
356 | if not n_col:
357 | n_col = np.amax(x) + 1
358 | one_hot = np.zeros((x.shape[0], n_col))
359 | one_hot[np.arange(x.shape[0]), x] = 1
360 | return one_hot
361 |
362 | from sklearn import datasets
363 | from sklearn.metrics import accuracy_score
364 | data = datasets.load_iris()
365 | X = data.data
366 | y = data.target
367 |
368 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
369 |
370 | clf = GradientBoostingClassifier()
371 | clf.fit(X_train, y_train)
372 | y_pred = clf.predict(X_test)
373 | accuracy = accuracy_score(y_test, y_pred)
374 | accuracy
375 |
376 |
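377 | # In[ ]:
378 | 
379 | 
380 | # Editor's sketch (an assumption, not part of the original notebook): fit() seeds
381 | # y_pred with the mean of y, but predict() accumulates the tree updates starting
382 | # from zero, so regression predictions are missing that constant baseline. A
383 | # minimal wrapper that stores the baseline at fit time and re-applies it:
384 | class OffsetGBRegressor(GradientBoostingRegressor):
385 |     def fit(self, X, y):
386 |         self.initial_estimate = np.mean(y, axis=0)  # remember the baseline
387 |         super(OffsetGBRegressor, self).fit(X, y)
388 | 
389 |     def predict(self, X):
390 |         # add the training-time baseline back onto the accumulated updates
391 |         return super(OffsetGBRegressor, self).predict(X) + self.initial_estimate
392 | 
393 | # Usage mirrors the regressor cells above, e.g.
394 | # OffsetGBRegressor().fit(X_train, y_train) and then .predict(X_test).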
--------------------------------------------------------------------------------
/Part6/xgboost.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "from sklearn import datasets\n",
11 | "from sklearn.metrics import accuracy_score,mean_squared_error\n",
12 | "# 定义Sigmoid函数用来做对数几率回归\n",
13 | "\n",
14 | "class Sigmoid():\n",
15 | " def __call__(self, x):\n",
16 | " return 1 / (1 + np.exp(-x))\n",
17 | "\n",
18 | " def gradient(self, x):\n",
19 | " return self.__call__(x) * (1 - self.__call__(x))\n",
20 | "\n",
21 | "class LogisticLoss():\n",
22 | " def __init__(self):\n",
23 | " sigmoid = Sigmoid()\n",
24 | " self.log_func = sigmoid\n",
25 | " self.log_grad = sigmoid.gradient\n",
26 | "\n",
27 | " def loss(self, y, y_pred):\n",
28 | " y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)\n",
29 | " p = self.log_func(y_pred)\n",
30 | " return y * np.log(p) + (1 - y) * np.log(1 - p)\n",
31 | "\n",
32 | " # gradient w.r.t y_pred\n",
33 | " def gradient(self, y, y_pred):\n",
34 | " p = self.log_func(y_pred)\n",
35 | " return -(y - p)\n",
36 | "\n",
37 | " # w.r.t y_pred\n",
38 | " def hess(self, y, y_pred):\n",
39 | " p = self.log_func(y_pred)\n",
40 | " return p * (1 - p)\n",
41 | "def to_categorical(x, n_col=None):\n",
42 | " \"\"\" One-hot encoding of nominal values \"\"\"\n",
43 | " if not n_col:\n",
44 | " n_col = np.amax(x) + 1\n",
45 | " one_hot = np.zeros((x.shape[0], n_col))\n",
46 | " one_hot[np.arange(x.shape[0]), x] = 1\n",
47 | " return one_hot\n",
48 | "\n",
49 | "def divide_on_feature(X, feature_i, threshold):\n",
50 | " \"\"\" Divide dataset based on if sample value on feature index is larger than\n",
51 | " the given threshold \"\"\"\n",
52 | " split_func = None\n",
53 | " if isinstance(threshold, int) or isinstance(threshold, float):\n",
54 | " split_func = lambda sample: sample[feature_i] >= threshold\n",
55 | " else:\n",
56 | " split_func = lambda sample: sample[feature_i] == threshold\n",
57 | "\n",
58 | " X_1 = np.array([sample for sample in X if split_func(sample)])\n",
59 | " X_2 = np.array([sample for sample in X if not split_func(sample)])\n",
60 | "\n",
61 | " return np.array([X_1, X_2])\n",
62 | "class DecisionNode():\n",
63 | " \"\"\"Class that represents a decision node or leaf in the decision tree\n",
64 | " Parameters:\n",
65 | " -----------\n",
66 | " feature_i: int\n",
67 | " Feature index which we want to use as the threshold measure.\n",
68 | " threshold: float\n",
69 | " The value that we will compare feature values at feature_i against to \n",
70 | " determine the prediction.\n",
71 | " value: float\n",
72 | " The class prediction if classification tree, or float value if regression tree.\n",
73 | " true_branch: DecisionNode\n",
74 | " Next decision node for samples where features value met the threshold.\n",
75 | " false_branch: DecisionNode\n",
76 | " Next decision node for samples where features value did not meet the threshold.\n",
77 | " \"\"\"\n",
78 | " def __init__(self, feature_i=None, threshold=None,\n",
79 | " value=None, true_branch=None, false_branch=None):\n",
80 | " self.feature_i = feature_i # Index for the feature that is tested\n",
81 | " self.threshold = threshold # Threshold value for feature\n",
82 | " self.value = value # Value if the node is a leaf in the tree\n",
83 | " self.true_branch = true_branch # 'Left' subtree\n",
84 | " self.false_branch = false_branch # 'Right' subtree\n",
85 | "\n",
86 | "\n",
87 | "# Super class of RegressionTree and ClassificationTree\n",
88 | "class DecisionTree(object):\n",
89 | " \"\"\"Super class of RegressionTree and ClassificationTree.\n",
90 | " Parameters:\n",
91 | " -----------\n",
92 | " min_samples_split: int\n",
93 | " The minimum number of samples needed to make a split when building a tree.\n",
94 | " min_impurity: float\n",
95 | " The minimum impurity required to split the tree further. \n",
96 | " max_depth: int\n",
97 | " The maximum depth of a tree.\n",
98 | " loss: function\n",
99 | " Loss function that is used for Gradient Boosting models to calculate impurity.\n",
100 | " \"\"\"\n",
101 | " def __init__(self, min_samples_split=2, min_impurity=1e-7,\n",
102 | " max_depth=float(\"inf\"), loss=None):\n",
103 | " self.root = None # Root node in dec. tree\n",
104 | " # Minimum n of samples to justify split\n",
105 | " self.min_samples_split = min_samples_split\n",
106 | " # The minimum impurity to justify split\n",
107 | " self.min_impurity = min_impurity\n",
108 | " # The maximum depth to grow the tree to\n",
109 | " self.max_depth = max_depth\n",
110 | " # Function to calculate impurity (classif.=>info gain, regr=>variance reduct.)\n",
111 | " self._impurity_calculation = None\n",
112 | " # Function to determine prediction of y at leaf\n",
113 | " self._leaf_value_calculation = None\n",
114 | " # If y is one-hot encoded (multi-dim) or not (one-dim)\n",
115 | " self.one_dim = None\n",
116 | " # If Gradient Boost\n",
117 | " self.loss = loss\n",
118 | "\n",
119 | " def fit(self, X, y, loss=None):\n",
120 | " \"\"\" Build decision tree \"\"\"\n",
121 | " self.one_dim = len(np.shape(y)) == 1\n",
122 | " self.root = self._build_tree(X, y)\n",
123 | " self.loss=None\n",
124 | "\n",
125 | " def _build_tree(self, X, y, current_depth=0):\n",
126 | " \"\"\" Recursive method which builds out the decision tree and splits X and respective y\n",
127 | " on the feature of X which (based on impurity) best separates the data\"\"\"\n",
128 | "\n",
129 | " largest_impurity = 0\n",
130 | " best_criteria = None # Feature index and threshold\n",
131 | " best_sets = None # Subsets of the data\n",
132 | "\n",
133 | " # Check if expansion of y is needed\n",
134 | " if len(np.shape(y)) == 1:\n",
135 | " y = np.expand_dims(y, axis=1)\n",
136 | "\n",
137 | " # Add y as last column of X\n",
138 | " Xy = np.concatenate((X, y), axis=1)\n",
139 | "\n",
140 | " n_samples, n_features = np.shape(X)\n",
141 | "\n",
142 | " if n_samples >= self.min_samples_split and current_depth <= self.max_depth:\n",
143 | " # Calculate the impurity for each feature\n",
144 | " for feature_i in range(n_features):\n",
145 | " # All values of feature_i\n",
146 | " feature_values = np.expand_dims(X[:, feature_i], axis=1)\n",
147 | " unique_values = np.unique(feature_values)\n",
148 | "\n",
149 | " # Iterate through all unique values of feature column i and\n",
150 | " # calculate the impurity\n",
151 | " for threshold in unique_values:\n",
152 | " # Divide X and y depending on if the feature value of X at index feature_i\n",
153 | " # meets the threshold\n",
154 | " Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)\n",
155 | " \n",
156 | " if len(Xy1) > 0 and len(Xy2) > 0:\n",
157 | " # Select the y-values of the two sets\n",
158 | " y1 = Xy1[:, n_features:]\n",
159 | " y2 = Xy2[:, n_features:]\n",
160 | "\n",
161 | " # Calculate impurity\n",
162 | " impurity = self._impurity_calculation(y, y1, y2)\n",
163 | "\n",
164 | " # If this threshold resulted in a higher information gain than previously\n",
165 | " # recorded save the threshold value and the feature\n",
166 | " # index\n",
167 | " if impurity > largest_impurity:\n",
168 | " largest_impurity = impurity\n",
169 | " best_criteria = {\"feature_i\": feature_i, \"threshold\": threshold}\n",
170 | " best_sets = {\n",
171 | " \"leftX\": Xy1[:, :n_features], # X of left subtree\n",
172 | " \"lefty\": Xy1[:, n_features:], # y of left subtree\n",
173 | " \"rightX\": Xy2[:, :n_features], # X of right subtree\n",
174 | " \"righty\": Xy2[:, n_features:] # y of right subtree\n",
175 | " }\n",
176 | "\n",
177 | " if largest_impurity > self.min_impurity:\n",
178 | " # Build subtrees for the right and left branches\n",
179 | " true_branch = self._build_tree(best_sets[\"leftX\"], best_sets[\"lefty\"], current_depth + 1)\n",
180 | " false_branch = self._build_tree(best_sets[\"rightX\"], best_sets[\"righty\"], current_depth + 1)\n",
181 | " return DecisionNode(feature_i=best_criteria[\"feature_i\"], threshold=best_criteria[\n",
182 | " \"threshold\"], true_branch=true_branch, false_branch=false_branch)\n",
183 | "\n",
184 | " # We're at leaf => determine value\n",
185 | " leaf_value = self._leaf_value_calculation(y)\n",
186 | "\n",
187 | " return DecisionNode(value=leaf_value)\n",
188 | "\n",
189 | "\n",
190 | " def predict_value(self, x, tree=None):\n",
191 | " \"\"\" Do a recursive search down the tree and make a prediction of the data sample by the\n",
192 | " value of the leaf that we end up at \"\"\"\n",
193 | "\n",
194 | " if tree is None:\n",
195 | " tree = self.root\n",
196 | "\n",
197 | " # If we have a value (i.e we're at a leaf) => return value as the prediction\n",
198 | " if tree.value is not None:\n",
199 | " return tree.value\n",
200 | "\n",
201 | " # Choose the feature that we will test\n",
202 | " feature_value = x[tree.feature_i]\n",
203 | "\n",
204 | " # Determine if we will follow left or right branch\n",
205 | " branch = tree.false_branch\n",
206 | " if isinstance(feature_value, int) or isinstance(feature_value, float):\n",
207 | " if feature_value >= tree.threshold:\n",
208 | " branch = tree.true_branch\n",
209 | " elif feature_value == tree.threshold:\n",
210 | " branch = tree.true_branch\n",
211 | "\n",
212 | " # Test subtree\n",
213 | " return self.predict_value(x, branch)\n",
214 | "\n",
215 | " def predict(self, X):\n",
216 | " \"\"\" Classify samples one by one and return the set of labels \"\"\"\n",
217 | " y_pred = []\n",
218 | " for x in X:\n",
219 | " y_pred.append(self.predict_value(x))\n",
220 | " return y_pred\n",
221 | "\n",
222 | " def print_tree(self, tree=None, indent=\" \"):\n",
223 | " \"\"\" Recursively print the decision tree \"\"\"\n",
224 | " if not tree:\n",
225 | " tree = self.root\n",
226 | "\n",
227 | " # If we're at leaf => print the label\n",
228 | " if tree.value is not None:\n",
229 | " print (tree.value)\n",
230 | " # Go deeper down the tree\n",
231 | " else:\n",
232 | " # Print test\n",
233 | " print (\"%s:%s? \" % (tree.feature_i, tree.threshold))\n",
234 | " # Print the true scenario\n",
235 | " print (\"%sT->\" % (indent), end=\"\")\n",
236 | " self.print_tree(tree.true_branch, indent + indent)\n",
237 | " # Print the false scenario\n",
238 | " print (\"%sF->\" % (indent), end=\"\")\n",
239 | " self.print_tree(tree.false_branch, indent + indent)\n",
240 | "\n",
241 | "\n",
242 | "\n",
243 | "class XGBoostRegressionTree(DecisionTree):\n",
244 | " \"\"\"\n",
245 | " Regression tree for XGBoost\n",
246 | " - Reference -\n",
247 | " http://xgboost.readthedocs.io/en/latest/model.html\n",
248 | " \"\"\"\n",
249 | "\n",
250 | " def _split(self, y):\n",
251 | " \"\"\" y contains y_true in left half of the middle column and \n",
252 | " y_pred in the right half. Split and return the two matrices \"\"\"\n",
253 | " col = int(np.shape(y)[1]/2)\n",
254 | " y, y_pred = y[:, :col], y[:, col:]\n",
255 | " return y, y_pred\n",
256 | "\n",
257 | " def _gain(self, y, y_pred):\n",
258 | " nominator = np.power((y * self.loss.gradient(y, y_pred)).sum(), 2)\n",
259 | " denominator = self.loss.hess(y, y_pred).sum()\n",
260 | " return 0.5 * (nominator / denominator)\n",
261 | "\n",
262 | " def _gain_by_taylor(self, y, y1, y2):\n",
263 | " # Split\n",
264 | " y, y_pred = self._split(y)\n",
265 | " y1, y1_pred = self._split(y1)\n",
266 | " y2, y2_pred = self._split(y2)\n",
267 | "\n",
268 | " true_gain = self._gain(y1, y1_pred)\n",
269 | " false_gain = self._gain(y2, y2_pred)\n",
270 | " gain = self._gain(y, y_pred)\n",
271 | " return true_gain + false_gain - gain\n",
272 | "\n",
273 | " def _approximate_update(self, y):\n",
274 | " # y split into y, y_pred\n",
275 | " y, y_pred = self._split(y)\n",
276 | " # Newton's Method\n",
277 | " gradient = np.sum(y * self.loss.gradient(y, y_pred), axis=0)\n",
278 | " hessian = np.sum(self.loss.hess(y, y_pred), axis=0)\n",
279 | " update_approximation = gradient / hessian \n",
280 | "\n",
281 | " return update_approximation\n",
282 | "\n",
283 | " def fit(self, X, y):\n",
284 | " self._impurity_calculation = self._gain_by_taylor\n",
285 | " self._leaf_value_calculation = self._approximate_update\n",
286 | " super(XGBoostRegressionTree, self).fit(X, y)"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 2,
292 | "metadata": {},
293 | "outputs": [
294 | {
295 | "name": "stderr",
296 | "output_type": "stream",
297 | "text": [
298 | "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
299 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
300 | ]
301 | }
302 | ],
303 | "source": [
304 | "class XGBoost(object):\n",
305 | " \"\"\"The XGBoost classifier.\n",
306 | " Reference: http://xgboost.readthedocs.io/en/latest/model.html\n",
307 | " Parameters:\n",
308 | " -----------\n",
309 | " n_estimators: int\n",
310 | " The number of classification trees that are used.\n",
311 | " learning_rate: float\n",
312 | " The step length that will be taken when following the negative gradient during\n",
313 | " training.\n",
314 | " min_samples_split: int\n",
315 | " The minimum number of samples needed to make a split when building a tree.\n",
316 | " min_impurity: float\n",
317 | " The minimum impurity required to split the tree further. \n",
318 | " max_depth: int\n",
319 | " The maximum depth of a tree.\n",
320 | " \"\"\"\n",
321 | " def __init__(self, n_estimators=200, learning_rate=0.001, min_samples_split=2,\n",
322 | " min_impurity=1e-7, max_depth=2):\n",
323 | " self.n_estimators = n_estimators # Number of trees\n",
324 | " self.learning_rate = learning_rate # Step size for weight update\n",
325 | " self.min_samples_split = min_samples_split # The minimum n of sampels to justify split\n",
326 | " self.min_impurity = min_impurity # Minimum variance reduction to continue\n",
327 | " self.max_depth = max_depth # Maximum depth for tree\n",
328 | "\n",
329 | " \n",
330 | " # Log loss for classification\n",
331 | " self.loss = LogisticLoss()\n",
332 | "\n",
333 | " # Initialize regression trees\n",
334 | " self.trees = []\n",
335 | " for _ in range(n_estimators):\n",
336 | " tree = XGBoostRegressionTree(\n",
337 | " min_samples_split=self.min_samples_split,\n",
338 | " min_impurity=min_impurity,\n",
339 | " max_depth=self.max_depth,\n",
340 | " loss=self.loss)\n",
341 | "\n",
342 | " self.trees.append(tree)\n",
343 | "\n",
344 | " def fit(self, X, y):\n",
345 | " y = to_categorical(y)\n",
346 | "\n",
347 | " y_pred = np.zeros(np.shape(y))\n",
348 | " for i in range(self.n_estimators):\n",
349 | " tree = self.trees[i]\n",
350 | " y_and_pred = np.concatenate((y, y_pred), axis=1)\n",
351 | " tree.fit(X, y_and_pred)\n",
352 | " update_pred = tree.predict(X)\n",
353 | "\n",
354 | " y_pred -= np.multiply(self.learning_rate, update_pred)\n",
355 | "\n",
356 | " def predict(self, X):\n",
357 | " y_pred = None\n",
358 | " # Make predictions\n",
359 | " for tree in self.trees:\n",
360 | " # Estimate gradient and update prediction\n",
361 | " update_pred = tree.predict(X)\n",
362 | " if y_pred is None:\n",
363 | " y_pred = np.zeros_like(update_pred)\n",
364 | " y_pred -= np.multiply(self.learning_rate, update_pred)\n",
365 | "\n",
366 | " # Turn into probability distribution (Softmax)\n",
367 | " y_pred = np.exp(y_pred) / np.sum(np.exp(y_pred), axis=1, keepdims=True)\n",
368 | " # Set label to the value that maximizes probability\n",
369 | " y_pred = np.argmax(y_pred, axis=1)\n",
370 | " return y_pred\n",
371 | " \n",
372 | "data = datasets.load_iris()\n",
373 | "X = data.data\n",
374 | "y = data.target\n",
375 | "\n",
376 | "from sklearn.cross_validation import train_test_split\n",
377 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 3,
383 | "metadata": {},
384 | "outputs": [
385 | {
386 | "data": {
387 | "text/plain": [
388 | "0.9333333333333333"
389 | ]
390 | },
391 | "execution_count": 3,
392 | "metadata": {},
393 | "output_type": "execute_result"
394 | }
395 | ],
396 | "source": [
397 | "clf = XGBoost()\n",
398 | "clf.fit(X_train, y_train)\n",
399 | "y_pred = clf.predict(X_test)\n",
400 | "accuracy = accuracy_score(y_test,y_pred)\n",
401 | "accuracy"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {},
408 | "outputs": [],
409 | "source": []
410 | }
411 | ],
412 | "metadata": {
413 | "kernelspec": {
414 | "display_name": "Python 3",
415 | "language": "python",
416 | "name": "python3"
417 | },
418 | "language_info": {
419 | "codemirror_mode": {
420 | "name": "ipython",
421 | "version": 3
422 | },
423 | "file_extension": ".py",
424 | "mimetype": "text/x-python",
425 | "name": "python",
426 | "nbconvert_exporter": "python",
427 | "pygments_lexer": "ipython3",
428 | "version": "3.6.4"
429 | }
430 | },
431 | "nbformat": 4,
432 | "nbformat_minor": 2
433 | }
434 |
--------------------------------------------------------------------------------
/Part6/xgboost.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | from sklearn import datasets
9 | from sklearn.metrics import accuracy_score,mean_squared_error
10 | # Sigmoid function, used for the logistic (log-odds) regression loss
11 |
12 | class Sigmoid():
13 | def __call__(self, x):
14 | return 1 / (1 + np.exp(-x))
15 |
16 | def gradient(self, x):
17 | return self.__call__(x) * (1 - self.__call__(x))
18 |
19 | class LogisticLoss():
20 | def __init__(self):
21 | sigmoid = Sigmoid()
22 | self.log_func = sigmoid
23 | self.log_grad = sigmoid.gradient
24 |
25 | def loss(self, y, y_pred):
26 |         # clip the probability, not the raw score, to avoid log(0)
27 |         p = np.clip(self.log_func(y_pred), 1e-15, 1 - 1e-15)
28 |         return -(y * np.log(p) + (1 - y) * np.log(1 - p))
29 |
30 | # gradient w.r.t y_pred
31 | def gradient(self, y, y_pred):
32 | p = self.log_func(y_pred)
33 | return -(y - p)
34 |
35 | # w.r.t y_pred
36 | def hess(self, y, y_pred):
37 | p = self.log_func(y_pred)
38 | return p * (1 - p)
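39 | # Editor's check (hedged, not from the original notebook): at a raw score of 0
40 | # the predicted probability is 0.5, so the hessian p * (1 - p) is exactly 0.25.
41 | assert LogisticLoss().hess(np.array([1.0]), np.array([0.0]))[0] == 0.25
42 | 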
39 | def to_categorical(x, n_col=None):
40 | """ One-hot encoding of nominal values """
41 | if not n_col:
42 | n_col = np.amax(x) + 1
43 | one_hot = np.zeros((x.shape[0], n_col))
44 | one_hot[np.arange(x.shape[0]), x] = 1
45 | return one_hot
46 |
47 | def divide_on_feature(X, feature_i, threshold):
48 | """ Divide dataset based on if sample value on feature index is larger than
49 | the given threshold """
50 | split_func = None
51 | if isinstance(threshold, int) or isinstance(threshold, float):
52 | split_func = lambda sample: sample[feature_i] >= threshold
53 | else:
54 | split_func = lambda sample: sample[feature_i] == threshold
55 |
56 | X_1 = np.array([sample for sample in X if split_func(sample)])
57 | X_2 = np.array([sample for sample in X if not split_func(sample)])
58 |
59 |     return X_1, X_2  # return a tuple; stacking the ragged halves into one array fails on recent NumPy
60 | class DecisionNode():
61 | """Class that represents a decision node or leaf in the decision tree
62 | Parameters:
63 | -----------
64 | feature_i: int
65 | Feature index which we want to use as the threshold measure.
66 | threshold: float
67 | The value that we will compare feature values at feature_i against to
68 | determine the prediction.
69 | value: float
70 | The class prediction if classification tree, or float value if regression tree.
71 | true_branch: DecisionNode
72 | Next decision node for samples where features value met the threshold.
73 | false_branch: DecisionNode
74 | Next decision node for samples where features value did not meet the threshold.
75 | """
76 | def __init__(self, feature_i=None, threshold=None,
77 | value=None, true_branch=None, false_branch=None):
78 | self.feature_i = feature_i # Index for the feature that is tested
79 | self.threshold = threshold # Threshold value for feature
80 | self.value = value # Value if the node is a leaf in the tree
81 | self.true_branch = true_branch # 'Left' subtree
82 | self.false_branch = false_branch # 'Right' subtree
83 |
84 |
85 | # Super class of RegressionTree and ClassificationTree
86 | class DecisionTree(object):
87 | """Super class of RegressionTree and ClassificationTree.
88 | Parameters:
89 | -----------
90 | min_samples_split: int
91 | The minimum number of samples needed to make a split when building a tree.
92 | min_impurity: float
93 | The minimum impurity required to split the tree further.
94 | max_depth: int
95 | The maximum depth of a tree.
96 | loss: function
97 | Loss function that is used for Gradient Boosting models to calculate impurity.
98 | """
99 | def __init__(self, min_samples_split=2, min_impurity=1e-7,
100 | max_depth=float("inf"), loss=None):
101 | self.root = None # Root node in dec. tree
102 | # Minimum n of samples to justify split
103 | self.min_samples_split = min_samples_split
104 | # The minimum impurity to justify split
105 | self.min_impurity = min_impurity
106 | # The maximum depth to grow the tree to
107 | self.max_depth = max_depth
108 | # Function to calculate impurity (classif.=>info gain, regr=>variance reduct.)
109 | self._impurity_calculation = None
110 | # Function to determine prediction of y at leaf
111 | self._leaf_value_calculation = None
112 | # If y is one-hot encoded (multi-dim) or not (one-dim)
113 | self.one_dim = None
114 | # If Gradient Boost
115 | self.loss = loss
116 |
117 | def fit(self, X, y, loss=None):
118 | """ Build decision tree """
119 | self.one_dim = len(np.shape(y)) == 1
120 | self.root = self._build_tree(X, y)
121 | self.loss=None
122 |
123 | def _build_tree(self, X, y, current_depth=0):
124 | """ Recursive method which builds out the decision tree and splits X and respective y
125 | on the feature of X which (based on impurity) best separates the data"""
126 |
127 | largest_impurity = 0
128 | best_criteria = None # Feature index and threshold
129 | best_sets = None # Subsets of the data
130 |
131 | # Check if expansion of y is needed
132 | if len(np.shape(y)) == 1:
133 | y = np.expand_dims(y, axis=1)
134 |
135 | # Add y as last column of X
136 | Xy = np.concatenate((X, y), axis=1)
137 |
138 | n_samples, n_features = np.shape(X)
139 |
140 | if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
141 | # Calculate the impurity for each feature
142 | for feature_i in range(n_features):
143 | # All values of feature_i
144 | feature_values = np.expand_dims(X[:, feature_i], axis=1)
145 | unique_values = np.unique(feature_values)
146 |
147 | # Iterate through all unique values of feature column i and
148 | # calculate the impurity
149 | for threshold in unique_values:
150 | # Divide X and y depending on if the feature value of X at index feature_i
151 | # meets the threshold
152 | Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)
153 |
154 | if len(Xy1) > 0 and len(Xy2) > 0:
155 | # Select the y-values of the two sets
156 | y1 = Xy1[:, n_features:]
157 | y2 = Xy2[:, n_features:]
158 |
159 | # Calculate impurity
160 | impurity = self._impurity_calculation(y, y1, y2)
161 |
162 | # If this threshold resulted in a higher information gain than previously
163 | # recorded save the threshold value and the feature
164 | # index
165 | if impurity > largest_impurity:
166 | largest_impurity = impurity
167 | best_criteria = {"feature_i": feature_i, "threshold": threshold}
168 | best_sets = {
169 | "leftX": Xy1[:, :n_features], # X of left subtree
170 | "lefty": Xy1[:, n_features:], # y of left subtree
171 | "rightX": Xy2[:, :n_features], # X of right subtree
172 | "righty": Xy2[:, n_features:] # y of right subtree
173 | }
174 |
175 | if largest_impurity > self.min_impurity:
176 | # Build subtrees for the right and left branches
177 | true_branch = self._build_tree(best_sets["leftX"], best_sets["lefty"], current_depth + 1)
178 | false_branch = self._build_tree(best_sets["rightX"], best_sets["righty"], current_depth + 1)
179 | return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[
180 | "threshold"], true_branch=true_branch, false_branch=false_branch)
181 |
182 | # We're at leaf => determine value
183 | leaf_value = self._leaf_value_calculation(y)
184 |
185 | return DecisionNode(value=leaf_value)
186 |
187 |
188 | def predict_value(self, x, tree=None):
189 | """ Do a recursive search down the tree and make a prediction of the data sample by the
190 | value of the leaf that we end up at """
191 |
192 | if tree is None:
193 | tree = self.root
194 |
195 |         # If we have a value (i.e. we're at a leaf) => return value as the prediction
196 | if tree.value is not None:
197 | return tree.value
198 |
199 | # Choose the feature that we will test
200 | feature_value = x[tree.feature_i]
201 |
202 | # Determine if we will follow left or right branch
203 | branch = tree.false_branch
204 | if isinstance(feature_value, int) or isinstance(feature_value, float):
205 | if feature_value >= tree.threshold:
206 | branch = tree.true_branch
207 | elif feature_value == tree.threshold:
208 | branch = tree.true_branch
209 |
210 | # Test subtree
211 | return self.predict_value(x, branch)
212 |
213 | def predict(self, X):
214 | """ Classify samples one by one and return the set of labels """
215 | y_pred = []
216 | for x in X:
217 | y_pred.append(self.predict_value(x))
218 | return y_pred
219 |
220 | def print_tree(self, tree=None, indent=" "):
221 | """ Recursively print the decision tree """
222 | if not tree:
223 | tree = self.root
224 |
225 | # If we're at leaf => print the label
226 | if tree.value is not None:
227 | print (tree.value)
228 | # Go deeper down the tree
229 | else:
230 | # Print test
231 | print ("%s:%s? " % (tree.feature_i, tree.threshold))
232 | # Print the true scenario
233 | print ("%sT->" % (indent), end="")
234 | self.print_tree(tree.true_branch, indent + indent)
235 | # Print the false scenario
236 | print ("%sF->" % (indent), end="")
237 | self.print_tree(tree.false_branch, indent + indent)
238 |
239 |
240 |
241 | class XGBoostRegressionTree(DecisionTree):
242 | """
243 | Regression tree for XGBoost
244 | - Reference -
245 | http://xgboost.readthedocs.io/en/latest/model.html
246 | """
247 |
248 | def _split(self, y):
249 | """ y contains y_true in left half of the middle column and
250 | y_pred in the right half. Split and return the two matrices """
251 | col = int(np.shape(y)[1]/2)
252 | y, y_pred = y[:, :col], y[:, col:]
253 | return y, y_pred
254 |
255 | def _gain(self, y, y_pred):
256 |         numerator = np.power((y * self.loss.gradient(y, y_pred)).sum(), 2)
257 | denominator = self.loss.hess(y, y_pred).sum()
258 |         return 0.5 * (numerator / denominator)
259 |
260 | def _gain_by_taylor(self, y, y1, y2):
261 | # Split
262 | y, y_pred = self._split(y)
263 | y1, y1_pred = self._split(y1)
264 | y2, y2_pred = self._split(y2)
265 |
266 | true_gain = self._gain(y1, y1_pred)
267 | false_gain = self._gain(y2, y2_pred)
268 | gain = self._gain(y, y_pred)
269 | return true_gain + false_gain - gain
270 |
271 | def _approximate_update(self, y):
272 | # y split into y, y_pred
273 | y, y_pred = self._split(y)
274 | # Newton's Method
275 | gradient = np.sum(y * self.loss.gradient(y, y_pred), axis=0)
276 | hessian = np.sum(self.loss.hess(y, y_pred), axis=0)
277 | update_approximation = gradient / hessian
278 |
279 | return update_approximation
280 |
281 | def fit(self, X, y):
282 | self._impurity_calculation = self._gain_by_taylor
283 | self._leaf_value_calculation = self._approximate_update
284 | super(XGBoostRegressionTree, self).fit(X, y)
285 |
286 |
287 | # In[2]:
288 |
289 |
290 | class XGBoost(object):
291 | """The XGBoost classifier.
292 | Reference: http://xgboost.readthedocs.io/en/latest/model.html
293 | Parameters:
294 | -----------
295 | n_estimators: int
296 | The number of classification trees that are used.
297 | learning_rate: float
298 | The step length that will be taken when following the negative gradient during
299 | training.
300 | min_samples_split: int
301 | The minimum number of samples needed to make a split when building a tree.
302 | min_impurity: float
303 | The minimum impurity required to split the tree further.
304 | max_depth: int
305 | The maximum depth of a tree.
306 | """
307 | def __init__(self, n_estimators=200, learning_rate=0.001, min_samples_split=2,
308 | min_impurity=1e-7, max_depth=2):
309 | self.n_estimators = n_estimators # Number of trees
310 | self.learning_rate = learning_rate # Step size for weight update
311 |         self.min_samples_split = min_samples_split  # The minimum number of samples needed to justify a split
312 |         self.min_impurity = min_impurity  # Minimum impurity (gain) required to split further
313 | self.max_depth = max_depth # Maximum depth for tree
314 |
315 |
316 | # Log loss for classification
317 | self.loss = LogisticLoss()
318 |
319 | # Initialize regression trees
320 | self.trees = []
321 | for _ in range(n_estimators):
322 | tree = XGBoostRegressionTree(
323 | min_samples_split=self.min_samples_split,
324 | min_impurity=min_impurity,
325 | max_depth=self.max_depth,
326 | loss=self.loss)
327 |
328 | self.trees.append(tree)
329 |
330 | def fit(self, X, y):
331 | y = to_categorical(y)
332 |
333 | y_pred = np.zeros(np.shape(y))
334 | for i in range(self.n_estimators):
335 | tree = self.trees[i]
336 | y_and_pred = np.concatenate((y, y_pred), axis=1)
337 | tree.fit(X, y_and_pred)
338 | update_pred = tree.predict(X)
339 |
340 | y_pred -= np.multiply(self.learning_rate, update_pred)
341 |
342 | def predict(self, X):
343 | y_pred = None
344 | # Make predictions
345 | for tree in self.trees:
346 | # Estimate gradient and update prediction
347 | update_pred = tree.predict(X)
348 | if y_pred is None:
349 | y_pred = np.zeros_like(update_pred)
350 | y_pred -= np.multiply(self.learning_rate, update_pred)
351 |
352 | # Turn into probability distribution (Softmax)
353 | y_pred = np.exp(y_pred) / np.sum(np.exp(y_pred), axis=1, keepdims=True)
354 | # Set label to the value that maximizes probability
355 | y_pred = np.argmax(y_pred, axis=1)
356 | return y_pred
357 |
358 | data = datasets.load_iris()
359 | X = data.data
360 | y = data.target
361 |
362 | from sklearn.model_selection import train_test_split
363 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
364 |
365 |
366 | # In[3]:
367 |
368 |
369 | clf = XGBoost()
370 | clf.fit(X_train, y_train)
371 | y_pred = clf.predict(X_test)
372 | accuracy = accuracy_score(y_test, y_pred)
373 | accuracy
374 |
375 |
--------------------------------------------------------------------------------
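Note: the `_gain`, `_gain_by_taylor`, and `_approximate_update` methods in xgboost.py above are a stripped-down version of the structure score from the XGBoost documentation cited in the docstring. As a reference sketch only (the code effectively sets the regularization terms lambda and gamma to zero, and its `_gain` additionally multiplies the gradient by y before summing), the underlying formulas are:

\text{score}(j) = \frac{G_j^2}{2\,(H_j + \lambda)}, \qquad
\text{Gain} = \frac{1}{2}\left[\frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{(G_L + G_R)^2}{H_L + H_R + \lambda}\right] - \gamma, \qquad
w_j^{*} = -\frac{G_j}{H_j + \lambda}

where G_j and H_j are the sums of the first and second derivatives of the loss over the samples in node j. Each tree is fit on the column-wise concatenation [y | y_pred], which `_split` separates again; `_approximate_update` returns G/H, and the minus sign of the Newton step w* is applied in `XGBoost.fit` via `y_pred -= learning_rate * update_pred`.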
/Part7/SVM.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "from sklearn.datasets import load_iris\n",
12 | "from sklearn.model_selection import train_test_split\n",
13 | "import matplotlib.pyplot as plt\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "data": {
24 | "text/plain": [
25 | ""
26 | ]
27 | },
28 | "execution_count": 2,
29 | "metadata": {},
30 | "output_type": "execute_result"
31 | },
32 | {
33 | "data": {
34 | "image/png": "<base64-encoded PNG of the iris scatter plot omitted>\n",
35 | "text/plain": [
36 | ""
37 | ]
38 | },
39 | "metadata": {},
40 | "output_type": "display_data"
41 | }
42 | ],
43 | "source": [
44 | "# data\n",
45 | "def create_data():\n",
46 | " iris = load_iris()\n",
47 | " df = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
48 | " df['label'] = iris.target\n",
49 | " df.columns = [\n",
50 | " 'sepal length', 'sepal width', 'petal length', 'petal width', 'label'\n",
51 | " ]\n",
52 | " data = np.array(df.iloc[:100, [0, 1, -1]])\n",
53 | " for i in range(len(data)):\n",
54 | " if data[i, -1] == 0:\n",
55 | " data[i, -1] = -1\n",
56 | " # print(data)\n",
57 | " return data[:, :2], data[:, -1]\n",
58 | "\n",
59 | "X, y = create_data()\n",
60 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)\n",
61 | "\n",
62 | "\n",
63 | "plt.scatter(X[:50,0],X[:50,1], label='0')\n",
64 | "plt.scatter(X[50:,0],X[50:,1], label='1')\n",
65 | "plt.legend()"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 3,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "\n",
75 | "class SVM:\n",
76 | " def __init__(self, max_iter=100, kernel='linear'):\n",
77 | " self.max_iter = max_iter\n",
78 | " self._kernel = kernel\n",
79 | "\n",
80 | " def init_args(self, features, labels):\n",
81 | " self.m, self.n = features.shape\n",
82 | " self.X = features\n",
83 | " self.Y = labels\n",
84 | " self.b = 0.0\n",
85 | "\n",
86 | "        # store all Ei in a list\n",
87 | " self.alpha = np.ones(self.m)\n",
88 | " self.E = [self._E(i) for i in range(self.m)]\n",
89 | "        # penalty parameter C for the slack variables\n",
90 | " self.C = 1.0\n",
91 | "\n",
92 | " def _KKT(self, i):\n",
93 | " y_g = self._g(i) * self.Y[i]\n",
94 | " if self.alpha[i] == 0:\n",
95 | " return y_g >= 1\n",
96 | " elif 0 < self.alpha[i] < self.C:\n",
97 | " return y_g == 1\n",
98 | " else:\n",
99 | " return y_g <= 1\n",
100 | "\n",
101 | "    # g(x): predicted value for input xi (X[i])\n",
102 | " def _g(self, i):\n",
103 | " r = self.b\n",
104 | " for j in range(self.m):\n",
105 | " r += self.alpha[j] * self.Y[j] * self.kernel(self.X[i], self.X[j])\n",
106 | " return r\n",
107 | "\n",
108 | "    # kernel function\n",
109 | " def kernel(self, x1, x2):\n",
110 | " if self._kernel == 'linear':\n",
111 | " return sum([x1[k] * x2[k] for k in range(self.n)])\n",
112 | " elif self._kernel == 'poly':\n",
113 | " return (sum([x1[k] * x2[k] for k in range(self.n)]) + 1)**2\n",
114 | "\n",
115 | " return 0\n",
116 | "\n",
117 | "    # E(x) is the difference between the prediction g(x) and the label y\n",
118 | " def _E(self, i):\n",
119 | " return self._g(i) - self.Y[i]\n",
120 | "\n",
121 | " def _init_alpha(self):\n",
122 | "        # the outer loop first scans all points with 0 < alpha < C and checks the KKT conditions\n",
123 | "        index_list = [i for i in range(self.m) if 0 < self.alpha[i] < self.C]\n",
124 | "        # otherwise scan the whole training set\n",
125 | "        non_satisfy_list = [i for i in range(self.m) if i not in index_list]\n",
126 | "        index_list.extend(non_satisfy_list)\n",
127 | "\n",
128 | "        for i in index_list:\n",
129 | "            if self._KKT(i):\n",
130 | "                continue\n",
131 | "\n",
132 | "            E1 = self.E[i]\n",
133 | "            # if E1 >= 0, pick j with the smallest E; otherwise the largest (maximizes |E1 - E2|)\n",
134 | "            if E1 >= 0:\n",
135 | " j = min(range(self.m), key=lambda x: self.E[x])\n",
136 | " else:\n",
137 | " j = max(range(self.m), key=lambda x: self.E[x])\n",
138 | " return i, j\n",
139 | "\n",
140 | " def _compare(self, _alpha, L, H):\n",
141 | " if _alpha > H:\n",
142 | " return H\n",
143 | " elif _alpha < L:\n",
144 | " return L\n",
145 | " else:\n",
146 | " return _alpha\n",
147 | "\n",
148 | " def fit(self, features, labels):\n",
149 | " self.init_args(features, labels)\n",
150 | "\n",
151 | " for t in range(self.max_iter):\n",
152 | " # train\n",
153 | " i1, i2 = self._init_alpha()\n",
154 | "\n",
155 | "            # clipping bounds L and H for alpha2\n",
156 | " if self.Y[i1] == self.Y[i2]:\n",
157 | " L = max(0, self.alpha[i1] + self.alpha[i2] - self.C)\n",
158 | " H = min(self.C, self.alpha[i1] + self.alpha[i2])\n",
159 | " else:\n",
160 | " L = max(0, self.alpha[i2] - self.alpha[i1])\n",
161 | " H = min(self.C, self.C + self.alpha[i2] - self.alpha[i1])\n",
162 | "\n",
163 | " E1 = self.E[i1]\n",
164 | " E2 = self.E[i2]\n",
165 | " # eta=K11+K22-2K12\n",
166 | " eta = self.kernel(self.X[i1], self.X[i1]) + self.kernel(\n",
167 | " self.X[i2],\n",
168 | " self.X[i2]) - 2 * self.kernel(self.X[i1], self.X[i2])\n",
169 | " if eta <= 0:\n",
170 | " # print('eta <= 0')\n",
171 | " continue\n",
172 | "\n",
173 | " alpha2_new_unc = self.alpha[i2] + self.Y[i2] * (\n",
174 | "                E1 - E2) / eta  # corrected here: per the book (pp. 130-131) this should be E1 - E2\n",
175 | " alpha2_new = self._compare(alpha2_new_unc, L, H)\n",
176 | "\n",
177 | " alpha1_new = self.alpha[i1] + self.Y[i1] * self.Y[i2] * (\n",
178 | " self.alpha[i2] - alpha2_new)\n",
179 | "\n",
180 | " b1_new = -E1 - self.Y[i1] * self.kernel(self.X[i1], self.X[i1]) * (\n",
181 | " alpha1_new - self.alpha[i1]) - self.Y[i2] * self.kernel(\n",
182 | " self.X[i2],\n",
183 | " self.X[i1]) * (alpha2_new - self.alpha[i2]) + self.b\n",
184 | " b2_new = -E2 - self.Y[i1] * self.kernel(self.X[i1], self.X[i2]) * (\n",
185 | " alpha1_new - self.alpha[i1]) - self.Y[i2] * self.kernel(\n",
186 | " self.X[i2],\n",
187 | " self.X[i2]) * (alpha2_new - self.alpha[i2]) + self.b\n",
188 | "\n",
189 | " if 0 < alpha1_new < self.C:\n",
190 | " b_new = b1_new\n",
191 | " elif 0 < alpha2_new < self.C:\n",
192 | " b_new = b2_new\n",
193 | " else:\n",
194 | "                # choose the midpoint\n",
195 | " b_new = (b1_new + b2_new) / 2\n",
196 | "\n",
197 | "            # update the parameters\n",
198 | " self.alpha[i1] = alpha1_new\n",
199 | " self.alpha[i2] = alpha2_new\n",
200 | " self.b = b_new\n",
201 | "\n",
202 | " self.E[i1] = self._E(i1)\n",
203 | " self.E[i2] = self._E(i2)\n",
204 | " return 'train done!'\n",
205 | "\n",
206 | " def predict(self, data):\n",
207 | " r = self.b\n",
208 | " for i in range(self.m):\n",
209 | " r += self.alpha[i] * self.Y[i] * self.kernel(data, self.X[i])\n",
210 | "\n",
211 | " return 1 if r > 0 else -1\n",
212 | "\n",
213 | " def score(self, X_test, y_test):\n",
214 | " right_count = 0\n",
215 | " for i in range(len(X_test)):\n",
216 | " result = self.predict(X_test[i])\n",
217 | " if result == y_test[i]:\n",
218 | " right_count += 1\n",
219 | " return right_count / len(X_test)\n",
220 | "\n",
221 | " def _weight(self):\n",
222 | " # linear model\n",
223 | " yx = self.Y.reshape(-1, 1) * self.X\n",
224 | " self.w = np.dot(yx.T, self.alpha)\n",
225 | " return self.w"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 7,
231 | "metadata": {},
232 | "outputs": [
233 | {
234 | "data": {
235 | "text/plain": [
236 | "0.6"
237 | ]
238 | },
239 | "execution_count": 7,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | }
243 | ],
244 | "source": [
245 | "svm = SVM(max_iter=200)\n",
246 | "svm.fit(X_train, y_train)\n",
247 | "svm.score(X_test, y_test)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 8,
253 | "metadata": {},
254 | "outputs": [
255 | {
256 | "data": {
257 | "text/plain": [
258 | "1.0"
259 | ]
260 | },
261 | "execution_count": 8,
262 | "metadata": {},
263 | "output_type": "execute_result"
264 | }
265 | ],
266 | "source": [
267 | "# sklearn example\n",
268 | "from sklearn.svm import SVC\n",
269 | "clf = SVC()\n",
270 | "clf.fit(X_train, y_train)\n",
271 | "clf.score(X_test, y_test)"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": []
280 | }
281 | ],
282 | "metadata": {
283 | "kernelspec": {
284 | "display_name": "Python 3",
285 | "language": "python",
286 | "name": "python3"
287 | },
288 | "language_info": {
289 | "codemirror_mode": {
290 | "name": "ipython",
291 | "version": 3
292 | },
293 | "file_extension": ".py",
294 | "mimetype": "text/x-python",
295 | "name": "python",
296 | "nbconvert_exporter": "python",
297 | "pygments_lexer": "ipython3",
298 | "version": "3.6.4"
299 | }
300 | },
301 | "nbformat": 4,
302 | "nbformat_minor": 2
303 | }
304 |
--------------------------------------------------------------------------------
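For reference alongside the `fit` method above (and repeated in SVM.py below), the two-variable subproblem it solves is the standard SMO update from chapter 7 of Li Hang's Statistical Learning Methods. With kernel values K_{11}, K_{22}, K_{12} and errors E_1, E_2:

\eta = K_{11} + K_{22} - 2K_{12}, \qquad
\alpha_2^{\text{new,unc}} = \alpha_2^{\text{old}} + \frac{y_2\,(E_1 - E_2)}{\eta}

\alpha_2^{\text{new}} = \min\bigl(\max(\alpha_2^{\text{new,unc}},\, L),\, H\bigr), \qquad
\alpha_1^{\text{new}} = \alpha_1^{\text{old}} + y_1 y_2\,(\alpha_2^{\text{old}} - \alpha_2^{\text{new}})

The clipping interval [L, H] depends on whether y_1 = y_2, exactly as computed in the code, and b is then refreshed from whichever new alpha lies strictly inside (0, C), falling back to the midpoint of b1_new and b2_new otherwise.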
/Part7/SVM.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from sklearn.datasets import load_iris
10 | from sklearn.model_selection import train_test_split
11 | import matplotlib.pyplot as plt
12 | get_ipython().run_line_magic('matplotlib', 'inline')
13 |
14 |
15 | # In[2]:
16 |
17 |
18 | # data
19 | def create_data():
20 | iris = load_iris()
21 | df = pd.DataFrame(iris.data, columns=iris.feature_names)
22 | df['label'] = iris.target
23 | df.columns = [
24 | 'sepal length', 'sepal width', 'petal length', 'petal width', 'label'
25 | ]
26 | data = np.array(df.iloc[:100, [0, 1, -1]])
27 | for i in range(len(data)):
28 | if data[i, -1] == 0:
29 | data[i, -1] = -1
30 | # print(data)
31 | return data[:, :2], data[:, -1]
32 |
33 | X, y = create_data()
34 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
35 |
36 |
37 | plt.scatter(X[:50,0],X[:50,1], label='0')
38 | plt.scatter(X[50:,0],X[50:,1], label='1')
39 | plt.legend()
40 |
41 |
42 | # In[3]:
43 |
44 |
45 |
46 | class SVM:
47 | def __init__(self, max_iter=100, kernel='linear'):
48 | self.max_iter = max_iter
49 | self._kernel = kernel
50 |
51 | def init_args(self, features, labels):
52 | self.m, self.n = features.shape
53 | self.X = features
54 | self.Y = labels
55 | self.b = 0.0
56 |
57 |         # store all Ei in a list
58 | self.alpha = np.ones(self.m)
59 | self.E = [self._E(i) for i in range(self.m)]
60 |         # penalty parameter C for the slack variables
61 | self.C = 1.0
62 |
63 | def _KKT(self, i):
64 | y_g = self._g(i) * self.Y[i]
65 | if self.alpha[i] == 0:
66 | return y_g >= 1
67 | elif 0 < self.alpha[i] < self.C:
68 | return y_g == 1
69 | else:
70 | return y_g <= 1
71 |
72 |     # g(x): predicted value for input xi (X[i])
73 | def _g(self, i):
74 | r = self.b
75 | for j in range(self.m):
76 | r += self.alpha[j] * self.Y[j] * self.kernel(self.X[i], self.X[j])
77 | return r
78 |
79 |     # kernel function
80 | def kernel(self, x1, x2):
81 | if self._kernel == 'linear':
82 | return sum([x1[k] * x2[k] for k in range(self.n)])
83 | elif self._kernel == 'poly':
84 | return (sum([x1[k] * x2[k] for k in range(self.n)]) + 1)**2
85 |
86 | return 0
87 |
88 |     # E(x) is the difference between the prediction g(x) and the label y
89 | def _E(self, i):
90 | return self._g(i) - self.Y[i]
91 |
92 | def _init_alpha(self):
93 |         # the outer loop first scans all points with 0 < alpha < C and checks the KKT conditions
94 |         index_list = [i for i in range(self.m) if 0 < self.alpha[i] < self.C]
95 |         # otherwise scan the whole training set
96 |         non_satisfy_list = [i for i in range(self.m) if i not in index_list]
97 |         index_list.extend(non_satisfy_list)
98 |
99 |         for i in index_list:
100 |             if self._KKT(i):
101 |                 continue
102 |
103 |             E1 = self.E[i]
104 |             # if E1 >= 0, pick j with the smallest E; otherwise the largest (maximizes |E1 - E2|)
105 |             if E1 >= 0:
106 | j = min(range(self.m), key=lambda x: self.E[x])
107 | else:
108 | j = max(range(self.m), key=lambda x: self.E[x])
109 | return i, j
110 |
111 | def _compare(self, _alpha, L, H):
112 | if _alpha > H:
113 | return H
114 | elif _alpha < L:
115 | return L
116 | else:
117 | return _alpha
118 |
119 | def fit(self, features, labels):
120 | self.init_args(features, labels)
121 |
122 | for t in range(self.max_iter):
123 | # train
124 | i1, i2 = self._init_alpha()
125 |
126 |             # clipping bounds L and H for alpha2
127 | if self.Y[i1] == self.Y[i2]:
128 | L = max(0, self.alpha[i1] + self.alpha[i2] - self.C)
129 | H = min(self.C, self.alpha[i1] + self.alpha[i2])
130 | else:
131 | L = max(0, self.alpha[i2] - self.alpha[i1])
132 | H = min(self.C, self.C + self.alpha[i2] - self.alpha[i1])
133 |
134 | E1 = self.E[i1]
135 | E2 = self.E[i2]
136 | # eta=K11+K22-2K12
137 | eta = self.kernel(self.X[i1], self.X[i1]) + self.kernel(
138 | self.X[i2],
139 | self.X[i2]) - 2 * self.kernel(self.X[i1], self.X[i2])
140 | if eta <= 0:
141 | # print('eta <= 0')
142 | continue
143 |
144 | alpha2_new_unc = self.alpha[i2] + self.Y[i2] * (
145 |                 E1 - E2) / eta  # corrected here: per the book (pp. 130-131) this should be E1 - E2
146 | alpha2_new = self._compare(alpha2_new_unc, L, H)
147 |
148 | alpha1_new = self.alpha[i1] + self.Y[i1] * self.Y[i2] * (
149 | self.alpha[i2] - alpha2_new)
150 |
151 | b1_new = -E1 - self.Y[i1] * self.kernel(self.X[i1], self.X[i1]) * (
152 | alpha1_new - self.alpha[i1]) - self.Y[i2] * self.kernel(
153 | self.X[i2],
154 | self.X[i1]) * (alpha2_new - self.alpha[i2]) + self.b
155 | b2_new = -E2 - self.Y[i1] * self.kernel(self.X[i1], self.X[i2]) * (
156 | alpha1_new - self.alpha[i1]) - self.Y[i2] * self.kernel(
157 | self.X[i2],
158 | self.X[i2]) * (alpha2_new - self.alpha[i2]) + self.b
159 |
160 | if 0 < alpha1_new < self.C:
161 | b_new = b1_new
162 | elif 0 < alpha2_new < self.C:
163 | b_new = b2_new
164 | else:
165 |                 # choose the midpoint
166 | b_new = (b1_new + b2_new) / 2
167 |
168 |             # update the parameters
169 | self.alpha[i1] = alpha1_new
170 | self.alpha[i2] = alpha2_new
171 | self.b = b_new
172 |
173 | self.E[i1] = self._E(i1)
174 | self.E[i2] = self._E(i2)
175 | return 'train done!'
176 |
177 | def predict(self, data):
178 | r = self.b
179 | for i in range(self.m):
180 | r += self.alpha[i] * self.Y[i] * self.kernel(data, self.X[i])
181 |
182 | return 1 if r > 0 else -1
183 |
184 | def score(self, X_test, y_test):
185 | right_count = 0
186 | for i in range(len(X_test)):
187 | result = self.predict(X_test[i])
188 | if result == y_test[i]:
189 | right_count += 1
190 | return right_count / len(X_test)
191 |
192 | def _weight(self):
193 | # linear model
194 | yx = self.Y.reshape(-1, 1) * self.X
195 | self.w = np.dot(yx.T, self.alpha)
196 | return self.w
197 |
198 |
199 | # In[7]:
200 |
201 |
202 | svm = SVM(max_iter=200)
203 | svm.fit(X_train, y_train)
204 | svm.score(X_test, y_test)
205 |
206 |
207 | # In[8]:
208 |
209 |
210 | # sklearn example
211 | from sklearn.svm import SVC
212 | clf = SVC()
213 | clf.fit(X_train, y_train)
214 | clf.score(X_test, y_test)
215 |
216 |
--------------------------------------------------------------------------------
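Since the default kernel is linear, the `_weight()` method above can recover the primal weight vector from the trained dual variables. Below is a minimal visualization sketch, assuming the `SVM` class and the `X`, `X_train`, `y_train` variables defined in SVM.py (the 4-to-8 axis range is an illustrative choice for the iris sepal-length feature, not part of the original code):

import numpy as np
import matplotlib.pyplot as plt

svm = SVM(max_iter=200)            # linear kernel by default
svm.fit(X_train, y_train)

w = svm._weight()                  # w = sum_i alpha_i * y_i * x_i, shape (2,)
x0 = np.linspace(4, 8, 50)         # sepal-length range (illustrative)
x1 = -(w[0] * x0 + svm.b) / w[1]   # solve w0*x0 + w1*x1 + b = 0 for x1

plt.scatter(X[:50, 0], X[:50, 1], label='-1')   # class 0, relabeled to -1
plt.scatter(X[50:, 0], X[50:, 1], label='+1')
plt.plot(x0, x1, 'r-', label='decision boundary')
plt.legend()
plt.show()

Because `init_args` starts every alpha at 1 and each SMO iteration updates only two of them, the recovered boundary is rough after 200 iterations; the sketch shows what the dual solution encodes rather than producing a polished classifier.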
/README.md:
--------------------------------------------------------------------------------
1 | # ML-Python-
2 | Notes and accompanying code from working through Li Hang's "Statistical Learning Methods" and Zhou Zhihua's "Machine Learning" (the watermelon book).
3 |
4 | Part1 notes: https://blog.csdn.net/qq_28821995/article/details/95457661
5 |
6 | Part2 notes: https://blog.csdn.net/qq_28821995/article/details/96144693
7 |
8 | Part3 notes: https://blog.csdn.net/qq_28821995/article/details/96838812
9 |
10 | Part4 notes: https://blog.csdn.net/qq_28821995/article/details/100170020
11 |
12 | Part5 notes: https://blog.csdn.net/qq_28821995/article/details/100972513
13 |
14 | Part6 notes: https://blog.csdn.net/qq_28821995/article/details/101996594
15 |
16 | Part7 notes: https://blog.csdn.net/qq_28821995/article/details/102639452
--------------------------------------------------------------------------------