├── .idea
│   ├── ML note.iml
│   ├── dictionaries
│   │   └── Vam.xml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── 1.liner model
│   ├── 1.basic liner model.py
│   ├── 2.ridge analysis.py
│   ├── 3.Lasso regression.py
│   ├── 4. ElasticNet.py
│   ├── 5.logistic regression.py
│   ├── 6.LDA.py
│   └── README.md
├── 10.Ensemble
│   ├── 10.1 Adaboost classifer.py
│   ├── 10.2 adaboost regression.py
│   ├── 10.3 RF_classifier.py
│   ├── 10.4 RF_regression.py
│   ├── 10.5 Gradient_Classifier.py
│   ├── 10.6 Gradient_regresion.py
│   └── README.md
├── 11. Preprocessing
│   ├── 11.1 Binarize.py
│   ├── 11.2 One-hot encoder.py
│   ├── 11.3 normalize.py
│   ├── 11.4 standardize.py
│   ├── 11.5 feature_seleticon_filter.py
│   ├── 11.6 feature_selection_bagging.py
│   ├── 11.7 feature_selection_embeded.py
│   ├── 11.8 pipeline.py
│   ├── 11.9 dictionary learning.py
│   └── README.md
├── 12. Model evaluation
│   ├── 12.1 Loss function.py
│   ├── 12.2 data split.py
│   ├── 12.3 validation_curve.py
│   ├── 12.4 grid_search.py
│   ├── 12.5 classification_metrics.py
│   ├── 12.6 learning curve.py
│   ├── 12.7 regression_metrics.py
│   └── README.md
├── 2.decision tree(DT)
│   ├── 2.1 Decision Tree-Classifier.py
│   ├── 2.2 Decision Tree- Regression.py
│   └── README.md
├── 3.Bayes
│   ├── 3.1 Gaussian Bayes.py
│   ├── 3.2 Multinomial NB.py
│   ├── 3.3 Bernoulli NB.py
│   └── README.md
├── 4. KNN
│   ├── 4.1 KNN classification.py
│   ├── 4.2 KNN regressor.py
│   └── README.md
├── 5.Dimension_Reduction
│   ├── 5.1 PCA.py
│   ├── 5.2 KPCA.py
│   ├── 5.3 MDS.py
│   ├── 5.4 Isomap.py
│   ├── 5.5 LLE.py
│   └── README.md
├── 6. Clustering
│   ├── 6.1 Kmeans.py
│   ├── 6.2 DBSCAN.py
│   ├── 6.3 Agglomerative Clustering.py
│   ├── 6.4 GaussianMixture.py
│   └── README.md
├── 7. Support Vector Machine
│   ├── 7.1 SVM-liner_SVC.py
│   ├── 7.2 SVM-unliner_SVC.py
│   ├── 7.3 liner_SVR.py
│   ├── 7.4 unliner_SVR.py
│   └── README.md
├── 8. Artificial Neural Network
│   ├── README.md
│   └── test.py
├── 9. Semi-Supervised Learning
│   ├── 9.1 labelpropogation.py
│   ├── 9.2LabelSpreading.py
│   └── README.md
└── README.md
/1.liner model/1.basic liner model.py:
--------------------------------------------------------------------------------
1 | # import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, linear_model, cross_validation
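# NOTE: this code targets an older scikit-learn; in versions >= 0.20 the
# cross_validation module was removed and train_test_split lives in
# sklearn.model_selection instead.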
4 |
5 |
6 | '''
7 | load_data: load the diabetes data set from sklearn
8 | return:
9 | four arrays for the regression problem:
10 | train_data, test_data, train_value, test_value
11 | '''
12 |
13 | def load_data():
14 |
15 | diabetes = datasets.load_diabetes()
16 | return cross_validation.train_test_split(diabetes.data,diabetes.target,
17 | test_size=0.25,random_state=0)
18 |
19 |
20 | '''
21 | test_LinearRegression: train and evaluate an ordinary linear regression model
22 | :param data: train_data, test_data, train_value, test_value
23 | :return: None
24 | '''
25 | def test_LinearRegression(*data):
26 |
27 | X_train,X_test,y_train,y_test=data
28 | regr = linear_model.LinearRegression()
29 | regr.fit(X_train, y_train)
30 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_))
31 | print("Residual sum of squares: {0}".format(np.mean((regr.predict(X_test) - y_test) ** 2)))
32 | print('Score: {0}'.format(regr.score(X_test, y_test)))
33 | # the main function
34 | if __name__=='__main__':
35 | X_train,X_test,y_train,y_test=load_data()
36 | test_LinearRegression(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/1.liner model/2.ridge analysis.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, linear_model,cross_validation
4 |
5 |
6 | def load_data():
7 | '''
8 | load for the dataset
9 | return:
10 | four arrays for the regression problem:
11 | train_data, test_data, train_value, test_value
12 | '''
13 |
14 | diabetes = datasets.load_diabetes()
15 | return cross_validation.train_test_split(diabetes.data,diabetes.target,
16 | test_size=0.25,random_state=0)
17 |
18 | def test_Ridge(*data):
19 | '''
20 | test the ridge analysis
21 | :param data: train_data, test_data, train_value, test_value
22 | :return: None
23 | '''
24 |
25 | X_train,X_test,y_train,y_test=data
26 | regr = linear_model.Ridge()
27 | regr.fit(X_train, y_train)
28 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_))
29 | print("Residual sum of squares: {0}".format(np.mean((regr.predict(X_test) - y_test) ** 2)))
30 | print('Score: {0}' .format(regr.score(X_test, y_test)))
31 | def test_Ridge_alpha(*data):
32 | '''
33 | test the score with different alpha param
34 | :param data: train_data, test_data, train_value, test_value
35 | :return: None
36 | '''
37 |
38 | X_train,X_test,y_train,y_test=data
39 | alphas=[0.01,0.02,0.05,0.1,0.2,0.5,1,2,5,10,20,50,100,200,500,1000]
40 | '''
41 | in this experiment a smaller alpha tends to give a better test score, but the regularization strength still has to be traded off against overfitting.
42 | '''
43 | scores=[]
44 | for i,alpha in enumerate(alphas):
45 | regr = linear_model.Ridge(alpha=alpha)
46 | regr.fit(X_train, y_train)
47 | scores.append(regr.score(X_test, y_test))
48 | ## graph
49 | fig=plt.figure()
50 | ax=fig.add_subplot(1,1,1)
51 | ax.plot(alphas,scores)
52 | ax.set_xlabel(r"$\alpha$")
53 | ax.set_ylabel(r"score")
54 | ax.set_xscale('log')
55 | ax.set_title("Ridge")
56 | plt.show()
57 | if __name__=='__main__':
58 | X_train,X_test,y_train,y_test=load_data()
59 | test_Ridge(X_train,X_test,y_train,y_test)
60 | test_Ridge_alpha(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/1.liner model/3.Lasso regression.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, linear_model,cross_validation
4 | def load_data():
5 | '''
6 | load for the dataset
7 | return:
8 | four arrays for the regression problem:
9 | train_data, test_data, train_value, test_value
10 | '''
11 | diabetes = datasets.load_diabetes()
12 | return cross_validation.train_test_split(diabetes.data,diabetes.target,
13 | test_size=0.25,random_state=0)
14 | def test_Lasso(*data):
15 | '''
16 | test for lasso
17 | :param data: train_data, test_data, train_value, test_value
18 | :return: None
19 | '''
20 |
21 | X_train,X_test,y_train,y_test=data
22 | regr = linear_model.Lasso()
23 | regr.fit(X_train, y_train)
24 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_, regr.intercept_))
25 | print("Residual sum of squares: {0}".format(np.mean((regr.predict(X_test) - y_test) ** 2)))
26 | print('Score: {0}'.format(regr.score(X_test, y_test)))
27 | def test_Lasso_alpha(*data):
28 | '''
29 | test the score with different alpha
30 | :param data: train_data, test_data, train_value, test_value
31 | :return: None
32 | '''
33 |
34 | X_train,X_test,y_train,y_test=data
35 | alphas=[0.01,0.02,0.05,0.1,0.2,0.5,1,2,5,10,20,50,100,200,500,1000]
36 | scores=[]
37 | for i,alpha in enumerate(alphas):
38 | regr = linear_model.Lasso(alpha=alpha)
39 | regr.fit(X_train, y_train)
40 | scores.append(regr.score(X_test, y_test))
41 | ## graph
42 | fig=plt.figure()
43 | ax=fig.add_subplot(1,1,1)
44 | ax.plot(alphas,scores)
45 | ax.set_xlabel(r"$\alpha$")
46 | ax.set_ylabel(r"score")
47 | ax.set_xscale('log')
48 | ax.set_title("Lasso")
49 | plt.show()
50 | if __name__=='__main__':
51 | X_train,X_test,y_train,y_test=load_data()
52 | test_Lasso(X_train,X_test,y_train,y_test)
53 | test_Lasso_alpha(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/1.liner model/4. ElasticNet.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | from sklearn import datasets, linear_model,cross_validation
5 |
6 | def load_data():
7 | '''
8 | load for the dataset
9 | return:
10 | four arrays for the regression problem:
11 | train_data, test_data, train_value, test_value
12 | '''
13 | diabetes = datasets.load_diabetes()
14 | return cross_validation.train_test_split(diabetes.data,diabetes.target,
15 | test_size=0.25,random_state=0)
16 |
17 | def test_ElasticNet(*data):
18 | '''
19 | test for Elastic Net
20 | :param data: train_data, test_data, train_value, test_value
21 | :return: None
22 | '''
23 | X_train,X_test,y_train,y_test=data
24 | regr = linear_model.ElasticNet()
25 | regr.fit(X_train, y_train)
26 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_, regr.intercept_))
27 | print("Residual sum of squares: {0}".format(np.mean((regr.predict(X_test) - y_test) ** 2)))
28 | print('Score: {0}'.format(regr.score(X_test, y_test)))
29 | def test_ElasticNet_alpha_rho(*data):
30 | '''
31 | test score with different alpha and l1_ratio
32 | :param data: train_data, test_data, train_value, test_value
33 | :return: None
34 | '''
35 | X_train,X_test,y_train,y_test=data
36 | alphas=np.logspace(-2,2)
37 | rhos=np.linspace(0.01,1)
38 | scores=[]
39 | for alpha in alphas:
40 | for rho in rhos:
41 | regr = linear_model.ElasticNet(alpha=alpha,l1_ratio=rho)
42 | regr.fit(X_train, y_train)
43 | scores.append(regr.score(X_test, y_test))
44 | ## graph
45 | alphas, rhos = np.meshgrid(alphas, rhos)
46 | scores=np.array(scores).reshape(alphas.shape)
47 | from mpl_toolkits.mplot3d import Axes3D # this part works well in py3
48 | from matplotlib import cm
49 | fig=plt.figure()
50 | ax=Axes3D(fig)
51 | surf = ax.plot_surface(alphas, rhos, scores, rstride=1, cstride=1, cmap=cm.jet,
52 | linewidth=0, antialiased=False)
53 | fig.colorbar(surf, shrink=0.5, aspect=5)
54 | ax.set_xlabel(r"$\alpha$")
55 | ax.set_ylabel(r"$\rho$")
56 | ax.set_zlabel("score")
57 | ax.set_title("ElasticNet")
58 | plt.show()
59 | if __name__=='__main__':
60 | X_train,X_test,y_train,y_test=load_data()
61 | test_ElasticNet(X_train,X_test,y_train,y_test)
62 | test_ElasticNet_alpha_rho(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/1.liner model/5.logistic regression.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | from sklearn import datasets, linear_model,cross_validation
6 |
7 | def load_data():
8 | '''
9 | load for the dataset
10 | return:
11 | four arrays for the classification problem:
12 | train_data, test_data, train_value, test_value
13 | '''
14 | iris=datasets.load_iris() # use the iris data set: it has 3 classes and 150 samples
15 | X_train=iris.data
16 | y_train=iris.target
17 | return cross_validation.train_test_split(X_train, y_train,test_size=0.25,
18 | random_state=0,stratify=y_train)
19 | def test_LogisticRegression(*data):
20 | '''
21 | test of LR
22 | :param data: train_data, test_data, train_value, test_value
23 | :return: None
24 | '''
25 | X_train,X_test,y_train,y_test=data
26 | regr = linear_model.LogisticRegression()
27 | regr.fit(X_train, y_train)
28 | print('Coefficients: {0}, intercept {1}'.format(regr.coef_,regr.intercept_))
29 | print('Score: {0}' .format(regr.score(X_test, y_test)))
30 | def test_LogisticRegression_multinomial(*data):
31 | '''
32 | Test with different multi_class
33 | :param data: train_data, test_data, train_value, test_value
34 | :return: None
35 | '''
36 | X_train,X_test,y_train,y_test=data
37 | regr = linear_model.LogisticRegression(multi_class='multinomial',solver='lbfgs')
38 | regr.fit(X_train, y_train)
39 | print('Coefficients: {0}, intercept {1}'.format(regr.coef_,regr.intercept_))
40 | print('Score: {0}' .format(regr.score(X_test, y_test)))
41 | def test_LogisticRegression_C(*data):
42 | '''
43 | test score with different C
44 | :param data: train_data, test_data, train_value, test_value
45 | :return: None
46 | '''
47 | X_train,X_test,y_train,y_test=data
48 | Cs=np.logspace(-2,4,num=100)
49 | scores=[]
50 | for C in Cs:
51 | regr = linear_model.LogisticRegression(C=C)
52 | regr.fit(X_train, y_train)
53 | scores.append(regr.score(X_test, y_test))
54 | ## graph
55 | fig=plt.figure()
56 | ax=fig.add_subplot(1,1,1)
57 | ax.plot(Cs,scores)
58 | ax.set_xlabel(r"C")
59 | ax.set_ylabel(r"score")
60 | ax.set_xscale('log')
61 | ax.set_title("LogisticRegression")
62 | plt.show()
63 |
64 | if __name__=='__main__':
65 | X_train,X_test,y_train,y_test=load_data()
66 | test_LogisticRegression(X_train,X_test,y_train,y_test)
67 | test_LogisticRegression_multinomial(X_train,X_test,y_train,y_test)
68 | test_LogisticRegression_C(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/1.liner model/6.LDA.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | from sklearn import datasets, discriminant_analysis,cross_validation
6 |
7 | def load_data():
8 | '''
9 | load for the dataset
10 | return:
11 | four arrays for the classification problem:
12 | train_data, test_data, train_value, test_value
13 | '''
14 | iris=datasets.load_iris()
15 | X_train=iris.data
16 | y_train=iris.target
17 | return cross_validation.train_test_split(X_train, y_train,test_size=0.25,
18 | random_state=0,stratify=y_train)
19 | def test_LinearDiscriminantAnalysis(*data):
20 | '''
21 | test of LDA
22 | :param data: train_data, test_data, train_value, test_value
23 | :return: None
24 | '''
25 | X_train,X_test,y_train,y_test=data
26 | lda = discriminant_analysis.LinearDiscriminantAnalysis()
27 | lda.fit(X_train, y_train)
28 | print('Coefficients: {0}, intercept {1}'.format(lda.coef_,lda.intercept_))
29 | print('Score: {0}' .format( lda.score(X_test, y_test)))
30 | def plot_LDA(converted_X,y):
31 | '''
32 | plot the data after the LDA projection
33 | :param converted_X: training data after the projection
34 | :param y: train_value
35 | :return: None
36 | '''
37 | from mpl_toolkits.mplot3d import Axes3D
38 | fig=plt.figure()
39 | ax=Axes3D(fig)
40 | colors='rgb'
41 | markers='o*s'
42 | for target,color,marker in zip([0,1,2],colors,markers):
43 | pos=(y==target).ravel()
44 | X=converted_X[pos,:]
45 | ax.scatter(X[:,0], X[:,1], X[:,2],color=color,marker=marker,
46 | label="Label {0}".format(target))
47 | ax.legend(loc="best")
48 | fig.suptitle("Iris After LDA")
49 | plt.show()
50 | def run_plot_LDA():
51 | '''
52 | run LDA
53 | :return: None
54 | '''
55 | X_train,X_test,y_train,y_test=load_data()
56 | X=np.vstack((X_train,X_test))
57 | Y=np.vstack((y_train.reshape(y_train.size,1),y_test.reshape(y_test.size,1)))
58 | lda = discriminant_analysis.LinearDiscriminantAnalysis()
59 | lda.fit(X, Y)
60 | converted_X=np.dot(X,np.transpose(lda.coef_))+lda.intercept_
61 | plot_LDA(converted_X,Y)
62 | def test_LinearDiscriminantAnalysis_solver(*data):
63 | '''
64 | test score with different solver
65 | :param data: train_data, test_data, train_value, test_value
66 | :return: None
67 | '''
68 | X_train,X_test,y_train,y_test=data
69 | solvers=['svd','lsqr','eigen']
70 | for solver in solvers:
71 | if(solver=='svd'):
72 | lda = discriminant_analysis.LinearDiscriminantAnalysis(solver=solver)
73 | else:
74 | lda = discriminant_analysis.LinearDiscriminantAnalysis(solver=solver,
75 | shrinkage=None)
76 | lda.fit(X_train, y_train)
77 | print('Score at solver={0}: {1}'.format(solver, lda.score(X_test, y_test)))
78 | def test_LinearDiscriminantAnalysis_shrinkage(*data):
79 | '''
80 | test score with different shrinkage
81 | :param data: train_data, test_data, train_value, test_value
82 | :return: None
83 | '''
84 | X_train,X_test,y_train,y_test=data
85 | shrinkages=np.linspace(0.0,1.0,num=20)
86 | scores=[]
87 | for shrinkage in shrinkages:
88 | lda = discriminant_analysis.LinearDiscriminantAnalysis(solver='lsqr',
89 | shrinkage=shrinkage)
90 | lda.fit(X_train, y_train)
91 | scores.append(lda.score(X_test, y_test))
92 | ## graph
93 | fig=plt.figure()
94 | ax=fig.add_subplot(1,1,1)
95 | ax.plot(shrinkages,scores)
96 | ax.set_xlabel(r"shrinkage")
97 | ax.set_ylabel(r"score")
98 | ax.set_ylim(0,1.05)
99 | ax.set_title("LinearDiscriminantAnalysis")
100 | plt.show()
101 |
102 | if __name__=='__main__':
103 | X_train,X_test,y_train,y_test=load_data()
104 | test_LinearDiscriminantAnalysis(X_train,X_test,y_train,y_test)
105 | run_plot_LDA()
106 | test_LinearDiscriminantAnalysis_solver(X_train,X_test,y_train,y_test)
107 | test_LinearDiscriminantAnalysis_shrinkage(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/1.liner model/README.md:
--------------------------------------------------------------------------------
1 |
2 | # General form of a linear model
3 |
4 | f(x) = W*X + b
5 |
6 | Here W = (w1, w2, w3, …, wn)^T is called the weight vector. The weight vector directly reflects how important each feature is in the prediction. A generalized linear model can be understood as an N-dimensional linear model.
7 |
8 | Linear regression is a form of supervised learning. When the output is continuous it can be treated as a regression problem; when the output is discrete it can be treated as a classification problem.
9 |
10 | # Common related models
11 |
12 | Ridge regression, Lasso regression, Elastic Net, logistic regression, linear discriminant analysis (LDA), and so on.
13 |
14 | # The loss function of a linear model is usually the squared loss
15 |
16 | the sum of squared differences between predicted and true values; the goal is to minimize this loss.
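For illustration, a small NumPy sketch of this squared loss (a toy example, not part of the repo code):

```python
import numpy as np

w = np.array([0.5, -1.0, 2.0])    # toy weight vector
b = 0.1                           # toy intercept
X = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])   # two toy samples
y = np.array([4.0, 7.5])          # toy true values

y_pred = X.dot(w) + b             # f(x) = W*X + b
loss = np.sum((y_pred - y) ** 2)  # sum of squared errors, to be minimized
print(loss)
```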
17 |
18 | # Feature scaling
19 |
20 | Rescaling the features makes the feature space better conditioned for training. Benefits: 1) faster model convergence; 2) better model accuracy.
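A minimal sketch with scikit-learn's StandardScaler (toy data; see 11.4 standardize.py for the repo's full example):

```python
from sklearn.preprocessing import StandardScaler

X = [[1.0, 200.0],
     [2.0, 300.0],
     [3.0, 400.0]]                  # toy features on very different scales
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # zero mean and unit variance per column
print(X_scaled)
```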
21 |
22 | # Regularization
23 |
24 | There are two main remedies for overfitting caused by having too many variables. One is dimensionality reduction, which also avoids the curse of dimensionality. The other is regularization, which keeps all the variables but shrinks their magnitudes to improve model performance.
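For reference, the standard regularized least-squares objectives behind the Ridge and Lasso examples below (the data-fit term may be scaled differently depending on the library):

$$\min_{w}\; \|Xw-y\|_2^2 + \alpha\|w\|_2^2 \quad \text{(Ridge, L2 penalty)} \qquad\qquad \min_{w}\; \|Xw-y\|_2^2 + \alpha\|w\|_1 \quad \text{(Lasso, L1 penalty)}$$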
25 |
26 | # Logistic regression
27 |
28 | Compared with plain linear regression, logistic regression adds an activation function on top of the linear model.
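Concretely, that activation is the sigmoid, which maps the linear score to a probability:

$$P(y=1\mid x) = \sigma(w^\top x + b), \qquad \sigma(z) = \frac{1}{1+e^{-z}}$$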
29 |
30 | # Linear discriminant analysis (LDA)
31 |
32 | The idea of LDA: during training, project the training samples onto a line so that points of the same class end up as close together as possible and points of different classes end up as far apart as possible. At prediction time, the class is decided from where the sample falls on that projection.
33 |
34 | The objective of LDA: make the within-class variance as small as possible (J1) and the distance between the class centers as large as possible (J2), i.e. make J1/J2 as small as possible.
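In the standard formulation this is the Fisher criterion: with within-class scatter matrix $S_W$ and between-class scatter matrix $S_B$, choose the projection direction $w$ to maximize

$$J(w) = \frac{w^\top S_B w}{w^\top S_W w},$$

which is the same as making J1/J2 above as small as possible.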
35 |
36 | # Code examples on GitHub
37 |
38 | 1. Linear model:
39 |
40 | https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/1.basic%20liner%20model.py
41 |
42 | 2. Ridge regression
43 |
44 | https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/2.ridge%20analysis.py
45 |
46 | 3. Lasso regression
47 |
48 | https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/3.Lasso%20regression.py
49 |
50 | 4. Elastic Net
51 |
52 | https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/4.%20ElasticNet.py
53 |
54 | 5. Logistic regression
55 |
56 | https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/5.logistic%20regression.py
57 |
58 | 6. LDA
59 |
60 | https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/6.LDA.py
61 |
--------------------------------------------------------------------------------
/10.Ensemble/10.1 Adaboost classifer.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets,cross_validation,ensemble
4 | def load_data_classification():
5 | '''
6 | load data set for classification
7 | :return: train_data, test_data, train_value, test_value
8 | '''
9 | digits=datasets.load_digits()
10 | return cross_validation.train_test_split(digits.data,digits.target,
11 | test_size=0.25,random_state=0,stratify=digits.target)
12 | def test_AdaBoostClassifier(*data):
13 | '''
14 | test Ada score with different number of classifiers
15 | :param data: train_data, test_data, train_value, test_value
16 | :return: None
17 | '''
18 | X_train,X_test,y_train,y_test=data
19 | clf=ensemble.AdaBoostClassifier(learning_rate=0.1)
20 | clf.fit(X_train,y_train)
21 | ## graph
22 | fig=plt.figure()
23 | ax=fig.add_subplot(1,1,1)
24 | estimators_num=len(clf.estimators_)
25 | X=range(1,estimators_num+1)
26 | ax.plot(list(X),list(clf.staged_score(X_train,y_train)),label="Training score")
27 | ax.plot(list(X),list(clf.staged_score(X_test,y_test)),label="Testing score")
28 | ax.set_xlabel("estimator num")
29 | ax.set_ylabel("score")
30 | ax.legend(loc="best")
31 | ax.set_title("AdaBoostClassifier")
32 | plt.show()
33 | def test_AdaBoostClassifier_base_classifier(*data):
34 | '''
35 | test the AdaBoost classifier with different types of base estimator
36 | :param data: train_data, test_data, train_value, test_value
37 | :return: None
38 | '''
39 | from sklearn.naive_bayes import GaussianNB
40 | X_train,X_test,y_train,y_test=data
41 | fig=plt.figure()
42 | ax=fig.add_subplot(2,1,1)
43 |
44 | clf=ensemble.AdaBoostClassifier(learning_rate=0.1)
45 | clf.fit(X_train,y_train)
46 | ## graph
47 | estimators_num=len(clf.estimators_)
48 | X=range(1,estimators_num+1)
49 | ax.plot(list(X),list(clf.staged_score(X_train,y_train)),label="Training score")
50 | ax.plot(list(X),list(clf.staged_score(X_test,y_test)),label="Testing score")
51 | ax.set_xlabel("estimator num")
52 | ax.set_ylabel("score")
53 | ax.legend(loc="lower right")
54 | ax.set_ylim(0,1)
55 | ax.set_title("AdaBoostClassifier with Decision Tree")
56 |
57 | ax=fig.add_subplot(2,1,2)
58 | clf=ensemble.AdaBoostClassifier(learning_rate=0.1,base_estimator=GaussianNB())
59 | clf.fit(X_train,y_train)
60 | ## graph
61 | estimators_num=len(clf.estimators_)
62 | X=range(1,estimators_num+1)
63 | ax.plot(list(X),list(clf.staged_score(X_train,y_train)),label="Training score")
64 | ax.plot(list(X),list(clf.staged_score(X_test,y_test)),label="Testing score")
65 | ax.set_xlabel("estimator num")
66 | ax.set_ylabel("score")
67 | ax.legend(loc="lower right")
68 | ax.set_ylim(0,1)
69 | ax.set_title("AdaBoostClassifier with Gaussian Naive Bayes")
70 | plt.show()
71 | def test_AdaBoostClassifier_learning_rate(*data):
72 | '''
73 | test performance with different learning rate
74 | :param data: train_data, test_data, train_value, test_value
75 | :return: None
76 | '''
77 | X_train,X_test,y_train,y_test=data
78 | learning_rates=np.linspace(0.01,1)
79 | fig=plt.figure()
80 | ax=fig.add_subplot(1,1,1)
81 | traing_scores=[]
82 | testing_scores=[]
83 | for learning_rate in learning_rates:
84 | clf=ensemble.AdaBoostClassifier(learning_rate=learning_rate,n_estimators=500)
85 | clf.fit(X_train,y_train)
86 | traing_scores.append(clf.score(X_train,y_train))
87 | testing_scores.append(clf.score(X_test,y_test))
88 | ax.plot(learning_rates,traing_scores,label="Training score")
89 | ax.plot(learning_rates,testing_scores,label="Testing score")
90 | ax.set_xlabel("learning rate")
91 | ax.set_ylabel("score")
92 | ax.legend(loc="best")
93 | ax.set_title("AdaBoostClassifier")
94 | plt.show()
95 | def test_AdaBoostClassifier_algorithm(*data):
96 | '''
97 | test performance with different algorithm
98 | :param data: train_data, test_data, train_value, test_value
99 | :return: None
100 | '''
101 | X_train,X_test,y_train,y_test=data
102 | algorithms=['SAMME.R','SAMME']
103 | fig=plt.figure()
104 | learning_rates=[0.05,0.1,0.5,0.9]
105 | for i,learning_rate in enumerate(learning_rates):
106 | ax=fig.add_subplot(2,2,i+1)
107 | for i ,algorithm in enumerate(algorithms):
108 | clf=ensemble.AdaBoostClassifier(learning_rate=learning_rate,
109 | algorithm=algorithm)
110 | clf.fit(X_train,y_train)
111 | ## 绘图
112 | estimators_num=len(clf.estimators_)
113 | X=range(1,estimators_num+1)
114 | ax.plot(list(X),list(clf.staged_score(X_train,y_train)),
115 | label="%s:Traing score"%algorithms[i])
116 | ax.plot(list(X),list(clf.staged_score(X_test,y_test)),
117 | label="%s:Testing score"%algorithms[i])
118 | ax.set_xlabel("estimator num")
119 | ax.set_ylabel("score")
120 | ax.legend(loc="lower right")
121 | ax.set_title("learing rate:%f"%learning_rate)
122 | fig.suptitle("AdaBoostClassifier")
123 | plt.show()
124 | if __name__=='__main__':
125 | X_train,X_test,y_train,y_test=load_data_classification()
126 | test_AdaBoostClassifier(X_train,X_test,y_train,y_test)
127 | test_AdaBoostClassifier_base_classifier(X_train,X_test,y_train,y_test)
128 | test_AdaBoostClassifier_learning_rate(X_train,X_test,y_train,y_test)
129 | test_AdaBoostClassifier_algorithm(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/10.Ensemble/10.2 adaboost regression.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets,cross_validation,ensemble
4 |
5 | def load_data_regression():
6 | '''
7 | load the data set for regression (diabetes)
8 | :return: train_data, test_data, train_value, test_value
9 | '''
10 | diabetes = datasets.load_diabetes()
11 | return cross_validation.train_test_split(diabetes.data,diabetes.target,
12 | test_size=0.25,random_state=0)
13 |
14 | def test_AdaBoostRegressor(*data):
15 | '''
16 | test the regressor and plot the score against the number of base estimators
17 | :param data: train_data, test_data, train_value, test_value
18 | :return: None
19 | '''
20 | X_train,X_test,y_train,y_test=data
21 | regr=ensemble.AdaBoostRegressor()
22 | regr.fit(X_train,y_train)
23 | ## graph
24 | fig=plt.figure()
25 | ax=fig.add_subplot(1,1,1)
26 | estimators_num=len(regr.estimators_)
27 | X=range(1,estimators_num+1)
28 | ax.plot(list(X),list(regr.staged_score(X_train,y_train)),label="Training score")
29 | ax.plot(list(X),list(regr.staged_score(X_test,y_test)),label="Testing score")
30 | ax.set_xlabel("estimator num")
31 | ax.set_ylabel("score")
32 | ax.legend(loc="best")
33 | ax.set_title("AdaBoostRegressor")
34 | plt.show()
35 | def test_AdaBoostRegressor_base_regr(*data):
36 | '''
37 | test the regressor with different types of base estimator
38 | :param data: train_data, test_data, train_value, test_value
39 | :return: None
40 | '''
41 | from sklearn.svm import LinearSVR
42 | X_train,X_test,y_train,y_test=data
43 | fig=plt.figure()
44 | regrs=[ensemble.AdaBoostRegressor(),
45 | ensemble.AdaBoostRegressor(base_estimator=LinearSVR(epsilon=0.01,C=100))]
46 | labels=["Decision Tree Regressor","Linear SVM Regressor"]
47 | for i ,regr in enumerate(regrs):
48 | ax=fig.add_subplot(2,1,i+1)
49 | regr.fit(X_train,y_train)
50 | ## graph
51 | estimators_num=len(regr.estimators_)
52 | X=range(1,estimators_num+1)
53 | ax.plot(list(X),list(regr.staged_score(X_train,y_train)),label="Training score")
54 | ax.plot(list(X),list(regr.staged_score(X_test,y_test)),label="Testing score")
55 | ax.set_xlabel("estimator num")
56 | ax.set_ylabel("score")
57 | ax.legend(loc="lower right")
58 | ax.set_ylim(-1,1)
59 | ax.set_title("Base_Estimator:%s"%labels[i])
60 | plt.suptitle("AdaBoostRegressor")
61 | plt.show()
62 | def test_AdaBoostRegressor_learning_rate(*data):
63 | '''
64 | test the performance with different learning rate
65 | :param data: train_data, test_data, train_value, test_value
66 | :return: None
67 | '''
68 | X_train,X_test,y_train,y_test=data
69 | learning_rates=np.linspace(0.01,1)
70 | fig=plt.figure()
71 | ax=fig.add_subplot(1,1,1)
72 | traing_scores=[]
73 | testing_scores=[]
74 | for learning_rate in learning_rates:
75 | regr=ensemble.AdaBoostRegressor(learning_rate=learning_rate,n_estimators=500)
76 | regr.fit(X_train,y_train)
77 | traing_scores.append(regr.score(X_train,y_train))
78 | testing_scores.append(regr.score(X_test,y_test))
79 | ax.plot(learning_rates,traing_scores,label="Training score")
80 | ax.plot(learning_rates,testing_scores,label="Testing score")
81 | ax.set_xlabel("learning rate")
82 | ax.set_ylabel("score")
83 | ax.legend(loc="best")
84 | ax.set_title("AdaBoostRegressor")
85 | plt.show()
86 | def test_AdaBoostRegressor_loss(*data):
87 | '''
88 | test the method with different loss function
89 | :param data: train_data, test_data, train_value, test_value
90 | :return: None
91 | '''
92 | X_train,X_test,y_train,y_test=data
93 | losses=['linear','square','exponential']
94 | fig=plt.figure()
95 | ax=fig.add_subplot(1,1,1)
96 | for i ,loss in enumerate(losses):
97 | regr=ensemble.AdaBoostRegressor(loss=loss,n_estimators=30)
98 | regr.fit(X_train,y_train)
99 | ## graph
100 | estimators_num=len(regr.estimators_)
101 | X=range(1,estimators_num+1)
102 | ax.plot(list(X),list(regr.staged_score(X_train,y_train)),
103 | label="Traing score:loss=%s"%loss)
104 | ax.plot(list(X),list(regr.staged_score(X_test,y_test)),
105 | label="Testing score:loss=%s"%loss)
106 | ax.set_xlabel("estimator num")
107 | ax.set_ylabel("score")
108 | ax.legend(loc="lower right")
109 | ax.set_ylim(-1,1)
110 | plt.suptitle("AdaBoostRegressor")
111 | plt.show()
112 |
113 | if __name__=='__main__':
114 | X_train,X_test,y_train,y_test=load_data_regression()
115 | test_AdaBoostRegressor(X_train,X_test,y_train,y_test)
116 | test_AdaBoostRegressor_base_regr(X_train,X_test,y_train,y_test)
117 | test_AdaBoostRegressor_learning_rate(X_train,X_test,y_train,y_test)
118 | test_AdaBoostRegressor_loss(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/10.Ensemble/10.3 RF_classifier.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets,cross_validation,ensemble
4 | def load_data_classification():
5 | '''
6 | load the data set for classifier
7 | :return: train_data, test_data, train_value, test_value
8 | '''
9 | digits=datasets.load_digits()
10 | return cross_validation.train_test_split(digits.data,digits.target,
11 | test_size=0.25,random_state=0,stratify=digits.target)
12 | def test_RandomForestClassifier(*data):
13 | '''
14 | test the RF method
15 | :param data: train_data, test_data, train_value, test_value
16 | :return: None
17 | '''
18 | X_train,X_test,y_train,y_test=data
19 | clf=ensemble.RandomForestClassifier()
20 | clf.fit(X_train,y_train)
21 | print("Traing Score:%f"%clf.score(X_train,y_train))
22 | print("Testing Score:%f"%clf.score(X_test,y_test))
23 | def test_RandomForestClassifier_num(*data):
24 | '''
25 | test the performance with different n_estimators
26 | :param data: train_data, test_data, train_value, test_value
27 | :return: None
28 | '''
29 | X_train,X_test,y_train,y_test=data
30 | nums=np.arange(1,100,step=2)
31 | fig=plt.figure()
32 | ax=fig.add_subplot(1,1,1)
33 | testing_scores=[]
34 | training_scores=[]
35 | for num in nums:
36 | clf=ensemble.RandomForestClassifier(n_estimators=num)
37 | clf.fit(X_train,y_train)
38 | training_scores.append(clf.score(X_train,y_train))
39 | testing_scores.append(clf.score(X_test,y_test))
40 | ax.plot(nums,training_scores,label="Training Score")
41 | ax.plot(nums,testing_scores,label="Testing Score")
42 | ax.set_xlabel("estimator num")
43 | ax.set_ylabel("score")
44 | ax.legend(loc="lower right")
45 | ax.set_ylim(0,1.05)
46 | plt.suptitle("RandomForestClassifier")
47 | plt.show()
48 | def test_RandomForestClassifier_max_depth(*data):
49 | '''
50 | test the performance with different max_depth
51 | :param data: train_data, test_data, train_value, test_value
52 | :return: None
53 | '''
54 | X_train,X_test,y_train,y_test=data
55 | maxdepths=range(1,20)
56 | fig=plt.figure()
57 | ax=fig.add_subplot(1,1,1)
58 | testing_scores=[]
59 | training_scores=[]
60 | for max_depth in maxdepths:
61 | clf=ensemble.RandomForestClassifier(max_depth=max_depth)
62 | clf.fit(X_train,y_train)
63 | training_scores.append(clf.score(X_train,y_train))
64 | testing_scores.append(clf.score(X_test,y_test))
65 | ax.plot(maxdepths,training_scores,label="Training Score")
66 | ax.plot(maxdepths,testing_scores,label="Testing Score")
67 | ax.set_xlabel("max_depth")
68 | ax.set_ylabel("score")
69 | ax.legend(loc="lower right")
70 | ax.set_ylim(0,1.05)
71 | plt.suptitle("RandomForestClassifier")
72 | plt.show()
73 | def test_RandomForestClassifier_max_features(*data):
74 | '''
75 | test the performance with different max_features
76 | :param data: train_data, test_data, train_value, test_value
77 | :return: None
78 | '''
79 | X_train,X_test,y_train,y_test=data
80 | max_features=np.linspace(0.01,1.0)
81 | fig=plt.figure()
82 | ax=fig.add_subplot(1,1,1)
83 | testing_scores=[]
84 | training_scores=[]
85 | for max_feature in max_features:
86 | clf=ensemble.RandomForestClassifier(max_features=max_feature)
87 | clf.fit(X_train,y_train)
88 | training_scores.append(clf.score(X_train,y_train))
89 | testing_scores.append(clf.score(X_test,y_test))
90 | ax.plot(max_features,training_scores,label="Training Score")
91 | ax.plot(max_features,testing_scores,label="Testing Score")
92 | ax.set_xlabel("max_feature")
93 | ax.set_ylabel("score")
94 | ax.legend(loc="lower right")
95 | ax.set_ylim(0,1.05)
96 | plt.suptitle("RandomForestClassifier")
97 | plt.show()
98 | if __name__=='__main__':
99 | X_train,X_test,y_train,y_test=load_data_classification()
100 | test_RandomForestClassifier(X_train,X_test,y_train,y_test)
101 | test_RandomForestClassifier_num(X_train,X_test,y_train,y_test)
102 | test_RandomForestClassifier_max_depth(X_train,X_test,y_train,y_test)
103 | test_RandomForestClassifier_max_features(X_train,X_test,y_train,y_test)
104 |
--------------------------------------------------------------------------------
/10.Ensemble/10.4 RF_regression.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets,cross_validation,ensemble
4 | def load_data_regression():
5 | '''
6 | load the diabetes for regression
7 | :return: train_data, test_data, train_value, test_value
8 | '''
9 | diabetes = datasets.load_diabetes()
10 | return cross_validation.train_test_split(diabetes.data,diabetes.target,
11 | test_size=0.25,random_state=0)
12 | def test_RandomForestRegressor(*data):
13 | '''
14 | test the RF method
15 | :param data: train_data, test_data, train_value, test_value
16 | :return: None
17 | '''
18 | X_train,X_test,y_train,y_test=data
19 | regr=ensemble.RandomForestRegressor()
20 | regr.fit(X_train,y_train)
21 | print("Traing Score:%f"%regr.score(X_train,y_train))
22 | print("Testing Score:%f"%regr.score(X_test,y_test))
23 | def test_RandomForestRegressor_num(*data):
24 | '''
25 | test the performance with different n_estimators
26 | :param data: train_data, test_data, train_value, test_value
27 | :return: None
28 | '''
29 | X_train,X_test,y_train,y_test=data
30 | nums=np.arange(1,100,step=2)
31 | fig=plt.figure()
32 | ax=fig.add_subplot(1,1,1)
33 | testing_scores=[]
34 | training_scores=[]
35 | for num in nums:
36 | regr=ensemble.RandomForestRegressor(n_estimators=num)
37 | regr.fit(X_train,y_train)
38 | training_scores.append(regr.score(X_train,y_train))
39 | testing_scores.append(regr.score(X_test,y_test))
40 | ax.plot(nums,training_scores,label="Training Score")
41 | ax.plot(nums,testing_scores,label="Testing Score")
42 | ax.set_xlabel("estimator num")
43 | ax.set_ylabel("score")
44 | ax.legend(loc="lower right")
45 | ax.set_ylim(-1,1)
46 | plt.suptitle("RandomForestRegressor")
47 | plt.show()
48 | def test_RandomForestRegressor_max_depth(*data):
49 | '''
50 | test the performance with different max_depth
51 | :param data: train_data, test_data, train_value, test_value
52 | :return: None
53 | '''
54 | X_train,X_test,y_train,y_test=data
55 | maxdepths=range(1,20)
56 | fig=plt.figure()
57 | ax=fig.add_subplot(1,1,1)
58 | testing_scores=[]
59 | training_scores=[]
60 | for max_depth in maxdepths:
61 | regr=ensemble.RandomForestRegressor(max_depth=max_depth)
62 | regr.fit(X_train,y_train)
63 | training_scores.append(regr.score(X_train,y_train))
64 | testing_scores.append(regr.score(X_test,y_test))
65 | ax.plot(maxdepths,training_scores,label="Training Score")
66 | ax.plot(maxdepths,testing_scores,label="Testing Score")
67 | ax.set_xlabel("max_depth")
68 | ax.set_ylabel("score")
69 | ax.legend(loc="lower right")
70 | ax.set_ylim(0,1.05)
71 | plt.suptitle("RandomForestRegressor")
72 | plt.show()
73 | def test_RandomForestRegressor_max_features(*data):
74 | '''
75 | test the performance with different max_features
76 | :param data: train_data, test_data, train_value, test_value
77 | :return: None
78 | '''
79 | X_train,X_test,y_train,y_test=data
80 | max_features=np.linspace(0.01,1.0)
81 | fig=plt.figure()
82 | ax=fig.add_subplot(1,1,1)
83 | testing_scores=[]
84 | training_scores=[]
85 | for max_feature in max_features:
86 | regr=ensemble.RandomForestRegressor(max_features=max_feature)
87 | regr.fit(X_train,y_train)
88 | training_scores.append(regr.score(X_train,y_train))
89 | testing_scores.append(regr.score(X_test,y_test))
90 | ax.plot(max_features,training_scores,label="Training Score")
91 | ax.plot(max_features,testing_scores,label="Testing Score")
92 | ax.set_xlabel("max_feature")
93 | ax.set_ylabel("score")
94 | ax.legend(loc="lower right")
95 | ax.set_ylim(0,1.05)
96 | plt.suptitle("RandomForestRegressor")
97 | plt.show()
98 | if __name__=='__main__':
99 | X_train,X_test,y_train,y_test=load_data_regression()
100 | test_RandomForestRegressor(X_train,X_test,y_train,y_test)
101 | test_RandomForestRegressor_num(X_train,X_test,y_train,y_test)
102 | test_RandomForestRegressor_max_depth(X_train,X_test,y_train,y_test)
103 | test_RandomForestRegressor_max_features(X_train,X_test,y_train,y_test)
104 |
--------------------------------------------------------------------------------
/10.Ensemble/10.5 Gradient_Classifier.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets,cross_validation,ensemble
4 |
5 | def load_data_classification():
6 | '''
7 | load the digits data set for classification
8 | :return: train_data, test_data, train_value, test_value
9 | '''
10 | digits=datasets.load_digits()
11 | return cross_validation.train_test_split(digits.data,digits.target,
12 | test_size=0.25,random_state=0,stratify=digits.target)
13 | def test_GradientBoostingClassifier(*data):
14 | '''
15 | test the method
16 | :param data: train_data, test_data, train_value, test_value
17 | :return: None
18 | '''
19 | X_train,X_test,y_train,y_test=data
20 | clf=ensemble.GradientBoostingClassifier()
21 | clf.fit(X_train,y_train)
22 | print("Traing Score:%f"%clf.score(X_train,y_train))
23 | print("Testing Score:%f"%clf.score(X_test,y_test))
24 | def test_GradientBoostingClassifier_num(*data):
25 | '''
26 | test the performance with different n_estimators
27 | :param data: train_data, test_data, train_value, test_value
28 | :return: None
29 | '''
30 | X_train,X_test,y_train,y_test=data
31 | nums=np.arange(1,100,step=2)
32 | fig=plt.figure()
33 | ax=fig.add_subplot(1,1,1)
34 | testing_scores=[]
35 | training_scores=[]
36 | for num in nums:
37 | clf=ensemble.GradientBoostingClassifier(n_estimators=num)
38 | clf.fit(X_train,y_train)
39 | training_scores.append(clf.score(X_train,y_train))
40 | testing_scores.append(clf.score(X_test,y_test))
41 | ax.plot(nums,training_scores,label="Training Score")
42 | ax.plot(nums,testing_scores,label="Testing Score")
43 | ax.set_xlabel("estimator num")
44 | ax.set_ylabel("score")
45 | ax.legend(loc="lower right")
46 | ax.set_ylim(0,1.05)
47 | plt.suptitle("GradientBoostingClassifier")
48 | plt.show()
49 | def test_GradientBoostingClassifier_maxdepth(*data):
50 | '''
51 | test the performance with different max_depth
52 | :param data: train_data, test_data, train_value, test_value
53 | :return: None
54 | '''
55 | X_train,X_test,y_train,y_test=data
56 | maxdepths=np.arange(1,20)
57 | fig=plt.figure()
58 | ax=fig.add_subplot(1,1,1)
59 | testing_scores=[]
60 | training_scores=[]
61 | for maxdepth in maxdepths:
62 | clf=ensemble.GradientBoostingClassifier(max_depth=maxdepth,max_leaf_nodes=None)
63 | clf.fit(X_train,y_train)
64 | training_scores.append(clf.score(X_train,y_train))
65 | testing_scores.append(clf.score(X_test,y_test))
66 | ax.plot(maxdepths,training_scores,label="Training Score")
67 | ax.plot(maxdepths,testing_scores,label="Testing Score")
68 | ax.set_xlabel("max_depth")
69 | ax.set_ylabel("score")
70 | ax.legend(loc="lower right")
71 | ax.set_ylim(0,1.05)
72 | plt.suptitle("GradientBoostingClassifier")
73 | plt.show()
74 | def test_GradientBoostingClassifier_learning(*data):
75 | '''
76 | test the performance with different learning rate
77 | :param data: train_data, test_data, train_value, test_value
78 | :return: None
79 | '''
80 | X_train,X_test,y_train,y_test=data
81 | learnings=np.linspace(0.01,1.0)
82 | fig=plt.figure()
83 | ax=fig.add_subplot(1,1,1)
84 | testing_scores=[]
85 | training_scores=[]
86 | for learning in learnings:
87 | clf=ensemble.GradientBoostingClassifier(learning_rate=learning)
88 | clf.fit(X_train,y_train)
89 | training_scores.append(clf.score(X_train,y_train))
90 | testing_scores.append(clf.score(X_test,y_test))
91 | ax.plot(learnings,training_scores,label="Training Score")
92 | ax.plot(learnings,testing_scores,label="Testing Score")
93 | ax.set_xlabel("learning_rate")
94 | ax.set_ylabel("score")
95 | ax.legend(loc="lower right")
96 | ax.set_ylim(0,1.05)
97 | plt.suptitle("GradientBoostingClassifier")
98 | plt.show()
99 | def test_GradientBoostingClassifier_subsample(*data):
100 | '''
101 | test the performance with different subsample
102 | :param data: train_data, test_data, train_value, test_value
103 | :return: None
104 | '''
105 | X_train,X_test,y_train,y_test=data
106 | fig=plt.figure()
107 | ax=fig.add_subplot(1,1,1)
108 | subsamples=np.linspace(0.01,1.0)
109 | testing_scores=[]
110 | training_scores=[]
111 | for subsample in subsamples:
112 | clf=ensemble.GradientBoostingClassifier(subsample=subsample)
113 | clf.fit(X_train,y_train)
114 | training_scores.append(clf.score(X_train,y_train))
115 | testing_scores.append(clf.score(X_test,y_test))
116 | ax.plot(subsamples,training_scores,label="Training Score")
117 | ax.plot(subsamples,testing_scores,label="Testing Score")
118 | ax.set_xlabel("subsample")
119 | ax.set_ylabel("score")
120 | ax.legend(loc="lower right")
121 | ax.set_ylim(0,1.05)
122 | plt.suptitle("GradientBoostingClassifier")
123 | plt.show()
124 | def test_GradientBoostingClassifier_max_features(*data):
125 | '''
126 | test the performance with different max_features
127 | :param data: train_data, test_data, train_value, test_value
128 | :return: None
129 | '''
130 | X_train,X_test,y_train,y_test=data
131 | fig=plt.figure()
132 | ax=fig.add_subplot(1,1,1)
133 | max_features=np.linspace(0.01,1.0)
134 | testing_scores=[]
135 | training_scores=[]
136 | for features in max_features:
137 | clf=ensemble.GradientBoostingClassifier(max_features=features)
138 | clf.fit(X_train,y_train)
139 | training_scores.append(clf.score(X_train,y_train))
140 | testing_scores.append(clf.score(X_test,y_test))
141 | ax.plot(max_features,training_scores,label="Training Score")
142 | ax.plot(max_features,testing_scores,label="Testing Score")
143 | ax.set_xlabel("max_features")
144 | ax.set_ylabel("score")
145 | ax.legend(loc="lower right")
146 | ax.set_ylim(0,1.05)
147 | plt.suptitle("GradientBoostingClassifier")
148 | plt.show()
149 | if __name__=='__main__':
150 | X_train,X_test,y_train,y_test=load_data_classification()
151 | test_GradientBoostingClassifier(X_train,X_test,y_train,y_test)
152 | test_GradientBoostingClassifier_num(X_train,X_test,y_train,y_test)
153 | test_GradientBoostingClassifier_maxdepth(X_train,X_test,y_train,y_test)
154 | test_GradientBoostingClassifier_learning(X_train,X_test,y_train,y_test)
155 | test_GradientBoostingClassifier_subsample(X_train,X_test,y_train,y_test)
156 | test_GradientBoostingClassifier_max_features(X_train,X_test,y_train,y_test)
157 |
--------------------------------------------------------------------------------
/10.Ensemble/10.6 Gradient_regresion.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets,cross_validation,ensemble
4 |
5 | def load_data_regression():
6 | '''
7 | load the diabetes data for regression
8 | :return: train_data, test_data, train_value, test_value
9 | '''
10 | diabetes = datasets.load_diabetes()
11 | return cross_validation.train_test_split(diabetes.data,diabetes.target,
12 | test_size=0.25,random_state=0)
13 | def test_GradientBoostingRegressor(*data):
14 | '''
15 | test the method
16 | :param data: train_data, test_data, train_value, test_value
17 | :return: None
18 | '''
19 | X_train,X_test,y_train,y_test=data
20 | regr=ensemble.GradientBoostingRegressor()
21 | regr.fit(X_train,y_train)
22 | print("Training score:%f"%regr.score(X_train,y_train))
23 | print("Testing score:%f"%regr.score(X_test,y_test))
24 | def test_GradientBoostingRegressor_num(*data):
25 | '''
26 | test the performance with different n_estimators
27 | :param data: train_data, test_data, train_value, test_value
28 | :return: None
29 | '''
30 | X_train,X_test,y_train,y_test=data
31 | nums=np.arange(1,200,step=2)
32 | fig=plt.figure()
33 | ax=fig.add_subplot(1,1,1)
34 | testing_scores=[]
35 | training_scores=[]
36 | for num in nums:
37 | regr=ensemble.GradientBoostingRegressor(n_estimators=num)
38 | regr.fit(X_train,y_train)
39 | training_scores.append(regr.score(X_train,y_train))
40 | testing_scores.append(regr.score(X_test,y_test))
41 | ax.plot(nums,training_scores,label="Training Score")
42 | ax.plot(nums,testing_scores,label="Testing Score")
43 | ax.set_xlabel("estimator num")
44 | ax.set_ylabel("score")
45 | ax.legend(loc="lower right")
46 | ax.set_ylim(0,1.05)
47 | plt.suptitle("GradientBoostingRegressor")
48 | plt.show()
49 | def test_GradientBoostingRegressor_maxdepth(*data):
50 | '''
51 | test the performance with different max_depth
52 | :param data: train_data, test_data, train_value, test_value
53 | :return: None
54 | '''
55 | X_train,X_test,y_train,y_test=data
56 | maxdepths=np.arange(1,20)
57 | fig=plt.figure()
58 | ax=fig.add_subplot(1,1,1)
59 | testing_scores=[]
60 | training_scores=[]
61 | for maxdepth in maxdepths:
62 | regr=ensemble.GradientBoostingRegressor(max_depth=maxdepth,max_leaf_nodes=None)
63 | regr.fit(X_train,y_train)
64 | training_scores.append(regr.score(X_train,y_train))
65 | testing_scores.append(regr.score(X_test,y_test))
66 | ax.plot(maxdepths,training_scores,label="Training Score")
67 | ax.plot(maxdepths,testing_scores,label="Testing Score")
68 | ax.set_xlabel("max_depth")
69 | ax.set_ylabel("score")
70 | ax.legend(loc="lower right")
71 | ax.set_ylim(-1,1.05)
72 | plt.suptitle("GradientBoostingRegressor")
73 | plt.show()
74 | def test_GradientBoostingRegressor_learning(*data):
75 | '''
76 | test the performance with different learning rate
77 | :param data: train_data, test_data, train_value, test_value
78 | :return: None
79 | '''
80 | X_train,X_test,y_train,y_test=data
81 | learnings=np.linspace(0.01,1.0)
82 | fig=plt.figure()
83 | ax=fig.add_subplot(1,1,1)
84 | testing_scores=[]
85 | training_scores=[]
86 | for learning in learnings:
87 | regr=ensemble.GradientBoostingRegressor(learning_rate=learning)
88 | regr.fit(X_train,y_train)
89 | training_scores.append(regr.score(X_train,y_train))
90 | testing_scores.append(regr.score(X_test,y_test))
91 | ax.plot(learnings,training_scores,label="Training Score")
92 | ax.plot(learnings,testing_scores,label="Testing Score")
93 | ax.set_xlabel("learning_rate")
94 | ax.set_ylabel("score")
95 | ax.legend(loc="lower right")
96 | ax.set_ylim(-1,1.05)
97 | plt.suptitle("GradientBoostingRegressor")
98 | plt.show()
99 | def test_GradientBoostingRegressor_subsample(*data):
100 | '''
101 | test the performance with different subsample
102 | :param data: train_data, test_data, train_value, test_value
103 | :return: None
104 | '''
105 | X_train,X_test,y_train,y_test=data
106 | fig=plt.figure()
107 | ax=fig.add_subplot(1,1,1)
108 | subsamples=np.linspace(0.01,1.0,num=20)
109 | testing_scores=[]
110 | training_scores=[]
111 | for subsample in subsamples:
112 | regr=ensemble.GradientBoostingRegressor(subsample=subsample)
113 | regr.fit(X_train,y_train)
114 | training_scores.append(regr.score(X_train,y_train))
115 | testing_scores.append(regr.score(X_test,y_test))
116 | ax.plot(subsamples,training_scores,label="Training Score")
117 | ax.plot(subsamples,testing_scores,label="Testing Score")
118 | ax.set_xlabel("subsample")
119 | ax.set_ylabel("score")
120 | ax.legend(loc="lower right")
121 | ax.set_ylim(-1,1.05)
122 | plt.suptitle("GradientBoostingRegressor")
123 | plt.show()
124 | def test_GradientBoostingRegressor_loss(*data):
125 | '''
126 | test the performance with different loss functions and alpha values
127 | :param data: train_data, test_data, train_value, test_value
128 | :return: None
129 | '''
130 | X_train,X_test,y_train,y_test=data
131 | fig=plt.figure()
132 | nums=np.arange(1,200,step=2)
133 | ########## graph huber ######
134 | ax=fig.add_subplot(2,1,1)
135 | alphas=np.linspace(0.01,1.0,endpoint=False,num=5)
136 | for alpha in alphas:
137 | testing_scores=[]
138 | training_scores=[]
139 | for num in nums:
140 | regr=ensemble.GradientBoostingRegressor(n_estimators=num,
141 | loss='huber',alpha=alpha)
142 | regr.fit(X_train,y_train)
143 | training_scores.append(regr.score(X_train,y_train))
144 | testing_scores.append(regr.score(X_test,y_test))
145 | ax.plot(nums,training_scores,label="Training Score:alpha=%f"%alpha)
146 | ax.plot(nums,testing_scores,label="Testing Score:alpha=%f"%alpha)
147 | ax.set_xlabel("estimator num")
148 | ax.set_ylabel("score")
149 | ax.legend(loc="lower right",framealpha=0.4)
150 | ax.set_ylim(0,1.05)
151 | ax.set_title("loss=%huber")
152 | plt.suptitle("GradientBoostingRegressor")
153 | #### graph ls and lad
154 | ax=fig.add_subplot(2,1,2)
155 | for loss in ['ls','lad']:
156 | testing_scores=[]
157 | training_scores=[]
158 | for num in nums:
159 | regr=ensemble.GradientBoostingRegressor(n_estimators=num,loss=loss)
160 | regr.fit(X_train,y_train)
161 | training_scores.append(regr.score(X_train,y_train))
162 | testing_scores.append(regr.score(X_test,y_test))
163 | ax.plot(nums,training_scores,label="Training Score:loss=%s"%loss)
164 | ax.plot(nums,testing_scores,label="Testing Score:loss=%s"%loss)
165 | ax.set_xlabel("estimator num")
166 | ax.set_ylabel("score")
167 | ax.legend(loc="lower right",framealpha=0.4)
168 | ax.set_ylim(0,1.05)
169 | ax.set_title("loss=ls,lad")
170 | plt.suptitle("GradientBoostingRegressor")
171 | plt.show()
172 | def test_GradientBoostingRegressor_max_features(*data):
173 | '''
174 | test the performance with different max_features
175 | :param data: train_data, test_data, train_value, test_value
176 | :return: None
177 | '''
178 | X_train,X_test,y_train,y_test=data
179 | fig=plt.figure()
180 | ax=fig.add_subplot(1,1,1)
181 | max_features=np.linspace(0.01,1.0)
182 | testing_scores=[]
183 | training_scores=[]
184 | for features in max_features:
185 | regr=ensemble.GradientBoostingRegressor(max_features=features)
186 | regr.fit(X_train,y_train)
187 | training_scores.append(regr.score(X_train,y_train))
188 | testing_scores.append(regr.score(X_test,y_test))
189 | ax.plot(max_features,training_scores,label="Training Score")
190 | ax.plot(max_features,testing_scores,label="Testing Score")
191 | ax.set_xlabel("max_features")
192 | ax.set_ylabel("score")
193 | ax.legend(loc="lower right")
194 | ax.set_ylim(0,1.05)
195 | plt.suptitle("GradientBoostingRegressor")
196 | plt.show()
197 |
198 | if __name__=='__main__':
199 | X_train,X_test,y_train,y_test=load_data_regression()
200 | test_GradientBoostingRegressor(X_train,X_test,y_train,y_test)
201 | test_GradientBoostingRegressor_num(X_train,X_test,y_train,y_test)
202 | test_GradientBoostingRegressor_maxdepth(X_train,X_test,y_train,y_test)
203 | test_GradientBoostingRegressor_learning(X_train,X_test,y_train,y_test)
204 | test_GradientBoostingRegressor_subsample(X_train,X_test,y_train,y_test)
205 | test_GradientBoostingRegressor_loss(X_train,X_test,y_train,y_test)
206 | test_GradientBoostingRegressor_max_features(X_train,X_test,y_train,y_test)
207 |
--------------------------------------------------------------------------------
/10.Ensemble/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Ensemble learning is essentially the fusion of multiple algorithms, or, put differently, an advanced voting mechanism.
4 |
5 | # Two main families
6 |
7 | Boosting: the individual models have strong dependencies on one another and are generated sequentially.
8 |
9 | Bagging: the individual models have no strong dependencies on one another and can be generated in parallel.
10 |
11 | # Boosting
12 |
13 | Boosting is a common statistical learning method. Its representative is the AdaBoost algorithm, which turns weak learners into a strong learner by re-weighting the training samples.
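A minimal sketch of the idea (assuming a recent scikit-learn, where train_test_split lives in sklearn.model_selection; see 10.1 Adaboost classifer.py for the repo's full example):

```python
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=0)

# each additional weak learner focuses on the samples the previous ones got wrong
clf = AdaBoostClassifier(n_estimators=50, learning_rate=0.1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
```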
14 |
15 | # Bagging
16 |
17 | Bagging is based on bootstrap sampling; its representative is the Random Forest.
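A minimal Random Forest sketch under the same assumptions (see 10.3 RF_classifier.py for the repo's full example):

```python
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=0)

# each tree is trained on a bootstrap sample; the forest votes on the final label
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
```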
18 |
19 | # Code examples on GitHub
20 |
21 | 1. Adaboost classifier
22 |
23 | https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.1%20Adaboost%20classifer.py
24 |
25 | 2. Adaboost regression
26 |
27 | https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.2%20adaboost%20regression.py
28 |
29 | 3. RF_classifier
30 |
31 | https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.3%20RF_classifier.py
32 |
33 | 4. RF_regression
34 |
35 | https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.4%20RF_regression.py
36 |
37 | 5. Gradient_Classifier
38 |
39 | https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.5%20Gradient_Classifier.py
40 |
41 | 6. Gradient_Regression
42 |
43 | https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.6%20Gradient_regresion.py
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/11. Preprocessing/11.1 Binarize.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import Binarizer
2 | def test_Binarizer():
3 | '''
4 | test the Binarizer method
5 | :return: None
6 | '''
7 | X=[ [1,2,3,4,5],
8 | [5,4,3,2,1],
9 | [3,3,3,3,3,],
10 | [1,1,1,1,1] ]
11 | print("before transform:",X)
12 | binarizer=Binarizer(threshold=2.5)
13 | print("after transform:",binarizer.transform(X))
14 |
15 | if __name__=='__main__':
16 | test_Binarizer()
--------------------------------------------------------------------------------
/11. Preprocessing/11.2 One-hot encoder.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import OneHotEncoder
2 | def test_OneHotEncoder():
3 | '''
4 | test the method
5 | :return: None
6 | '''
7 | X=[ [1,2,3,4,5],
8 | [5,4,3,2,1],
9 | [3,3,3,3,3,],
10 | [1,1,1,1,1] ]
11 | print("before transform:",X)
12 | encoder=OneHotEncoder(sparse=False)
13 | encoder.fit(X)
14 | print("active_features_:",encoder.active_features_)
15 | print("feature_indices_:",encoder.feature_indices_)
16 | print("n_values_:",encoder.n_values_)
17 | print("after transform:",encoder.transform( [[1,2,3,4,5]]))
18 | if __name__=='__main__':
19 | test_OneHotEncoder()
--------------------------------------------------------------------------------
/11. Preprocessing/11.3 normalize.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import Normalizer
2 | def test_Normalizer():
3 | '''
4 | test the method
5 | :return: None
6 | '''
7 | X=[ [1,2,3,4,5],
8 | [5,4,3,2,1],
9 | [1,3,5,2,4,],
10 | [2,4,1,3,5] ]
11 | print("before transform:",X)
12 | normalizer=Normalizer(norm='l2')
13 | print("after transform:",normalizer.transform(X))
14 |
15 | if __name__=='__main__':
16 | test_Normalizer()
--------------------------------------------------------------------------------
/11. Preprocessing/11.4 standardize.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler,StandardScaler
2 |
3 | def test_MinMaxScaler():
4 | '''
5 | test the method of MinMax Scaler
6 | :return: None
7 | '''
8 | X=[ [1,5,1,2,10],
9 | [2,6,3,2,7],
10 | [3,7,5,6,4,],
11 | [4,8,7,8,1] ]
12 | print("before transform:",X)
13 | scaler=MinMaxScaler(feature_range=(0,2))
14 | scaler.fit(X)
15 | print("min_ is :",scaler.min_)
16 | print("scale_ is :",scaler.scale_)
17 | print("data_max_ is :",scaler.data_max_)
18 | print("data_min_ is :",scaler.data_min_)
19 | print("data_range_ is :",scaler.data_range_)
20 | print("after transform:",scaler.transform(X))
21 | def test_MaxAbsScaler():
22 | '''
23 | test the method of MaxAbs Scaler
24 |
25 | :return: None
26 | '''
27 | X=[ [1,5,1,2,10],
28 | [2,6,3,2,7],
29 | [3,7,5,6,4,],
30 | [4,8,7,8,1] ]
31 | print("before transform:",X)
32 | scaler=MaxAbsScaler()
33 | scaler.fit(X)
34 | print("scale_ is :",scaler.scale_)
35 | print("max_abs_ is :",scaler.max_abs_)
36 | print("after transform:",scaler.transform(X))
37 | def test_StandardScaler():
38 | '''
39 | test the method of Standard Scaler
40 | :return: None
41 | '''
42 | X=[ [1,5,1,2,10],
43 | [2,6,3,2,7],
44 | [3,7,5,6,4,],
45 | [4,8,7,8,1] ]
46 | print("before transform:",X)
47 | scaler=StandardScaler()
48 | scaler.fit(X)
49 | print("scale_ is :",scaler.scale_)
50 | print("mean_ is :",scaler.mean_)
51 | print("var_ is :",scaler.var_)
52 | print("after transform:",scaler.transform(X))
53 |
54 | if __name__=='__main__':
55 | test_MinMaxScaler()
56 | test_MaxAbsScaler()
57 |     test_StandardScaler()
--------------------------------------------------------------------------------
/11. Preprocessing/11.5 feature_seleticon_filter.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_selection import VarianceThreshold,SelectKBest,f_classif
2 |
3 | def test_VarianceThreshold():
4 | '''
5 | test the method of VarianceThreshold
6 | :return: None
7 | '''
8 | X=[[100,1,2,3],
9 | [100,4,5,6],
10 | [100,7,8,9],
11 | [101,11,12,13]]
12 | selector=VarianceThreshold(1)
13 | selector.fit(X)
14 | print("Variances is %s"%selector.variances_)
15 | print("After transform is %s"%selector.transform(X))
16 |     print("The support is %s"%selector.get_support(True))
17 | print("After reverse transform is %s"%
18 | selector.inverse_transform(selector.transform(X)))
19 | def test_SelectKBest():
20 | '''
21 |     test the SelectKBest method
22 | :return: None
23 | '''
24 | X=[ [1,2,3,4,5],
25 | [5,4,3,2,1],
26 | [3,3,3,3,3,],
27 | [1,1,1,1,1] ]
28 | y=[0,1,0,1]
29 | print("before transform:",X)
30 | selector=SelectKBest(score_func=f_classif,k=3)
31 | selector.fit(X,y)
32 | print("scores_:",selector.scores_)
33 | print("pvalues_:",selector.pvalues_)
34 | print("selected index:",selector.get_support(True))
35 | print("after transform:",selector.transform(X))
36 | if __name__=='__main__':
37 | test_VarianceThreshold()
38 | test_SelectKBest()
--------------------------------------------------------------------------------
/11. Preprocessing/11.6 feature_selection_bagging.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_selection import RFE,RFECV
2 | from sklearn.svm import LinearSVC
3 | from sklearn.datasets import load_iris
4 | from sklearn import cross_validation
5 |
6 | def test_RFE():
7 | '''
8 |     test the RFE method, selecting 2 features
9 | :return: None
10 | '''
11 | iris=load_iris()
12 | X=iris.data
13 | y=iris.target
14 | estimator=LinearSVC()
15 | selector=RFE(estimator=estimator,n_features_to_select=2)
16 | selector.fit(X,y)
17 | print("N_features %s"%selector.n_features_)
18 | print("Support is %s"%selector.support_)
19 | print("Ranking %s"%selector.ranking_)
20 | def test_RFECV():
21 | '''
22 | test the method of RFECV
23 | :return: None
24 | '''
25 | iris=load_iris()
26 | X=iris.data
27 | y=iris.target
28 | estimator=LinearSVC()
29 | selector=RFECV(estimator=estimator,cv=3)
30 | selector.fit(X,y)
31 | print("N_features %s"%selector.n_features_)
32 | print("Support is %s"%selector.support_)
33 | print("Ranking %s"%selector.ranking_)
34 | print("Grid Scores %s"%selector.grid_scores_)
35 | def test_compare_with_no_feature_selection():
36 | '''
37 | compare the result before the selection and after
38 | :return: None
39 | '''
40 | iris=load_iris()
41 | X,y=iris.data,iris.target
42 | estimator=LinearSVC()
43 | selector=RFE(estimator=estimator,n_features_to_select=2)
44 | X_t=selector.fit_transform(X,y)
45 | X_train,X_test,y_train,y_test=cross_validation.train_test_split(X, y,
46 | test_size=0.25,random_state=0,stratify=y)
47 | X_train_t,X_test_t,y_train_t,y_test_t=cross_validation.train_test_split(X_t, y,
48 | test_size=0.25,random_state=0,stratify=y)
49 | clf=LinearSVC()
50 | clf_t=LinearSVC()
51 | clf.fit(X_train,y_train)
52 | clf_t.fit(X_train_t,y_train_t)
53 | print("Original DataSet: test score=%s"%(clf.score(X_test,y_test)))
54 | print("Selected DataSet: test score=%s"%(clf_t.score(X_test_t,y_test_t)))
55 | if __name__=='__main__':
56 | test_RFE()
57 | test_compare_with_no_feature_selection()
58 | test_RFECV()
--------------------------------------------------------------------------------
/11. Preprocessing/11.7 feature_selection_embeded.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_selection import SelectFromModel
2 | from sklearn.svm import LinearSVC
3 | from sklearn.datasets import load_digits,load_diabetes
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | from sklearn.linear_model import Lasso
7 |
8 | def test_SelectFromModel():
9 | '''
10 | test the method of SelectFromModel
11 | :return: None
12 | '''
13 | digits=load_digits()
14 | X=digits.data
15 | y=digits.target
16 | estimator=LinearSVC(penalty='l1',dual=False)
17 | selector=SelectFromModel(estimator=estimator,threshold='mean')
18 | selector.fit(X,y)
19 | selector.transform(X)
20 | print("Threshold %s"%selector.threshold_)
21 | print("Support is %s"%selector.get_support(indices=True))
22 | def test_Lasso(*data):
23 | '''
24 |     test the relation between alpha and the sparsity of the Lasso coefficients
25 |     :param data: train_data, train_value
26 | :return: None
27 | '''
28 | X,y=data
29 | alphas=np.logspace(-2,2)
30 | zeros=[]
31 | for alpha in alphas:
32 | regr=Lasso(alpha=alpha)
33 | regr.fit(X,y)
34 | num=0
35 | for ele in regr.coef_:
36 | if abs(ele) < 1e-5:num+=1
37 | zeros.append(num)
38 | fig=plt.figure()
39 | ax=fig.add_subplot(1,1,1)
40 | ax.plot(alphas,zeros)
41 | ax.set_xlabel(r"$\alpha$")
42 | ax.set_xscale("log")
43 | ax.set_ylim(0,X.shape[1]+1)
44 | ax.set_ylabel("zeros in coef")
45 | ax.set_title("Sparsity In Lasso")
46 | plt.show()
47 | def test_LinearSVC(*data):
48 | '''
49 |     test the relation between C and the sparsity of the LinearSVC coefficients
50 |     :param data: train_data, train_value
51 | :return: None
52 | '''
53 | X,y=data
54 | Cs=np.logspace(-2,2)
55 | zeros=[]
56 | for C in Cs:
57 | clf=LinearSVC(C=C,penalty='l1',dual=False)
58 | clf.fit(X,y)
59 |
60 | num=0
61 | for row in clf.coef_:
62 | for ele in row:
63 | if abs(ele) < 1e-5:num+=1
64 | zeros.append(num)
65 |
66 | fig=plt.figure()
67 | ax=fig.add_subplot(1,1,1)
68 | ax.plot(Cs,zeros)
69 | ax.set_xlabel("C")
70 | ax.set_xscale("log")
71 | ax.set_ylabel("zeros in coef")
72 | ax.set_title("Sparsity In SVM")
73 | plt.show()
74 | if __name__=='__main__':
75 | test_SelectFromModel()
76 | data=load_diabetes()
77 | test_Lasso(data.data,data.target)
78 | data=load_digits()
79 | test_LinearSVC(data.data,data.target)
--------------------------------------------------------------------------------
/11. Preprocessing/11.8 pipeline.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from sklearn.svm import LinearSVC
4 | from sklearn.datasets import load_digits
5 | from sklearn import cross_validation
6 | from sklearn.linear_model import LogisticRegression
7 | from sklearn.pipeline import Pipeline
8 | def test_Pipeline(data):
9 | '''
10 | test the pipeline
11 | :param data: train_data, test_data, train_value, test_value
12 | :return: None
13 | '''
14 | X_train,X_test,y_train,y_test=data
15 | steps=[("Linear_SVM",LinearSVC(C=1,penalty='l1',dual=False)),
16 | ("LogisticRegression",LogisticRegression(C=1))]
17 | pipeline=Pipeline(steps)
18 | pipeline.fit(X_train,y_train)
19 | print("Named steps:",pipeline.named_steps)
20 | print("Pipeline Score:",pipeline.score(X_test,y_test))
21 | if __name__=='__main__':
22 | data=load_digits()
23 | test_Pipeline(cross_validation.train_test_split(data.data, data.target,test_size=0.25
24 | ,random_state=0,stratify=data.target))
--------------------------------------------------------------------------------
/11. Preprocessing/11.9 dictionary learning.py:
--------------------------------------------------------------------------------
1 | from sklearn.decomposition import DictionaryLearning
2 |
3 | def test_DictionaryLearning():
4 | '''
5 | test the DL method
6 | :return: None
7 | '''
8 | X=[[1,2,3,4,5],
9 | [6,7,8,9,10],
10 | [10,9,8,7,6,],
11 | [5,4,3,2,1] ]
12 | print("before transform:",X)
13 | dct=DictionaryLearning(n_components=3)
14 | dct.fit(X)
15 | print("components is :",dct.components_)
16 | print("after transform:",dct.transform(X))
17 |
18 | if __name__=='__main__':
19 | test_DictionaryLearning()
--------------------------------------------------------------------------------
/11. Preprocessing/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Data preprocessing plays a very important role in data analysis. The data we collect is often dirty, that is, raw data, and the first step is to turn this raw data into structured data that can be processed in a uniform way.
4 |
5 | # Typical workflow
6 |
7 | Remove ID-like (unique-valued) attributes; handle missing values; encode categorical attributes; standardize and normalize the data; select features; apply principal component analysis / dimensionality reduction. A pipeline sketch covering several of these steps is given at the end of this note.
8 |
9 | # Example code: GitHub:
10 |
11 | Preprocessing:
12 |
13 | https://github.com/JasonK93/ML-note/tree/master/11.%20Preprocessing
14 |
15 |
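16 | A minimal sketch of wiring part of the workflow above into a Pipeline (not one of the repository scripts; the step names and parameter values are arbitrary):
17 |
18 | ```python
19 | from sklearn.datasets import load_iris
20 | from sklearn.pipeline import Pipeline
21 | from sklearn.preprocessing import StandardScaler
22 | from sklearn.feature_selection import SelectKBest, f_classif
23 | from sklearn.decomposition import PCA
24 | from sklearn.svm import LinearSVC
25 |
26 | # illustrative data; the chained steps below are only an example
27 | iris = load_iris()
28 | X, y = iris.data, iris.target
29 |
30 | # standardize -> keep the 3 best features -> project to 2 components -> classify
31 | pipe = Pipeline([
32 |     ("scale", StandardScaler()),
33 |     ("select", SelectKBest(score_func=f_classif, k=3)),
34 |     ("pca", PCA(n_components=2)),
35 |     ("clf", LinearSVC()),
36 | ])
37 | pipe.fit(X, y)
38 | print("pipeline training score:", pipe.score(X, y))
39 | ```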
--------------------------------------------------------------------------------
/12. Model evaluation/12.1 Loss function.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import zero_one_loss,log_loss
2 |
3 |
4 | def test_zero_one_loss():
5 | '''
6 | test 0-1 loss function
7 | :return: None
8 | '''
9 | y_true=[1,1,1,1,1,0,0,0,0,0]
10 | y_pred=[0,0,0,1,1,1,1,1,0,0]
11 | print("zero_one_loss:",zero_one_loss(y_true,y_pred,normalize=True))
12 | print("zero_one_loss:",zero_one_loss(y_true,y_pred,normalize=False))
13 | def test_log_loss():
14 | '''
15 | test log function
16 | :return: None
17 | '''
18 | y_true=[1, 1, 1, 0, 0, 0]
19 | y_pred=[[0.1, 0.9],
20 | [0.2, 0.8],
21 | [0.3, 0.7],
22 | [0.7, 0.3],
23 | [0.8, 0.2],
24 | [0.9, 0.1]]
25 | print("log_loss:",log_loss(y_true,y_pred,normalize=True))
26 | print("log_loss:",log_loss(y_true,y_pred,normalize=False))
27 |
28 | if __name__=="__main__":
29 | test_zero_one_loss()
30 | test_log_loss()
--------------------------------------------------------------------------------
/12. Model evaluation/12.2 data split.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,LeaveOneOut\
2 | ,cross_val_score
3 | import numpy as np
4 | def test_train_test_split():
5 | '''
6 | test train_test_split method
7 | :return: None
8 | '''
9 | X=[[1,2,3,4],
10 | [11,12,13,14],
11 | [21,22,23,24],
12 | [31,32,33,34],
13 | [41,42,43,44],
14 | [51,52,53,54],
15 | [61,62,63,64],
16 | [71,72,73,74]]
17 | y=[1,1,0,0,1,1,0,0]
18 |
19 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.4, random_state=0)
20 | print("X_train=",X_train)
21 | print("X_test=",X_test)
22 | print("y_train=",y_train)
23 | print("y_test=",y_test)
24 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.4,
25 | random_state=0,stratify=y)
26 | print("Stratify:X_train=",X_train)
27 | print("Stratify:X_test=",X_test)
28 | print("Stratify:y_train=",y_train)
29 | print("Stratify:y_test=",y_test)
30 | def test_KFold():
31 | '''
32 | test Kfold
33 | :return: None
34 | '''
35 | X=np.array([[1,2,3,4],
36 | [11,12,13,14],
37 | [21,22,23,24],
38 | [31,32,33,34],
39 | [41,42,43,44],
40 | [51,52,53,54],
41 | [61,62,63,64],
42 | [71,72,73,74],
43 | [81,82,83,84]])
44 | y=np.array([1,1,0,0,1,1,0,0,1])
45 |
46 | folder=KFold(n_splits=3,random_state=0,shuffle=False)
47 | for train_index,test_index in folder.split(X,y):
48 | print("Train Index:",train_index)
49 | print("Test Index:",test_index)
50 | print("X_train:",X[train_index])
51 | print("X_test:",X[test_index])
52 | print("")
53 |
54 | shuffle_folder=KFold(n_splits=3,random_state=0,shuffle=True)
55 | for train_index,test_index in shuffle_folder.split(X,y):
56 | print("Shuffled Train Index:",train_index)
57 | print("Shuffled Test Index:",test_index)
58 | print("Shuffled X_train:",X[train_index])
59 | print("Shuffled X_test:",X[test_index])
60 | print("")
61 | def test_StratifiedKFold():
62 | '''
63 | test Stratified Kfold
64 | :return: None
65 | '''
66 | X=np.array([[1,2,3,4],
67 | [11,12,13,14],
68 | [21,22,23,24],
69 | [31,32,33,34],
70 | [41,42,43,44],
71 | [51,52,53,54],
72 | [61,62,63,64],
73 | [71,72,73,74]])
74 |
75 | y=np.array([1,1,0,0,1,1,0,0])
76 |
77 | folder=KFold(n_splits=4,random_state=0,shuffle=False)
78 | stratified_folder=StratifiedKFold(n_splits=4,random_state=0,shuffle=False)
79 | for train_index,test_index in folder.split(X,y):
80 | print("Train Index:",train_index)
81 | print("Test Index:",test_index)
82 | print("y_train:",y[train_index])
83 | print("y_test:",y[test_index])
84 | print("")
85 |
86 | for train_index,test_index in stratified_folder.split(X,y):
87 | print("Stratified Train Index:",train_index)
88 | print("Stratified Test Index:",test_index)
89 | print("Stratified y_train:",y[train_index])
90 | print("Stratified y_test:",y[test_index])
91 | print("")
92 |
93 | def test_cross_val_score():
94 | '''
95 | test cross_val_score
96 | :return: None
97 | '''
98 | from sklearn.datasets import load_digits
99 | from sklearn.svm import LinearSVC
100 |
101 | digits=load_digits()
102 | X=digits.data
103 | y=digits.target
104 |
105 | result=cross_val_score(LinearSVC(),X,y,cv=10)
106 | print("Cross Val Score is:",result)
107 |
108 |
109 | if __name__=='__main__':
110 | test_train_test_split()
111 | test_KFold()
112 | test_StratifiedKFold()
113 | test_cross_val_score()
--------------------------------------------------------------------------------
/12. Model evaluation/12.3 validation_curve.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.datasets import load_digits
4 | from sklearn.svm import LinearSVC
5 | from sklearn.learning_curve import validation_curve
6 |
7 | def test_validation_curve():
8 | '''
9 |     test validation_curve with LinearSVC and different C
10 | :return: None
11 | '''
12 | digits = load_digits()
13 | X,y=digits.data,digits.target
14 | param_name="C"
15 | param_range = np.logspace(-2, 2)
16 | train_scores, test_scores = validation_curve(LinearSVC(), X, y, param_name=param_name,
17 | param_range=param_range,cv=10, scoring="accuracy")
18 |
19 | train_scores_mean = np.mean(train_scores, axis=1)
20 | train_scores_std = np.std(train_scores, axis=1)
21 | test_scores_mean = np.mean(test_scores, axis=1)
22 | test_scores_std = np.std(test_scores, axis=1)
23 |
24 | fig=plt.figure()
25 | ax=fig.add_subplot(1,1,1)
26 |
27 | ax.semilogx(param_range, train_scores_mean, label="Training Accuracy", color="r")
28 | ax.fill_between(param_range, train_scores_mean - train_scores_std,
29 | train_scores_mean + train_scores_std, alpha=0.2, color="r")
30 | ax.semilogx(param_range, test_scores_mean, label="Testing Accuracy", color="g")
31 | ax.fill_between(param_range, test_scores_mean - test_scores_std,
32 | test_scores_mean + test_scores_std, alpha=0.2, color="g")
33 |
34 | ax.set_title("Validation Curve with LinearSVC")
35 | ax.set_xlabel("C")
36 | ax.set_ylabel("Score")
37 | ax.set_ylim(0,1.1)
38 | ax.legend(loc='best')
39 | plt.show()
40 |
41 | if __name__=='__main__':
42 | test_validation_curve()
--------------------------------------------------------------------------------
/12. Model evaluation/12.4 grid_search.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_digits
2 | from sklearn.linear_model import LogisticRegression
3 | from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
4 | from sklearn.metrics import classification_report
5 | from sklearn.model_selection import train_test_split
6 | import scipy
7 |
8 | def test_GridSearchCV():
9 | '''
10 |     Use GridSearchCV with LogisticRegression to tune C, penalty and multi_class
11 | :return: None
12 | '''
13 | digits = load_digits()
14 | X_train,X_test,y_train,y_test=train_test_split(digits.data, digits.target,test_size=0.25,
15 | random_state=0,stratify=digits.target)
16 | tuned_parameters = [{'penalty': ['l1','l2'],
17 | 'C': [0.01,0.05,0.1,0.5,1,5,10,50,100],
18 | 'solver':['liblinear'],
19 | 'multi_class': ['ovr']},
20 |
21 | {'penalty': ['l2'],
22 | 'C': [0.01,0.05,0.1,0.5,1,5,10,50,100],
23 | 'solver':['lbfgs'],
24 | 'multi_class': ['ovr','multinomial']},
25 | ]
26 | clf=GridSearchCV(LogisticRegression(tol=1e-6),tuned_parameters,cv=10)
27 | clf.fit(X_train,y_train)
28 | print("Best parameters set found:",clf.best_params_)
29 | print("Grid scores:")
30 | for params, mean_score, scores in clf.grid_scores_:
31 | print("\t%0.3f (+/-%0.03f) for %s" % (mean_score, scores.std() * 2, params))
32 |
33 | print("Optimized Score:",clf.score(X_test,y_test))
34 | print("Detailed classification report:")
35 | y_true, y_pred = y_test, clf.predict(X_test)
36 | print(classification_report(y_true, y_pred))
37 |
38 | def test_RandomizedSearchCV():
39 |
40 | '''
41 |     Use RandomizedSearchCV with LogisticRegression to tune C and multi_class.
42 | :return: None
43 | '''
44 | digits = load_digits()
45 | X_train,X_test,y_train,y_test=train_test_split(digits.data, digits.target,
46 | test_size=0.25,random_state=0,stratify=digits.target)
47 |
48 | tuned_parameters ={ 'C': scipy.stats.expon(scale=100),
49 | 'multi_class': ['ovr','multinomial']}
50 | clf=RandomizedSearchCV(LogisticRegression(penalty='l2',solver='lbfgs',tol=1e-6),
51 | tuned_parameters,cv=10,scoring="accuracy",n_iter=100)
52 | clf.fit(X_train,y_train)
53 | print("Best parameters set found:",clf.best_params_)
54 | print("Randomized Grid scores:")
55 | for params, mean_score, scores in clf.grid_scores_:
56 | print("\t%0.3f (+/-%0.03f) for %s" % (mean_score, scores.std() * 2, params))
57 |
58 | print("Optimized Score:",clf.score(X_test,y_test))
59 | print("Detailed classification report:")
60 | y_true, y_pred = y_test, clf.predict(X_test)
61 | print(classification_report(y_true, y_pred))
62 |
63 | if __name__=='__main__':
64 | test_GridSearchCV()
65 | test_RandomizedSearchCV()
--------------------------------------------------------------------------------
/12. Model evaluation/12.5 classification_metrics.py:
--------------------------------------------------------------------------------
1 |
2 | from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score\
3 | ,fbeta_score,classification_report,confusion_matrix,precision_recall_curve,roc_auc_score\
4 | ,roc_curve
5 | from sklearn.datasets import load_iris
6 | from sklearn.multiclass import OneVsRestClassifier
7 | from sklearn.svm import SVC
8 | from sklearn.model_selection import train_test_split
9 | import matplotlib.pyplot as plt
10 | from sklearn.preprocessing import label_binarize
11 | import numpy as np
12 |
13 |
14 | def test_accuracy_score():
15 |
16 | y_true=[1,1,1,1,1,0,0,0,0,0]
17 | y_pred=[0,0,1,1,0,0,1,1,0,0]
18 | print('Accuracy Score(normalize=True):',accuracy_score(y_true,y_pred,normalize=True))
19 | print('Accuracy Score(normalize=False):',accuracy_score(y_true,y_pred,normalize=False))
20 |
21 | def test_precision_score():
22 |
23 | y_true=[1,1,1,1,1,0,0,0,0,0]
24 | y_pred=[0,0,1,1,0,0,0,0,0,0]
25 | print('Accuracy Score:',accuracy_score(y_true,y_pred,normalize=True))
26 | print('Precision Score:',precision_score(y_true,y_pred))
27 | def test_recall_score():
28 |
29 | y_true=[1,1,1,1,1,0,0,0,0,0]
30 | y_pred=[0,0,1,1,0,0,0,0,0,0]
31 | print('Accuracy Score:',accuracy_score(y_true,y_pred,normalize=True))
32 | print('Precision Score:',precision_score(y_true,y_pred))
33 | print('Recall Score:',recall_score(y_true,y_pred))
34 | def test_f1_score():
35 |
36 | y_true=[1,1,1,1,1,0,0,0,0,0]
37 | y_pred=[0,0,1,1,0,0,0,0,0,0]
38 | print('Accuracy Score:',accuracy_score(y_true,y_pred,normalize=True))
39 | print('Precision Score:',precision_score(y_true,y_pred))
40 | print('Recall Score:',recall_score(y_true,y_pred))
41 | print('F1 Score:',f1_score(y_true,y_pred))
42 | def test_fbeta_score():
43 |
44 | y_true=[1,1,1,1,1,0,0,0,0,0]
45 | y_pred=[0,0,1,1,0,0,0,0,0,0]
46 | print('Accuracy Score:',accuracy_score(y_true,y_pred,normalize=True))
47 | print('Precision Score:',precision_score(y_true,y_pred))
48 | print('Recall Score:',recall_score(y_true,y_pred))
49 | print('F1 Score:',f1_score(y_true,y_pred))
50 | print('Fbeta Score(beta=0.001):',fbeta_score(y_true,y_pred,beta=0.001))
51 | print('Fbeta Score(beta=1):',fbeta_score(y_true,y_pred,beta=1))
52 | print('Fbeta Score(beta=10):',fbeta_score(y_true,y_pred,beta=10))
53 | print('Fbeta Score(beta=10000):',fbeta_score(y_true,y_pred,beta=10000))
54 | def test_classification_report():
55 |
56 | y_true=[1,1,1,1,1,0,0,0,0,0]
57 | y_pred=[0,0,1,1,0,0,0,0,0,0]
58 | print('Classification Report:\n',classification_report(y_true,y_pred,
59 | target_names=["class_0","class_1"]))
60 | def test_confusion_matrix():
61 |
62 | y_true=[1,1,1,1,1,0,0,0,0,0]
63 | y_pred=[0,0,1,1,0,0,0,0,0,0]
64 | print('Confusion Matrix:\n',confusion_matrix(y_true,y_pred,labels=[0,1]))
65 | def test_precision_recall_curve():
66 |
67 | iris=load_iris()
68 | X=iris.data
69 | y=iris.target
70 |
71 | y = label_binarize(y, classes=[0, 1, 2])
72 | n_classes = y.shape[1]
73 |
74 | np.random.seed(0)
75 | n_samples, n_features = X.shape
76 | X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]
77 |
78 | X_train,X_test,y_train,y_test=train_test_split(X,y,
79 | test_size=0.5,random_state=0)
80 |
81 | clf=OneVsRestClassifier(SVC(kernel='linear', probability=True,random_state=0))
82 | clf.fit(X_train,y_train)
83 |     y_score = clf.decision_function(X_test)  # clf was already fitted above
84 |
85 | fig=plt.figure()
86 | ax=fig.add_subplot(1,1,1)
87 | precision = dict()
88 | recall = dict()
89 | for i in range(n_classes):
90 | precision[i], recall[i], _ = precision_recall_curve(y_test[:, i],
91 | y_score[:, i])
92 | ax.plot(recall[i],precision[i],label="target=%s"%i)
93 | ax.set_xlabel("Recall Score")
94 | ax.set_ylabel("Precision Score")
95 | ax.set_title("P-R")
96 | ax.legend(loc='best')
97 | ax.set_xlim(0,1.1)
98 | ax.set_ylim(0,1.1)
99 | ax.grid()
100 | plt.show()
101 | def test_roc_auc_score():
102 |
103 | iris=load_iris()
104 | X=iris.data
105 | y=iris.target
106 |
107 | y = label_binarize(y, classes=[0, 1, 2])
108 | n_classes = y.shape[1]
109 |
110 | np.random.seed(0)
111 | n_samples, n_features = X.shape
112 | X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]
113 |
114 | X_train,X_test,y_train,y_test=train_test_split(X,y,
115 | test_size=0.5,random_state=0)
116 |
117 | clf=OneVsRestClassifier(SVC(kernel='linear', probability=True,random_state=0))
118 | clf.fit(X_train,y_train)
119 |     y_score = clf.decision_function(X_test)  # clf was already fitted above
120 |
121 | fig=plt.figure()
122 | ax=fig.add_subplot(1,1,1)
123 | fpr = dict()
124 | tpr = dict()
125 | roc_auc=dict()
126 | for i in range(n_classes):
127 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i],y_score[:, i])
128 |         roc_auc[i] = roc_auc_score(y_test[:, i], y_score[:, i])
129 | ax.plot(fpr[i],tpr[i],label="target=%s,auc=%s"%(i,roc_auc[i]))
130 | ax.plot([0, 1], [0, 1], 'k--')
131 | ax.set_xlabel("FPR")
132 | ax.set_ylabel("TPR")
133 | ax.set_title("ROC")
134 | ax.legend(loc="best")
135 | ax.set_xlim(0,1.1)
136 | ax.set_ylim(0,1.1)
137 | ax.grid()
138 | plt.show()
139 |
140 | if __name__=='__main__':
141 | test_accuracy_score()
142 | test_precision_score()
143 | test_recall_score()
144 | test_f1_score()
145 | test_fbeta_score()
146 | test_classification_report()
147 | test_confusion_matrix()
148 | test_precision_recall_curve()
149 | # test_roc_auc_score()
--------------------------------------------------------------------------------
/12. Model evaluation/12.6 learning curve.py:
--------------------------------------------------------------------------------
1 |
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | from sklearn.datasets import load_digits
5 | from sklearn.svm import LinearSVC
6 | from sklearn.learning_curve import learning_curve
7 |
8 | def test_learning_curve():
9 |
10 | digits = load_digits()
11 | X,y=digits.data,digits.target
12 |
13 | train_sizes=np.linspace(0.1,1.0,endpoint=True,dtype='float')
14 | abs_trains_sizes,train_scores, test_scores = learning_curve(LinearSVC(),
15 | X, y,cv=10, scoring="accuracy",train_sizes=train_sizes)
16 |
17 | train_scores_mean = np.mean(train_scores, axis=1)
18 | train_scores_std = np.std(train_scores, axis=1)
19 | test_scores_mean = np.mean(test_scores, axis=1)
20 | test_scores_std = np.std(test_scores, axis=1)
21 |
22 | fig=plt.figure()
23 | ax=fig.add_subplot(1,1,1)
24 |
25 | ax.plot(abs_trains_sizes, train_scores_mean, label="Training Accuracy", color="r")
26 | ax.fill_between(abs_trains_sizes, train_scores_mean - train_scores_std,
27 | train_scores_mean + train_scores_std, alpha=0.2, color="r")
28 | ax.plot(abs_trains_sizes, test_scores_mean, label="Testing Accuracy", color="g")
29 | ax.fill_between(abs_trains_sizes, test_scores_mean - test_scores_std,
30 | test_scores_mean + test_scores_std, alpha=0.2, color="g")
31 |
32 | ax.set_title("Learning Curve with LinearSVC")
33 | ax.set_xlabel("Sample Nums")
34 | ax.set_ylabel("Score")
35 | ax.set_ylim(0,1.1)
36 | ax.legend(loc='best')
37 | plt.show()
38 |
39 | if __name__=="__main__":
40 | test_learning_curve()
--------------------------------------------------------------------------------
/12. Model evaluation/12.7 regression_metrics.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import mean_absolute_error,mean_squared_error
2 |
3 | def test_mean_absolute_error():
4 |
5 | y_true=[1,1,1,1,1,2,2,2,0,0]
6 | y_pred=[0,0,0,1,1,1,0,0,0,0]
7 |
8 | print("Mean Absolute Error:",mean_absolute_error(y_true,y_pred))
9 | def test_mean_squared_error():
10 |
11 | y_true=[1,1,1,1,1,2,2,2,0,0]
12 | y_pred=[0,0,0,1,1,1,0,0,0,0]
13 |
14 | print("Mean Absolute Error:",mean_absolute_error(y_true,y_pred))
15 | print("Mean Square Error:",mean_squared_error(y_true,y_pred))
16 |
17 | if __name__=="__main__":
18 | test_mean_absolute_error()
19 | test_mean_squared_error()
--------------------------------------------------------------------------------
/12. Model evaluation/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Model evaluation plays a very important role in machine learning: it is how we tell which predictive models are good. Machine learning work generally has two phases, prototyping and application. In the prototyping phase, validation and offline evaluation are used to select a good model. Evaluation methods include online evaluation and offline evaluation; online evaluation generally uses newly generated data during the application phase to evaluate and update the model.
4 |
5 | # Offline and online evaluation
6 |
7 | Offline evaluation usually relies on accuracy, precision and recall. Online evaluation uses business metrics such as Customer Lifetime Value, Click-Through Rate and Customer Churn Rate.
8 |
9 | # Loss functions
10 |
11 | A loss function measures the degree of error. Commonly used ones are the 0-1 loss, the squared loss, the absolute loss and the log loss. The risk function is defined as the expectation of the loss, so the learning objective can also be stated as finding the model with minimal risk.
12 |
13 | # Model evaluation
14 |
15 | The measured quantities are the training error and the test error. From these two we can infer whether the model is overfitting or underfitting. Common evaluation methods: 1. hold-out, i.e. a three-way split into train, validation and test data; 2. cross-validation; 3. leave-one-out; 4. bootstrapping.
16 |
17 | # Performance metrics
18 |
19 | Accuracy, error rate, confusion matrix, precision, recall, the P-R curve (Precision-Recall curve; if one curve completely encloses another, the enclosing curve indicates the better model) and the ROC curve. A small sketch computing precision, recall and F1 by hand is given at the end of this note.
20 |
21 | Example code: GitHub:
22 |
23 | https://github.com/JasonK93/ML-note/tree/master/12.%20Model%20evaluation
24 |
25 |
26 |
27 |
28 |
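29 | To make the definitions above concrete, here is a minimal sketch (not one of the repository scripts) that computes precision, recall and F1 by hand from the confusion matrix and checks them against sklearn.metrics:
30 |
31 | ```python
32 | from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
33 |
34 | y_true = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
35 | y_pred = [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
36 |
37 | # rows are true classes, columns are predicted classes (label order 0, 1)
38 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
39 | precision = tp / (tp + fp)   # of the samples predicted positive, the fraction truly positive
40 | recall = tp / (tp + fn)      # of the truly positive samples, the fraction that is found
41 | f1 = 2 * precision * recall / (precision + recall)
42 |
43 | print("by hand: precision={0:.3f} recall={1:.3f} f1={2:.3f}".format(precision, recall, f1))
44 | print("sklearn: precision={0:.3f} recall={1:.3f} f1={2:.3f}".format(
45 |     precision_score(y_true, y_pred), recall_score(y_true, y_pred), f1_score(y_true, y_pred)))
46 | ```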
--------------------------------------------------------------------------------
/2.decision tree(DT)/2.1 Decision Tree-Classifier.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | from sklearn.tree import DecisionTreeClassifier
5 | from sklearn import datasets
6 | from sklearn import cross_validation
7 | import matplotlib.pyplot as plt
8 | def load_data():
9 | '''
10 |     load the iris data from scikit-learn. This dataset has 150 samples and 3 classes.
11 |     :return:
12 |     the split for the classification problem:
13 |     train_data, test_data, train_value, test_value
14 | '''
15 | iris=datasets.load_iris()
16 | X_train=iris.data
17 | y_train=iris.target
18 | return cross_validation.train_test_split(X_train, y_train,test_size=0.25,
19 | random_state=0,stratify=y_train)
20 | def test_DecisionTreeClassifier(*data):
21 | '''
22 | test decision tree
23 | :param data: train_data, test_data, train_value, test_value
24 | :return: None
25 | '''
26 | X_train,X_test,y_train,y_test=data
27 | clf = DecisionTreeClassifier()
28 | clf.fit(X_train, y_train)
29 |
30 | print("Training score: {0}".format(clf.score(X_train,y_train)))
31 | print("Testing score: {0}".format(clf.score(X_test,y_test)))
32 | def test_DecisionTreeClassifier_criterion(*data):
33 | '''
34 | test the performance with different criterion
35 | :param data: train_data, test_data, train_value, test_value
36 | :return: None
37 | '''
38 | X_train,X_test,y_train,y_test=data
39 | criterions=['gini','entropy']
40 | for criterion in criterions:
41 | clf = DecisionTreeClassifier(criterion=criterion)
42 | clf.fit(X_train, y_train)
43 | print("criterion:{0}".format(criterion))
44 | print("Training score: {0}".format(clf.score(X_train,y_train)))
45 | print("Testing score: {0}".format(clf.score(X_test,y_test)))
46 | def test_DecisionTreeClassifier_splitter(*data):
47 | '''
48 | test the performance with different splitters
49 | :param data: train_data, test_data, train_value, test_value
50 | :return: None
51 | '''
52 | X_train,X_test,y_train,y_test=data
53 | splitters=['best','random']
54 | for splitter in splitters:
55 | clf = DecisionTreeClassifier(splitter=splitter)
56 | clf.fit(X_train, y_train)
57 | print("splitter: {0}".format(splitter))
58 | print("Training score:{0}".format(clf.score(X_train,y_train)))
59 | print("Testing score: {0}".format(clf.score(X_test,y_test)))
60 | def test_DecisionTreeClassifier_depth(*data,maxdepth):
61 | '''
62 | test the score with different max_depth
63 | :param data: train_data, test_data, train_value, test_value
64 | :param maxdepth: an integer
65 | :return: None
66 | '''
67 | X_train,X_test,y_train,y_test=data
68 | depths=np.arange(1,maxdepth)
69 | training_scores=[]
70 | testing_scores=[]
71 | for depth in depths:
72 | clf = DecisionTreeClassifier(max_depth=depth)
73 | clf.fit(X_train, y_train)
74 | training_scores.append(clf.score(X_train,y_train))
75 | testing_scores.append(clf.score(X_test,y_test))
76 |
77 | ## graph
78 | fig=plt.figure()
79 | ax=fig.add_subplot(1,1,1)
80 |     ax.plot(depths,training_scores,label="training score",marker='o')
81 | ax.plot(depths,testing_scores,label="testing score",marker='*')
82 | ax.set_xlabel("maxdepth")
83 | ax.set_ylabel("score")
84 | ax.set_title("Decision Tree Classification")
85 | ax.legend(framealpha=0.5,loc='best')
86 | plt.show()
87 | if __name__=='__main__':
88 | X_train,X_test,y_train,y_test=load_data()
89 | test_DecisionTreeClassifier(X_train,X_test,y_train,y_test)
90 | test_DecisionTreeClassifier_criterion(X_train,X_test,y_train,y_test)
91 | test_DecisionTreeClassifier_splitter(X_train,X_test,y_train,y_test)
92 | test_DecisionTreeClassifier_depth(X_train,X_test,y_train,y_test,maxdepth=100)
--------------------------------------------------------------------------------
/2.decision tree(DT)/2.2 Decision Tree- Regression.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import numpy as np
4 | from sklearn.tree import DecisionTreeRegressor
5 | from sklearn import cross_validation
6 | import matplotlib.pyplot as plt
7 | def creat_data(n):
8 | '''
9 |     generate random regression data
10 |     :param n: the number of samples
11 | :return: train_data, test_data, train_value, test_value
12 | '''
13 | np.random.seed(0)
14 | X = 5 * np.random.rand(n, 1)
15 | y = np.sin(X).ravel()
16 | noise_num=(int)(n/5)
17 | y[::5] += 3 * (0.5 - np.random.rand(noise_num)) # add noise every 5 sample
18 | return cross_validation.train_test_split(X, y,
19 | test_size=0.25,random_state=1)
20 | def test_DecisionTreeRegressor(*data):
21 | '''
22 | test DT regression
23 | :param data: train_data, test_data, train_value, test_value
24 | :return: None
25 | '''
26 | X_train,X_test,y_train,y_test=data
27 | regr = DecisionTreeRegressor()
28 | regr.fit(X_train, y_train)
29 | print("Training score:{0}".format(regr.score(X_train,y_train)))
30 | print("Testing score:{0}".format(regr.score(X_test,y_test)))
31 | ##graph
32 | fig=plt.figure()
33 | ax=fig.add_subplot(1,1,1)
34 | X = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
35 | Y = regr.predict(X)
36 | ax.scatter(X_train, y_train, label="train sample",c='g')
37 | ax.scatter(X_test, y_test, label="test sample",c='r')
38 | ax.plot(X, Y, label="predict_value", linewidth=2,alpha=0.5)
39 | ax.set_xlabel("data")
40 | ax.set_ylabel("target")
41 | ax.set_title("Decision Tree Regression")
42 | ax.legend(framealpha=0.5)
43 | plt.show()
44 | def test_DecisionTreeRegressor_splitter(*data):
45 | '''
46 | test the performance with different splitters
47 | :param data: train_data, test_data, train_value, test_value
48 | :return: None
49 | '''
50 | X_train,X_test,y_train,y_test=data
51 | splitters=['best','random']
52 | for splitter in splitters:
53 | regr = DecisionTreeRegressor(splitter=splitter)
54 | regr.fit(X_train, y_train)
55 | print("Splitter {0}".format(splitter))
56 | print("Training score:{0}".format(regr.score(X_train,y_train)))
57 | print("Testing score:{0}".format(regr.score(X_test,y_test)))
58 | def test_DecisionTreeRegressor_depth(*data,maxdepth):
59 | '''
60 | test the score with different max_depth
61 | :param data: train_data, test_data, train_value, test_value
62 | :param maxdepth: an integer
63 | :return: None
64 | '''
65 | X_train,X_test,y_train,y_test=data
66 | depths=np.arange(1,maxdepth)
67 | training_scores=[]
68 | testing_scores=[]
69 | for depth in depths:
70 | regr = DecisionTreeRegressor(max_depth=depth)
71 | regr.fit(X_train, y_train)
72 | training_scores.append(regr.score(X_train,y_train))
73 | testing_scores.append(regr.score(X_test,y_test))
74 |
75 | ## graph
76 | fig=plt.figure()
77 | ax=fig.add_subplot(1,1,1)
78 |     ax.plot(depths,training_scores,label="training score")
79 | ax.plot(depths,testing_scores,label="testing score")
80 | ax.set_xlabel("maxdepth")
81 | ax.set_ylabel("score")
82 | ax.set_title("Decision Tree Regression")
83 | ax.legend(framealpha=0.5)
84 | plt.show()
85 | if __name__=='__main__':
86 | X_train,X_test,y_train,y_test=creat_data(100)
87 | test_DecisionTreeRegressor(X_train,X_test,y_train,y_test)
88 | test_DecisionTreeRegressor_splitter(X_train,X_test,y_train,y_test)
89 | test_DecisionTreeRegressor_depth(X_train,X_test,y_train,y_test,maxdepth=20)
90 |
--------------------------------------------------------------------------------
/2.decision tree(DT)/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Introduction
3 |
4 | Decision trees are a powerful supervised learning method. They can be used to solve both regression and classification problems.
5 |
6 | # Principle
7 |
8 | A decision tree performs recursive binary splits of the feature space. It consists of nodes and directed edges.
9 |
10 | # Steps
11 |
12 | Feature selection; tree generation; tree pruning.
13 |
14 | Feature selection is based on criteria such as entropy, the Gini index and variance. There are many algorithms for growing a tree; the classical ones are ID3 and C4.5. ID3 uses information gain as its criterion, while C4.5 uses the information gain ratio. Pruning simplifies the model and to some extent reduces overfitting; it is also a trade-off between prediction error and model complexity.
15 |
16 | # Example code: GitHub
17 |
18 | 1. Decision tree classification:
19 |
20 | https://github.com/JasonK93/ML-note/blob/master/2.decision%20tree(DT)/2.1%20Decision%20Tree-Classifier.py
21 |
22 | 2. Decision tree regression:
23 |
24 | https://github.com/JasonK93/ML-note/blob/master/2.decision%20tree(DT)/2.2%20Decision%20Tree-%20Regression.py
25 |
26 | # Visualizing the tree
27 |
28 | Once a tree has been grown, its decision rules can be visualized with the export_graphviz() function, as sketched below.
29 |
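30 | A minimal sketch of exporting the rules (not one of the repository scripts; rendering the .dot file to an image needs the separate Graphviz `dot` tool):
31 |
32 | ```python
33 | from sklearn.datasets import load_iris
34 | from sklearn.tree import DecisionTreeClassifier, export_graphviz
35 |
36 | iris = load_iris()
37 | clf = DecisionTreeClassifier(max_depth=3)  # max_depth chosen only to keep the graph small
38 | clf.fit(iris.data, iris.target)
39 |
40 | # write the tree in Graphviz .dot format; convert it with e.g.
41 | #   dot -Tpng iris_tree.dot -o iris_tree.png
42 | export_graphviz(clf, out_file="iris_tree.dot",
43 |                 feature_names=iris.feature_names,
44 |                 class_names=list(iris.target_names),
45 |                 filled=True)
46 | ```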
--------------------------------------------------------------------------------
/3.Bayes/3.1 Gaussian Bayes.py:
--------------------------------------------------------------------------------
1 | from sklearn import datasets,cross_validation,naive_bayes
2 | import matplotlib.pyplot as plt
3 |
4 | def load_data():
5 | '''
6 |     load the digits dataset from sklearn
7 | :return: train_data, test_data, train_value, test_value
8 | '''
9 | digits=datasets.load_digits()
10 | return cross_validation.train_test_split(digits.data,digits.target,
11 | test_size=0.25,random_state=0,stratify=digits.target)
12 |
13 | def test_GaussianNB(*data):
14 | '''
15 | Test Gaussian NB
16 | :param data: train_data, test_data, train_value, test_value
17 | :return: None
18 | '''
19 | X_train,X_test,y_train,y_test=data
20 | cls=naive_bayes.GaussianNB()
21 | cls.fit(X_train,y_train)
22 | print('Training Score: {0}' .format( cls.score(X_train,y_train)))
23 | print('Testing Score: {0}' .format( cls.score(X_test, y_test)))
24 |
25 | def show_digits():
26 | '''
27 | graph the first 25 samples in the data set
28 | :return: None
29 | '''
30 | digits=datasets.load_digits()
31 | fig=plt.figure()
32 | print("vector from images 0:",digits.data[0])
33 | for i in range(25):
34 | ax=fig.add_subplot(5,5,i+1)
35 | ax.imshow(digits.images[i],cmap=plt.cm.gray_r, interpolation='nearest')
36 | plt.show()
37 |
38 | if __name__=='__main__':
39 | show_digits()
40 | X_train,X_test,y_train,y_test=load_data()
41 | test_GaussianNB(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/3.Bayes/3.2 Multinomial NB.py:
--------------------------------------------------------------------------------
1 | from sklearn import datasets,cross_validation,naive_bayes
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 | def load_data():
6 | '''
7 |     load the digits dataset from sklearn
8 | :return: train_data, test_data, train_value, test_value
9 | '''
10 | digits=datasets.load_digits()
11 | return cross_validation.train_test_split(digits.data,digits.target,
12 | test_size=0.25,random_state=0,stratify=digits.target)
13 |
14 | def test_MultinomialNB(*data):
15 | '''
16 | test Multinomial NB
17 | :param data: train_data, test_data, train_value, test_value
18 | :return: None
19 | '''
20 | X_train,X_test,y_train,y_test=data
21 | cls=naive_bayes.MultinomialNB()
22 | cls.fit(X_train,y_train)
23 | print('Training Score: {0}' .format( cls.score(X_train,y_train)))
24 | print('Testing Score: {0}'.format(cls.score(X_test, y_test)))
25 | def test_MultinomialNB_alpha(*data):
26 | '''
27 | test the performance with different alpha
28 | :param data: train_data, test_data, train_value, test_value
29 | :return: None
30 | '''
31 | X_train,X_test,y_train,y_test=data
32 | alphas=np.logspace(-2,5,num=200)
33 | train_scores=[]
34 | test_scores=[]
35 | for alpha in alphas:
36 | cls=naive_bayes.MultinomialNB(alpha=alpha)
37 | cls.fit(X_train,y_train)
38 | train_scores.append(cls.score(X_train,y_train))
39 | test_scores.append(cls.score(X_test, y_test))
40 |
41 | ## graph
42 | fig=plt.figure()
43 | ax=fig.add_subplot(1,1,1)
44 | ax.plot(alphas,train_scores,label="Training Score")
45 | ax.plot(alphas,test_scores,label="Testing Score")
46 | ax.set_xlabel(r"$\alpha$")
47 | ax.set_ylabel("score")
48 | ax.set_ylim(0,1.0)
49 | ax.set_title("MultinomialNB")
50 | ax.set_xscale("log")
51 | plt.show()
52 | def show_digits():
53 | '''
54 | graph the first 25 samples in the data set
55 | :return: None
56 | '''
57 | digits=datasets.load_digits()
58 | fig=plt.figure()
59 | print("vector from images 0:",digits.data[0])
60 | for i in range(25):
61 | ax=fig.add_subplot(5,5,i+1)
62 | ax.imshow(digits.images[i],cmap=plt.cm.gray_r, interpolation='nearest')
63 | plt.show()
64 |
65 | if __name__=='__main__':
66 | show_digits()
67 | X_train, X_test, y_train, y_test = load_data()
68 | test_MultinomialNB(X_train, X_test, y_train, y_test)
69 | test_MultinomialNB_alpha(X_train, X_test, y_train, y_test)
70 |
--------------------------------------------------------------------------------
/3.Bayes/3.3 Bernoulli NB.py:
--------------------------------------------------------------------------------
1 | from sklearn import datasets,cross_validation,naive_bayes
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 | def load_data():
6 | '''
7 |     load the digits dataset from sklearn
8 | :return: train_data, test_data, train_value, test_value
9 | '''
10 | digits=datasets.load_digits()
11 | return cross_validation.train_test_split(digits.data,digits.target,
12 | test_size=0.25,random_state=0,stratify=digits.target)
13 |
14 | def test_BernoulliNB(*data):
15 | '''
16 | test BernoulliNB
17 | :param data: train_data, test_data, train_value, test_value
18 | :return: None
19 | '''
20 | X_train,X_test,y_train,y_test=data
21 | cls=naive_bayes.BernoulliNB()
22 | cls.fit(X_train,y_train)
23 | print('Training Score: {0}'.format(cls.score(X_train,y_train)))
24 | print('Testing Score: {0}'.format(cls.score(X_test, y_test)))
25 | def test_BernoulliNB_alpha(*data):
26 | '''
27 | test the performance with different alpha
28 | :param data: train_data, test_data, train_value, test_value
29 | :return: None
30 | '''
31 | X_train,X_test,y_train,y_test=data
32 | alphas=np.logspace(-2,5,num=200)
33 | train_scores=[]
34 | test_scores=[]
35 | for alpha in alphas:
36 | cls=naive_bayes.BernoulliNB(alpha=alpha)
37 | cls.fit(X_train,y_train)
38 | train_scores.append(cls.score(X_train,y_train))
39 | test_scores.append(cls.score(X_test, y_test))
40 |
41 | ## graph
42 | fig=plt.figure()
43 | ax=fig.add_subplot(1,1,1)
44 | ax.plot(alphas,train_scores,label="Training Score")
45 | ax.plot(alphas,test_scores,label="Testing Score")
46 | ax.set_xlabel(r"$\alpha$")
47 | ax.set_ylabel("score")
48 | ax.set_ylim(0,1.0)
49 | ax.set_title("BernoulliNB")
50 | ax.set_xscale("log")
51 | ax.legend(loc="best")
52 | plt.show()
53 | def test_BernoulliNB_binarize(*data):
54 | '''
55 | test the performance with different binarize
56 | :param data: train_data, test_data, train_value, test_value
57 | :return: None
58 | '''
59 | X_train,X_test,y_train,y_test=data
60 | min_x=min(np.min(X_train.ravel()),np.min(X_test.ravel()))-0.1
61 | max_x=max(np.max(X_train.ravel()),np.max(X_test.ravel()))+0.1
62 | binarizes=np.linspace(min_x,max_x,endpoint=True,num=100)
63 | train_scores=[]
64 | test_scores=[]
65 | for binarize in binarizes:
66 | cls=naive_bayes.BernoulliNB(binarize=binarize)
67 | cls.fit(X_train,y_train)
68 | train_scores.append(cls.score(X_train,y_train))
69 | test_scores.append(cls.score(X_test, y_test))
70 |
71 | ## graph
72 | fig=plt.figure()
73 | ax=fig.add_subplot(1,1,1)
74 | ax.plot(binarizes,train_scores,label="Training Score")
75 | ax.plot(binarizes,test_scores,label="Testing Score")
76 | ax.set_xlabel("binarize")
77 | ax.set_ylabel("score")
78 | ax.set_ylim(0,1.0)
79 | ax.set_xlim(min_x-1,max_x+1)
80 | ax.set_title("BernoulliNB")
81 | ax.legend(loc="best")
82 | plt.show()
83 | def show_digits():
84 | '''
85 | graph the first 25 samples in the data set
86 | :return: None
87 | '''
88 | digits=datasets.load_digits()
89 | fig=plt.figure()
90 | print("vector from images 0:",digits.data[0])
91 | for i in range(25):
92 | ax=fig.add_subplot(5,5,i+1)
93 | ax.imshow(digits.images[i],cmap=plt.cm.gray_r, interpolation='nearest')
94 | plt.show()
95 |
96 | if __name__=='__main__':
97 | show_digits()
98 | X_train, X_test, y_train, y_test = load_data()
99 | test_BernoulliNB(X_train, X_test, y_train, y_test)
100 | test_BernoulliNB_alpha(X_train, X_test, y_train, y_test)
101 | test_BernoulliNB_binarize(X_train, X_test, y_train, y_test)
--------------------------------------------------------------------------------
/3.Bayes/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Overview
3 |
4 | The principle of Bayesian classification: starting from the prior probability of an object, the posterior probability is computed with Bayes' theorem, and the class with the largest posterior probability is chosen as the prediction.
5 |
6 | # Classifiers
7 |
8 | 1. Gaussian naive Bayes (GaussianNB): the conditional probability distributions are assumed to be Gaussian
9 |
10 | https://github.com/JasonK93/ML-note/blob/master/3.Bayes/3.1%20Gaussian%20Bayes.py
11 |
12 | 2. Multinomial naive Bayes (MultinomialNB): the conditional probabilities follow a multinomial distribution
13 |
14 | https://github.com/JasonK93/ML-note/blob/master/3.Bayes/3.2%20Multinomial%20NB.py
15 |
16 | 3. Bernoulli naive Bayes (BernoulliNB): the conditional probabilities follow a Bernoulli distribution
17 |
18 | https://github.com/JasonK93/ML-note/blob/master/3.Bayes/3.3%20Bernoulli%20NB.py
19 |
20 | # Partial_fit
21 |
22 | Naive Bayes can handle large-scale data. When the full training set does not fit in memory, data can be added incrementally, in the style of an online classifier: a large dataset is split into several chunks and the model is trained chunk by chunk, as sketched below.
23 |
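24 | A minimal sketch of this chunked training (not one of the repository scripts; the number of chunks is arbitrary):
25 |
26 | ```python
27 | import numpy as np
28 | from sklearn.datasets import load_digits
29 | from sklearn.naive_bayes import MultinomialNB
30 |
31 | digits = load_digits()
32 | X, y = digits.data, digits.target
33 |
34 | cls = MultinomialNB()
35 | classes = np.unique(y)  # the full set of classes must be known from the first call on
36 | for X_chunk, y_chunk in zip(np.array_split(X, 10), np.array_split(y, 10)):
37 |     cls.partial_fit(X_chunk, y_chunk, classes=classes)
38 |
39 | print("Training Score: {0}".format(cls.score(X, y)))
40 | ```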
--------------------------------------------------------------------------------
/4. KNN/4.1 KNN classification.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import neighbors, datasets,cross_validation
4 |
5 | def load_classification_data():
6 | '''
7 | load the digit data
8 | :return: train_data, test_data, train_value, test_value
9 | '''
10 | digits=datasets.load_digits()
11 | X_train=digits.data
12 | y_train=digits.target
13 | return cross_validation.train_test_split(X_train, y_train,test_size=0.25,
14 | random_state=0,stratify=y_train)
15 | def test_KNeighborsClassifier(*data):
16 | '''
17 | test KNN classifier
18 | :param data: train_data, test_data, train_value, test_value
19 | :return: None
20 | '''
21 | X_train,X_test,y_train,y_test=data
22 | clf=neighbors.KNeighborsClassifier()
23 | clf.fit(X_train,y_train)
24 | print("Training Score:{0}".format(clf.score(X_train,y_train)))
25 | print("Testing Score:{0}".format(clf.score(X_test,y_test)))
26 | def test_KNeighborsClassifier_k_w(*data):
27 | '''
28 | test the performance with different n_neighbors and weights
29 | :param data: train_data, test_data, train_value, test_value
30 | :return: None
31 | '''
32 | X_train,X_test,y_train,y_test=data
33 | Ks=np.linspace(1,y_train.size,num=100,endpoint=False,dtype='int')
34 | weights=['uniform','distance']
35 |
36 | fig=plt.figure()
37 | ax=fig.add_subplot(1,1,1)
38 | ### graph
39 | for weight in weights:
40 | training_scores=[]
41 | testing_scores=[]
42 | for K in Ks:
43 | clf=neighbors.KNeighborsClassifier(weights=weight,n_neighbors=K)
44 | clf.fit(X_train,y_train)
45 | testing_scores.append(clf.score(X_test,y_test))
46 | training_scores.append(clf.score(X_train,y_train))
47 | ax.plot(Ks,testing_scores,label="testing score:weight={0}".format(weight))
48 | ax.plot(Ks,training_scores,label="training score:weight={0}".format(weight))
49 | ax.legend(loc='best')
50 | ax.set_xlabel("K")
51 | ax.set_ylabel("score")
52 | ax.set_ylim(0,1.05)
53 | ax.set_title("KNeighborsClassifier")
54 | plt.show()
55 | def test_KNeighborsClassifier_k_p(*data):
56 | '''
57 | test the performance with different n_neighbors and p
58 | :param data: train_data, test_data, train_value, test_value
59 | :return: None
60 | '''
61 | X_train,X_test,y_train,y_test=data
62 | Ks=np.linspace(1,y_train.size,endpoint=False,dtype='int')
63 | Ps=[1,2,10]
64 |
65 | fig=plt.figure()
66 | ax=fig.add_subplot(1,1,1)
67 | ### graph
68 | for P in Ps:
69 | training_scores=[]
70 | testing_scores=[]
71 | for K in Ks:
72 | clf=neighbors.KNeighborsClassifier(p=P,n_neighbors=K)
73 | clf.fit(X_train,y_train)
74 | testing_scores.append(clf.score(X_test,y_test))
75 | training_scores.append(clf.score(X_train,y_train))
76 | ax.plot(Ks,testing_scores,label="testing score:p={0}".format(P))
77 | ax.plot(Ks,training_scores,label="training score:p={0}".format(P))
78 | ax.legend(loc='best')
79 | ax.set_xlabel("K")
80 | ax.set_ylabel("score")
81 | ax.set_ylim(0,1.05)
82 | ax.set_title("KNeighborsClassifier")
83 | plt.show()
84 |
85 | if __name__=='__main__':
86 | X_train,X_test,y_train,y_test=load_classification_data()
87 | test_KNeighborsClassifier(X_train,X_test,y_train,y_test)
88 | test_KNeighborsClassifier_k_w(X_train,X_test,y_train,y_test)
89 | test_KNeighborsClassifier_k_p(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/4. KNN/4.2 KNN regressor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import neighbors, cross_validation
4 |
5 | def create_regression_data(n):
6 | '''
7 |     generate random regression data
8 |     :param n: the number of samples
9 | :return: train_data, test_data, train_value, test_value
10 | '''
11 | X =5 * np.random.rand(n, 1)
12 | y = np.sin(X).ravel()
13 | y[::5] += 1 * (0.5 - np.random.rand(int(n/5)))
14 | return cross_validation.train_test_split(X, y,test_size=0.25,random_state=0)
15 |
16 | def test_KNeighborsRegressor(*data):
17 | '''
18 | test the KNN regressor
19 | :param data: train_data, test_data, train_value, test_value
20 | :return: None
21 | '''
22 | X_train,X_test,y_train,y_test=data
23 | regr=neighbors.KNeighborsRegressor()
24 | regr.fit(X_train,y_train)
25 | print("Training Score:{0}".format(regr.score(X_train,y_train)))
26 | print("Testing Score:{0}".format(regr.score(X_test,y_test)))
27 | def test_KNeighborsRegressor_k_w(*data):
28 | '''
29 | test the performance with different n_neighbors and weights
30 | :param data: train_data, test_data, train_value, test_value
31 | :return: None
32 | '''
33 | X_train,X_test,y_train,y_test=data
34 | Ks=np.linspace(1,y_train.size,num=100,endpoint=False,dtype='int')
35 | weights=['uniform','distance']
36 |
37 | fig=plt.figure()
38 | ax=fig.add_subplot(1,1,1)
39 | ### graph
40 | for weight in weights:
41 | training_scores=[]
42 | testing_scores=[]
43 | for K in Ks:
44 | regr=neighbors.KNeighborsRegressor(weights=weight,n_neighbors=K)
45 | regr.fit(X_train,y_train)
46 | testing_scores.append(regr.score(X_test,y_test))
47 | training_scores.append(regr.score(X_train,y_train))
48 | ax.plot(Ks,testing_scores,label="testing score:weight={0}".format(weight))
49 | ax.plot(Ks,training_scores,label="training score:weight={0}".format(weight))
50 | ax.legend(loc='best')
51 | ax.set_xlabel("K")
52 | ax.set_ylabel("score")
53 | ax.set_ylim(0,1.05)
54 | ax.set_title("KNeighborsRegressor")
55 | plt.show()
56 | def test_KNeighborsRegressor_k_p(*data):
57 | '''
58 | test the performance with different n_neighbors and p
59 | :param data: train_data, test_data, train_value, test_value
60 | :return: None
61 | '''
62 | X_train,X_test,y_train,y_test=data
63 | Ks=np.linspace(1,y_train.size,endpoint=False,dtype='int')
64 | Ps=[1,2,10]
65 |
66 | fig=plt.figure()
67 | ax=fig.add_subplot(1,1,1)
68 | ### graph
69 | for P in Ps:
70 | training_scores=[]
71 | testing_scores=[]
72 | for K in Ks:
73 | regr=neighbors.KNeighborsRegressor(p=P,n_neighbors=K)
74 | regr.fit(X_train,y_train)
75 | testing_scores.append(regr.score(X_test,y_test))
76 | training_scores.append(regr.score(X_train,y_train))
77 | ax.plot(Ks,testing_scores,label="testing score:p={0}".format(P))
78 | ax.plot(Ks,training_scores,label="training score:p={0}".format(P))
79 | ax.legend(loc='best')
80 | ax.set_xlabel("K")
81 | ax.set_ylabel("score")
82 | ax.set_ylim(0,1.05)
83 | ax.set_title("KNeighborsRegressor")
84 | plt.show()
85 |
86 | if __name__=='__main__':
87 | X_train,X_test,y_train,y_test=create_regression_data(1000)
88 | test_KNeighborsRegressor(X_train,X_test,y_train,y_test)
89 | test_KNeighborsRegressor_k_w(X_train,X_test,y_train,y_test)
90 | test_KNeighborsRegressor_k_p(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/4. KNN/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Overview
3 |
4 | KNN classifies a sample by computing the distances between feature vectors and deciding among its K nearest neighbours.
5 |
6 | # Three elements
7 |
8 | The value of K, the distance metric, and the decision rule. K is usually chosen by cross-validating the average error rate, the distance metric is usually the Euclidean distance, and the decision rule is usually majority voting. A small cross-validation sketch over K is given at the end of this note.
9 |
10 | # Example code: GitHub
11 |
12 | 1. Classification:
13 |
14 | https://github.com/JasonK93/ML-note/blob/master/4.%20KNN/4.1%20KNN%20classification.py
15 |
16 | 2. Regression:
17 |
18 | https://github.com/JasonK93/ML-note/blob/master/4.%20KNN/4.2%20KNN%20regressor.py
19 |
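20 | A minimal sketch of choosing K by cross-validation (not one of the repository scripts; the candidate values of K are arbitrary):
21 |
22 | ```python
23 | from sklearn.datasets import load_digits
24 | from sklearn.model_selection import cross_val_score
25 | from sklearn.neighbors import KNeighborsClassifier
26 |
27 | digits = load_digits()
28 | X, y = digits.data, digits.target
29 |
30 | # average 5-fold accuracy for a few candidate values of K
31 | for k in [1, 3, 5, 10, 20]:
32 |     scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5)
33 |     print("K={0}: mean accuracy={1:.3f}".format(k, scores.mean()))
34 | ```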
--------------------------------------------------------------------------------
/5.Dimension_Reduction/5.1 PCA.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import datasets,decomposition
4 |
5 | def load_data():
6 | '''
7 | load the data
8 | :return: train_data, train_value
9 | '''
10 | iris=datasets.load_iris()
11 | return iris.data,iris.target
12 |
13 | def test_PCA(*data):
14 | '''
15 | test the PCA method
16 | :param data: train_data, train_value
17 | :return: None
18 | '''
19 | X,y=data
20 | pca=decomposition.PCA(n_components=None)
21 | pca.fit(X)
22 | print('explained variance ratio : %s'% str(pca.explained_variance_ratio_))
23 | def plot_PCA(*data):
24 | '''
25 | graph the data after PCA
26 | :param data: train_data, train_value
27 | :return: None
28 | '''
29 | X,y=data
30 | pca=decomposition.PCA(n_components=2)
31 | pca.fit(X)
32 | X_r=pca.transform(X)
33 | ###### graph 2-D data ########
34 | fig=plt.figure()
35 | ax=fig.add_subplot(1,1,1)
36 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
37 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
38 | for label ,color in zip( np.unique(y),colors):
39 | position=y==label
40 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}".format(label),color=color)
41 |
42 | ax.set_xlabel("X[0]")
43 |     ax.set_ylabel("X[1]")
44 | ax.legend(loc="best")
45 | ax.set_title("PCA")
46 | plt.show()
47 | if __name__=='__main__':
48 | X,y=load_data()
49 | test_PCA(X,y)
50 | plot_PCA(X,y)
--------------------------------------------------------------------------------
/5.Dimension_Reduction/5.2 KPCA.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import datasets,decomposition
4 |
5 | def load_data():
6 | '''
7 | load the iris data
8 | :return: train_data, train_value
9 | '''
10 |     iris=datasets.load_iris()  # use the iris dataset bundled with scikit-learn
11 | return iris.data,iris.target
12 |
13 | def test_KPCA(*data):
14 | '''
15 | test the KPCA method
16 | :param data: train_data, train_value
17 | :return: None
18 | '''
19 | X,y=data
20 | kernels=['linear','poly','rbf','sigmoid']
21 | for kernel in kernels:
22 | kpca=decomposition.KernelPCA(n_components=None,kernel=kernel) # Use 4 different kernel
23 | kpca.fit(X)
24 | print('kernel={0} --> lambdas: {1}'.format (kernel,kpca.lambdas_))
25 | def plot_KPCA(*data):
26 | '''
27 | graph after KPCA
28 | :param data: train_data, train_value
29 | :return: None
30 | '''
31 | X,y=data
32 | kernels=['linear','poly','rbf','sigmoid']
33 | fig=plt.figure()
34 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
35 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
36 |
37 | for i,kernel in enumerate(kernels):
38 | kpca=decomposition.KernelPCA(n_components=2,kernel=kernel)
39 | kpca.fit(X)
40 | X_r=kpca.transform(X)
41 | ax=fig.add_subplot(2,2,i+1)
42 | for label ,color in zip( np.unique(y),colors):
43 | position=y==label
44 | ax.scatter(X_r[position,0],X_r[position,1],label="target= %d"%label,
45 | color=color)
46 | ax.set_xlabel("X[0]")
47 | ax.set_ylabel("X[1]")
48 | ax.legend(loc="best")
49 | ax.set_title("kernel={0}".format(kernel))
50 | plt.suptitle("KPCA")
51 | plt.show()
52 | def plot_KPCA_poly(*data):
53 | '''
54 | graph after KPCA with poly kernel
55 | :param data: train_data, train_value
56 | :return: None
57 | '''
58 | X,y=data
59 | fig=plt.figure()
60 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
61 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
62 | Params=[(3,1,1),(3,10,1),(3,1,10),(3,10,10),(10,1,1),(10,10,1),(10,1,10),(10,10,10)] # parameter of poly
63 | # p , gamma , r )
64 | # p :3,10
65 | # gamma :1,10
66 | # r :1,10
67 | # 8 combination
68 | for i,(p,gamma,r) in enumerate(Params):
69 | kpca=decomposition.KernelPCA(n_components=2,kernel='poly'
70 | ,gamma=gamma,degree=p,coef0=r)
71 | kpca.fit(X)
72 | X_r=kpca.transform(X)
73 | ax=fig.add_subplot(2,4,i+1)
74 | for label ,color in zip( np.unique(y),colors):
75 | position=y==label
76 | ax.scatter(X_r[position,0],X_r[position,1],label="target= %d"%label,
77 | color=color)
78 | ax.set_xlabel("X[0]")
79 | ax.set_xticks([])
80 | ax.set_yticks([])
81 | ax.set_ylabel("X[1]")
82 | ax.legend(loc="best")
83 |         ax.set_title(r"$({0}(x \cdot z)+{1})^{{{2}}}$".format(gamma,r,p))
84 | plt.suptitle("KPCA-Poly")
85 | plt.show()
86 | def plot_KPCA_rbf(*data):
87 | '''
88 | graph with kernel of rbf
89 | :param data: train_data, train_value
90 | :return: None
91 | '''
92 | X,y=data
93 | fig=plt.figure()
94 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
95 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
96 | Gammas=[0.5,1,4,10]
97 | for i,gamma in enumerate(Gammas):
98 | kpca=decomposition.KernelPCA(n_components=2,kernel='rbf',gamma=gamma)
99 | kpca.fit(X)
100 | X_r=kpca.transform(X)
101 | ax=fig.add_subplot(2,2,i+1)
102 | for label ,color in zip( np.unique(y),colors):
103 | position=y==label
104 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}".format(label),
105 | color=color)
106 | ax.set_xlabel("X[0]")
107 | ax.set_xticks([])
108 | ax.set_yticks([])
109 | ax.set_ylabel("X[1]")
110 | ax.legend(loc="best")
111 | ax.set_title(r"$\exp(-{0}||x-z||^2)$".format(gamma))
112 | plt.suptitle("KPCA-rbf")
113 | plt.show()
114 | def plot_KPCA_sigmoid(*data):
115 | '''
116 | graph with sigmoid kernel
117 | :param data: train_data, train_value
118 | :return: None
119 | '''
120 | X,y=data
121 | fig=plt.figure()
122 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
123 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
124 | Params=[(0.01,0.1),(0.01,0.2),(0.1,0.1),(0.1,0.2),(0.2,0.1),(0.2,0.2)]# parameter of sigmoid kernel
125 | # gamma,coef0
126 | # gamma : 0.01,0.1,0.2
127 | # coef0 : 0.1,0.2
128 | # 6 combination
129 | for i,(gamma,r) in enumerate(Params):
130 | kpca=decomposition.KernelPCA(n_components=2,kernel='sigmoid',gamma=gamma,coef0=r)
131 | kpca.fit(X)
132 | X_r=kpca.transform(X)
133 | ax=fig.add_subplot(3,2,i+1)
134 | for label ,color in zip( np.unique(y),colors):
135 | position=y==label
136 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}".format(label),
137 | color=color)
138 | ax.set_xlabel("X[0]")
139 | ax.set_xticks([])
140 | ax.set_yticks([])
141 | ax.set_ylabel("X[1]")
142 | ax.legend(loc="best")
143 | ax.set_title(r"$\tanh({0}(x\cdot z)+{1})$".format(gamma,r))
144 | plt.suptitle("KPCA-sigmoid")
145 | plt.show()
146 | if __name__=='__main__':
147 | X,y=load_data()
148 | test_KPCA(X,y)
149 | plot_KPCA(X,y)
150 | plot_KPCA_poly(X,y)
151 | plot_KPCA_rbf(X,y)
152 | plot_KPCA_sigmoid(X,y)
--------------------------------------------------------------------------------
/5.Dimension_Reduction/5.3 MDS.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import datasets,manifold
4 |
5 | def load_data():
6 | '''
7 | load the iris data
8 | :return: train_data, train_value
9 | '''
10 | iris=datasets.load_iris()
11 | return iris.data,iris.target
12 |
13 | def test_MDS(*data):
14 | '''
15 | test MDS method
16 | :param data: train_data, train_value
17 | :return: None
18 | '''
19 | X,y=data
20 | for n in [4,3,2,1]:
21 | mds=manifold.MDS(n_components=n)
22 | mds.fit(X)
23 | print('stress(n_components={0}) : {1}'.format(n, mds.stress_))
24 | def plot_MDS(*data):
25 | '''
26 | graph after MDS
27 | :param data: train_data, train_value
28 | :return: None
29 | '''
30 | X,y=data
31 | mds=manifold.MDS(n_components=2)
32 | X_r=mds.fit_transform(X)
33 |
34 | ### graph
35 | fig=plt.figure()
36 | ax=fig.add_subplot(1,1,1)
37 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
38 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
39 | for label ,color in zip( np.unique(y),colors):
40 | position=y==label
41 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}".format(label),color=color)
42 |
43 | ax.set_xlabel("X[0]")
44 | ax.set_ylabel("X[1]")
45 | ax.legend(loc="best")
46 | ax.set_title("MDS")
47 | plt.show()
48 | if __name__=='__main__':
49 | X,y=load_data()
50 | test_MDS(X,y)
51 | plot_MDS(X,y)
--------------------------------------------------------------------------------
/5.Dimension_Reduction/5.4 Isomap.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import datasets,manifold
4 |
5 | def load_data():
6 | '''
7 | load the iris data
8 | :return: train_data, train_value
9 | '''
10 | iris=datasets.load_iris()
11 | return iris.data,iris.target
12 |
13 | def test_Isomap(*data):
14 | '''
15 | test Isomap method
16 | :param data: train_data, train_value
17 | :return: None
18 | '''
19 | X,y=data
20 | for n in [4,3,2,1]:
21 | isomap=manifold.Isomap(n_components=n)
22 | isomap.fit(X)
23 | print('reconstruction_error(n_components=%d) : %s'%
24 | (n, isomap.reconstruction_error()))
25 | def plot_Isomap_k(*data):
26 | '''
27 | test the performance with different n_neighbors and reduce to 2-D
28 | :param data: train_data, train_value
29 | :return: None
30 | '''
31 | X,y=data
32 | Ks=[1,5,25,y.size-1]
33 |
34 | fig=plt.figure()
35 | for i, k in enumerate(Ks):
36 | isomap=manifold.Isomap(n_components=2,n_neighbors=k)
37 | X_r=isomap.fit_transform(X)
38 |
39 | ax=fig.add_subplot(2,2,i+1)
40 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
41 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
42 | for label ,color in zip( np.unique(y),colors):
43 | position=y==label
44 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}"
45 | .format(label),color=color)
46 |
47 | ax.set_xlabel("X[0]")
48 | ax.set_ylabel("X[1]")
49 | ax.legend(loc="best")
50 | ax.set_title("k={0}".format(k))
51 | plt.suptitle("Isomap")
52 | plt.show()
53 | def plot_Isomap_k_d1(*data):
54 | '''
55 | test the performance with different n_neighbors and reduce to 1-D
56 | :param data: train_data, train_value
57 | :return: None
58 | '''
59 | X,y=data
60 | Ks=[1,5,25,y.size-1]
61 |
62 | fig=plt.figure()
63 | for i, k in enumerate(Ks):
64 | isomap=manifold.Isomap(n_components=1,n_neighbors=k)
65 | X_r=isomap.fit_transform(X)
66 |
67 | ax=fig.add_subplot(2,2,i+1)
68 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
69 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
70 | for label ,color in zip( np.unique(y),colors):
71 | position=y==label
72 | ax.scatter(X_r[position],np.zeros_like(X_r[position]),
73 | label="target= {0}".format(label),color=color)
74 |
75 | ax.set_xlabel("X")
76 | ax.set_ylabel("Y")
77 | ax.legend(loc="best")
78 | ax.set_title("k={0}".format(k))
79 | plt.suptitle("Isomap")
80 | plt.show()
81 | if __name__=='__main__':
82 | X,y=load_data()
83 | test_Isomap(X,y)
84 | plot_Isomap_k(X,y)
85 | plot_Isomap_k_d1(X,y)
--------------------------------------------------------------------------------
/5.Dimension_Reduction/5.5 LLE.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import datasets,manifold
4 |
5 | def load_data():
6 | '''
7 | load the iris data
8 | :return: train_data, train_value
9 | '''
10 | iris=datasets.load_iris()
11 | return iris.data,iris.target
12 | def test_LocallyLinearEmbedding(*data):
13 | '''
14 | test the LLE method
15 | :param data: train_data, train_value
16 | :return: None
17 | '''
18 | X,y=data
19 | for n in [4,3,2,1]:
20 | lle=manifold.LocallyLinearEmbedding(n_components=n)
21 | lle.fit(X)
22 | print('reconstruction_error(n_components=%d) : %s'%
23 | (n, lle.reconstruction_error_))
24 | def plot_LocallyLinearEmbedding_k(*data):
25 | '''
26 | test the performance with different n_neighbors and reduce to 2-D
27 | :param data: train_data, train_value
28 | :return: None
29 | '''
30 | X,y=data
31 | Ks=[1,5,25,y.size-1]
32 |
33 | fig=plt.figure()
34 | for i, k in enumerate(Ks):
35 | lle=manifold.LocallyLinearEmbedding(n_components=2,n_neighbors=k)
36 | X_r=lle.fit_transform(X)
37 |
38 | ax=fig.add_subplot(2,2,i+1)
39 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
40 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
41 | for label ,color in zip( np.unique(y),colors):
42 | position=y==label
43 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}"
44 | .format(label),color=color)
45 |
46 | ax.set_xlabel("X[0]")
47 | ax.set_ylabel("X[1]")
48 | ax.legend(loc="best")
49 | ax.set_title("k={0}".format(k))
50 | plt.suptitle("LocallyLinearEmbedding")
51 | plt.show()
52 | def plot_LocallyLinearEmbedding_k_d1(*data):
53 | '''
54 | test the performance with different n_neighbors and reduce to 1-D
55 | :param data: train_data, train_value
56 | :return: None
57 | '''
58 | X,y=data
59 | Ks=[1,5,25,y.size-1]
60 |
61 | fig=plt.figure()
62 | for i, k in enumerate(Ks):
63 | lle=manifold.LocallyLinearEmbedding(n_components=1,n_neighbors=k)
64 | X_r=lle.fit_transform(X)
65 |
66 | ax=fig.add_subplot(2,2,i+1)
67 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
68 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
69 | for label ,color in zip( np.unique(y),colors):
70 | position=y==label
71 | ax.scatter(X_r[position],np.zeros_like(X_r[position]),
72 | label="target= {0}".format(label),color=color)
73 |
74 | ax.set_xlabel("X")
75 | ax.set_ylabel("Y")
76 | ax.legend(loc="best")
77 | ax.set_title("k={0}".format(k))
78 | plt.suptitle("LocallyLinearEmbedding")
79 | plt.show()
80 | if __name__=='__main__':
81 | X,y=load_data()
82 | test_LocallyLinearEmbedding(X,y)
83 | plot_LocallyLinearEmbedding_k(X,y)
84 | plot_LocallyLinearEmbedding_k_d1(X,y)
--------------------------------------------------------------------------------
/5.Dimension_Reduction/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Overview
3 |
4 | These methods process the data features to avoid the curse of dimensionality, reduce the influence of noisy features, and improve accuracy.
5 |
6 | # PCA
7 |
8 | Principal Component Analysis is a compressing transformation of the dimensions. Because the compression is unsupervised, it usually forms linear combinations of the original features, producing new features that cannot be interpreted easily.
9 |
10 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.1%20PCA.py
11 |
12 | # SVD
13 |
14 | Dimensionality reduction via Singular Value Decomposition. The method is equivalent to PCA: at its core, both solve for the eigenvalues of XX^T and the corresponding eigenvectors. (There is no companion script for this one, so a small sketch follows below.)
15 |
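Since this folder has no SVD script, the following is only a minimal sketch, assuming scikit-learn's `TruncatedSVD` on the iris data; the variable names are my own.

```python
from sklearn import datasets
from sklearn.decomposition import TruncatedSVD

iris = datasets.load_iris()
X = iris.data

# keep 2 singular components; this behaves like PCA, but without centering the data first
svd = TruncatedSVD(n_components=2)
X_r = svd.fit_transform(X)
print('explained variance ratio: {0}'.format(svd.explained_variance_ratio_))
print('shape after SVD: {0}'.format(X_r.shape))
```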
16 | # KPCA
17 |
18 | Kernel Principal Component Analysis. Because PCA is a linear reduction, it cannot satisfy the requirements of many real-world tasks, so a reduction based on a non-linear mapping is needed. This gives the kernel-based reduction method, kernel PCA.
19 |
20 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.2%20KPCA.py
21 |
22 | # Manifold learning for dimensionality reduction
23 |
24 | Manifold learning is a non-linear family of reduction methods that borrows the idea of a topological manifold. The methods differ in how the local neighborhoods are constructed and in how those neighborhood structures are used to build the global low-dimensional embedding.
25 |
26 | # MDS
27 |
28 | Multidimensional scaling requires that the distances between samples in the original space be preserved in the low-dimensional space.
29 |
30 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.3%20MDS.py
31 |
32 | # Isomap
33 |
34 | Isometric mapping uses the fact that a manifold is locally homeomorphic to Euclidean space: it builds a nearest-neighbor graph around each point on the low-dimensional manifold, computes shortest paths on that graph, and then uses MDS to obtain the low-dimensional space.
35 |
36 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.4%20Isomap.py
37 |
38 | # LLE
39 |
40 | The main goal of Locally Linear Embedding is to preserve the linear relationships among samples within each neighborhood while reducing the dimension.
41 |
42 |
43 |
44 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.5%20LLE.py
45 |
--------------------------------------------------------------------------------
/6. Clustering/6.1 Kmeans.py:
--------------------------------------------------------------------------------
1 | from sklearn import cluster
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn.datasets import make_blobs  # samples_generator has been removed from recent scikit-learn
5 | from sklearn.metrics import adjusted_rand_score
6 |
7 | def create_data(centers,num=100,std=0.7):
8 | '''
9 | generate data
10 | :param centers: dimension of centre
11 | :param num: number of sample
12 | :param std: std of each cluster
13 | :return: data, target
14 | '''
15 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std)
16 | return X,labels_true
17 | def plot_data(*data):
18 | '''
19 | graph the dataset
20 | :param data: data, target
21 | :return: None
22 | '''
23 | X,labels_true=data
24 | labels=np.unique(labels_true)
25 | fig=plt.figure()
26 | ax=fig.add_subplot(1,1,1)
27 | colors='rgbyckm'
28 | for i,label in enumerate(labels):
29 | position=labels_true==label
30 | ax.scatter(X[position,0],X[position,1],label="cluster {0}".format(label),
31 | color=colors[i%len(colors)])
32 |
33 | ax.legend(loc="best",framealpha=0.5)
34 | ax.set_xlabel("X[0]")
35 | ax.set_ylabel("X[1]")
36 | ax.set_title("data")
37 | plt.show()
38 |
39 |
40 |
41 | def test_Kmeans(*data):
42 | '''
43 | test the Kmeans
44 | :param data: data, target
45 | :return: None
46 | '''
47 | X,labels_true=data
48 | clst=cluster.KMeans()
49 | clst.fit(X)
50 | predicted_labels=clst.predict(X)
51 | print("ARI:{0}".format( adjusted_rand_score(labels_true,predicted_labels)))
52 | print("Sum center distance {0}".format(clst.inertia_))
53 | def test_Kmeans_nclusters(*data):
54 | '''
55 | test the performance with different n_clusters
56 | :param data: data, target
57 | :return: None
58 | '''
59 | X,labels_true=data
60 | nums=range(1,50)
61 | ARIs=[]
62 | Distances=[]
63 | for num in nums:
64 | clst=cluster.KMeans(n_clusters=num)
65 | clst.fit(X)
66 | predicted_labels=clst.predict(X)
67 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
68 | Distances.append(clst.inertia_)
69 |
70 | ## graph
71 | fig=plt.figure()
72 | ax=fig.add_subplot(1,2,1)
73 | ax.plot(nums,ARIs,marker="+")
74 | ax.set_xlabel("n_clusters")
75 | ax.set_ylabel("ARI")
76 | ax=fig.add_subplot(1,2,2)
77 | ax.plot(nums,Distances,marker='o')
78 | ax.set_xlabel("n_clusters")
79 | ax.set_ylabel("inertia_")
80 | fig.suptitle("KMeans")
81 | plt.show()
82 | def test_Kmeans_n_init(*data):
83 | '''
84 | test the performance with different n_init and init paramter
85 | :param data: data, target
86 | :return: None
87 | '''
88 | X,labels_true=data
89 | nums=range(1,50)
90 | ## graph
91 | fig=plt.figure()
92 |
93 | ARIs_k=[]
94 | Distances_k=[]
95 | ARIs_r=[]
96 | Distances_r=[]
97 | for num in nums:
98 | clst=cluster.KMeans(n_init=num,init='k-means++')
99 | clst.fit(X)
100 | predicted_labels=clst.predict(X)
101 | ARIs_k.append(adjusted_rand_score(labels_true,predicted_labels))
102 | Distances_k.append(clst.inertia_)
103 |
104 | clst=cluster.KMeans(n_init=num,init='random')
105 | clst.fit(X)
106 | predicted_labels=clst.predict(X)
107 | ARIs_r.append(adjusted_rand_score(labels_true,predicted_labels))
108 | Distances_r.append(clst.inertia_)
109 |
110 | ax=fig.add_subplot(1,2,1)
111 | ax.plot(nums,ARIs_k,marker="+",label="k-means++")
112 | ax.plot(nums,ARIs_r,marker="+",label="random")
113 | ax.set_xlabel("n_init")
114 | ax.set_ylabel("ARI")
115 | ax.set_ylim(0,1)
116 | ax.legend(loc='best')
117 | ax=fig.add_subplot(1,2,2)
118 | ax.plot(nums,Distances_k,marker='o',label="k-means++")
119 | ax.plot(nums,Distances_r,marker='o',label="random")
120 | ax.set_xlabel("n_init")
121 | ax.set_ylabel("inertia_")
122 | ax.legend(loc='best')
123 |
124 | fig.suptitle("KMeans")
125 | plt.show()
126 |
127 | if __name__=='__main__':
128 | centers=[[1,1],[2,2],[1,2],[10,20]]
129 | X,labels_true=create_data(centers,1000,0.5)
130 | plot_data(X,labels_true)
131 | test_Kmeans(X,labels_true)
132 | test_Kmeans_nclusters(X,labels_true)
133 | test_Kmeans_n_init(X,labels_true)
134 |
--------------------------------------------------------------------------------
/6. Clustering/6.2 DBSCAN.py:
--------------------------------------------------------------------------------
1 | from sklearn import cluster
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn.datasets import make_blobs  # samples_generator has been removed from recent scikit-learn
5 | from sklearn.metrics import adjusted_rand_score
6 |
7 |
8 | def create_data(centers,num=100,std=0.7):
9 | '''
10 | generate data
11 | :param centers: dimension of centre
12 | :param num: number of sample
13 | :param std: std of each cluster
14 | :return: data, target
15 | '''
16 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std)
17 | return X,labels_true
18 | def plot_data(*data):
19 | '''
20 | graph the dataset
21 | :param data: data, target
22 | :return: None
23 | '''
24 | X,labels_true=data
25 | labels=np.unique(labels_true)
26 | fig=plt.figure()
27 | ax=fig.add_subplot(1,1,1)
28 | colors='rgbyckm'
29 | for i,label in enumerate(labels):
30 | position=labels_true==label
31 | ax.scatter(X[position,0],X[position,1],label="cluster {0}".format(label),
32 | color=colors[i%len(colors)])
33 |
34 | ax.legend(loc="best",framealpha=0.5)
35 | ax.set_xlabel("X[0]")
36 | ax.set_ylabel("X[1]")
37 | ax.set_title("data")
38 | plt.show()
39 |
40 | def test_DBSCAN(*data):
41 | '''
42 | test the DBSCAN method
43 | :param data: train, target
44 | :return: None
45 | '''
46 | X,labels_true=data
47 | clst=cluster.DBSCAN()
48 | predicted_labels=clst.fit_predict(X)
49 | print("ARI:%s"% adjusted_rand_score(labels_true,predicted_labels))
50 | print("Core sample num:{0}".format(len(clst.core_sample_indices_)))
51 | def test_DBSCAN_epsilon(*data):
52 | '''
53 | test the score with different eps
54 | :param data: train, target
55 | :return: None
56 | '''
57 | X,labels_true=data
58 | epsilons=np.logspace(-1,1.5)
59 | ARIs=[]
60 | Core_nums=[]
61 | for epsilon in epsilons:
62 | clst=cluster.DBSCAN(eps=epsilon)
63 | predicted_labels=clst.fit_predict(X)
64 | ARIs.append( adjusted_rand_score(labels_true,predicted_labels))
65 | Core_nums.append(len(clst.core_sample_indices_))
66 |
67 | ## graph
68 | fig=plt.figure()
69 | ax=fig.add_subplot(1,2,1)
70 | ax.plot(epsilons,ARIs,marker='+')
71 | ax.set_xscale('log')
72 | ax.set_xlabel(r"$\epsilon$")
73 | ax.set_ylim(0,1)
74 | ax.set_ylabel('ARI')
75 |
76 | ax=fig.add_subplot(1,2,2)
77 | ax.plot(epsilons,Core_nums,marker='o')
78 | ax.set_xscale('log')
79 | ax.set_xlabel(r"$\epsilon$")
80 | ax.set_ylabel('Core_Nums')
81 |
82 | fig.suptitle("DBSCAN")
83 | plt.show()
84 | def test_DBSCAN_min_samples(*data):
85 | '''
86 | test the score with different min_sample
87 | :param data: train, target
88 | :return: None
89 | '''
90 | X,labels_true=data
91 | min_samples=range(1,100)
92 | ARIs=[]
93 | Core_nums=[]
94 | for num in min_samples:
95 | clst=cluster.DBSCAN(min_samples=num)
96 | predicted_labels=clst.fit_predict(X)
97 | ARIs.append( adjusted_rand_score(labels_true,predicted_labels))
98 | Core_nums.append(len(clst.core_sample_indices_))
99 |
100 | ## graph
101 | fig=plt.figure()
102 | ax=fig.add_subplot(1,2,1)
103 | ax.plot(min_samples,ARIs,marker='+')
104 | ax.set_xlabel( "min_samples")
105 | ax.set_ylim(0,1)
106 | ax.set_ylabel('ARI')
107 |
108 | ax=fig.add_subplot(1,2,2)
109 | ax.plot(min_samples,Core_nums,marker='o')
110 | ax.set_xlabel( "min_samples")
111 | ax.set_ylabel('Core_Nums')
112 |
113 | fig.suptitle("DBSCAN")
114 | plt.show()
115 |
116 | if __name__=='__main__':
117 | centers=[[1,1],[2,2],[1,2],[10,20]]
118 | X,labels_true=create_data(centers,1000,0.5)
119 | plot_data(X,labels_true)
120 | test_DBSCAN(X,labels_true)
121 | test_DBSCAN_epsilon(X,labels_true)
122 | test_DBSCAN_min_samples(X,labels_true)
123 |
--------------------------------------------------------------------------------
/6. Clustering/6.3 Agglomerative Clustering.py:
--------------------------------------------------------------------------------
1 | from sklearn import cluster
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn.datasets import make_blobs  # samples_generator has been removed from recent scikit-learn
5 | from sklearn.metrics import adjusted_rand_score
6 |
7 |
8 | def create_data(centers,num=100,std=0.7):
9 | '''
10 | generate data
11 | :param centers: dimension of centre
12 | :param num: number of sample
13 | :param std: std of each cluster
14 | :return: data, target
15 | '''
16 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std)
17 | return X,labels_true
18 | def plot_data(*data):
19 | '''
20 | graph the dataset
21 | :param data: data, target
22 | :return: None
23 | '''
24 | X,labels_true=data
25 | labels=np.unique(labels_true)
26 | fig=plt.figure()
27 | ax=fig.add_subplot(1,1,1)
28 | colors='rgbyckm'
29 | for i,label in enumerate(labels):
30 | position=labels_true==label
31 | ax.scatter(X[position,0],X[position,1],label="cluster {0}".format(label),
32 | color=colors[i%len(colors)])
33 |
34 | ax.legend(loc="best",framealpha=0.5)
35 | ax.set_xlabel("X[0]")
36 | ax.set_ylabel("X[1]")
37 | ax.set_title("data")
38 | plt.show()
39 |
40 | def test_AgglomerativeClustering(*data):
41 | '''
42 | test AGG method
43 | :param data: data, target
44 | :return: None
45 | '''
46 | X,labels_true=data
47 | clst=cluster.AgglomerativeClustering()
48 | predicted_labels=clst.fit_predict(X)
49 | print("ARI:{0}".format(adjusted_rand_score(labels_true,predicted_labels)))
50 | def test_AgglomerativeClustering_nclusters(*data):
51 | '''
52 | test the performance with different n_clusters
53 | :param data: data, target
54 | :return: None
55 | '''
56 | X,labels_true=data
57 | nums=range(1,50)
58 | ARIs=[]
59 | for num in nums:
60 | clst=cluster.AgglomerativeClustering(n_clusters=num)
61 | predicted_labels=clst.fit_predict(X)
62 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
63 |
64 | ## graph
65 | fig=plt.figure()
66 | ax=fig.add_subplot(1,1,1)
67 | ax.plot(nums,ARIs,marker="+")
68 | ax.set_xlabel("n_clusters")
69 | ax.set_ylabel("ARI")
70 | fig.suptitle("AgglomerativeClustering")
71 | plt.show()
72 | def test_AgglomerativeClustering_linkage(*data):
73 | '''
74 | test the performance with different linkages
75 | :param data: data, target
76 | :return: None
77 | '''
78 | X,labels_true=data
79 | nums=range(1,50)
80 | fig=plt.figure()
81 | ax=fig.add_subplot(1,1,1)
82 |
83 | linkages=['ward','complete','average']
84 | markers="+o*"
85 | for i, linkage in enumerate(linkages):
86 | ARIs=[]
87 | for num in nums:
88 | clst=cluster.AgglomerativeClustering(n_clusters=num,linkage=linkage)
89 | predicted_labels=clst.fit_predict(X)
90 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
91 | ax.plot(nums,ARIs,marker=markers[i],label="linkage:{0}".format(linkage))
92 |
93 | ax.set_xlabel("n_clusters")
94 | ax.set_ylabel("ARI")
95 | ax.legend(loc="best")
96 | fig.suptitle("AgglomerativeClustering")
97 | plt.show()
98 |
99 | if __name__=='__main__':
100 | centers=[[1,1],[2,2],[1,2],[10,20]]
101 | X,labels_true=create_data(centers,1000,0.5)
102 | plot_data(X,labels_true)
103 | test_AgglomerativeClustering(X,labels_true)
104 | test_AgglomerativeClustering_nclusters(X,labels_true)
105 | test_AgglomerativeClustering_linkage(X,labels_true)
106 |
107 |
108 |
--------------------------------------------------------------------------------
/6. Clustering/6.4 GaussianMixture.py:
--------------------------------------------------------------------------------
1 | from sklearn import cluster
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn.datasets import make_blobs  # samples_generator has been removed from recent scikit-learn
5 | from sklearn.metrics import adjusted_rand_score
6 | from sklearn import mixture
7 |
8 |
9 | def create_data(centers,num=100,std=0.7):
10 | '''
11 | generate data
12 | :param centers: dimension of centre
13 | :param num: number of sample
14 | :param std: std of each cluster
15 | :return: data, target
16 | '''
17 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std)
18 | return X,labels_true
19 | def plot_data(*data):
20 | '''
21 | graph the dataset
22 | :param data: data, target
23 | :return: None
24 | '''
25 | X,labels_true=data
26 | labels=np.unique(labels_true)
27 | fig=plt.figure()
28 | ax=fig.add_subplot(1,1,1)
29 | colors='rgbyckm'
30 | for i,label in enumerate(labels):
31 | position=labels_true==label
32 | ax.scatter(X[position,0],X[position,1],label="cluster {0}".format(label),
33 | color=colors[i%len(colors)])
34 |
35 | ax.legend(loc="best",framealpha=0.5)
36 | ax.set_xlabel("X[0]")
37 | ax.set_ylabel("X[1]")
38 | ax.set_title("data")
39 | plt.show()
40 |
41 | def test_GMM(*data):
42 | '''
43 | test the method of GMM
44 | :param data: data , target
45 | :return: None
46 | '''
47 | X,labels_true=data
48 | clst=mixture.GaussianMixture()
49 | clst.fit(X)
50 | predicted_labels=clst.predict(X)
51 | print("ARI:{0}".format(adjusted_rand_score(labels_true,predicted_labels)))
52 | def test_GMM_n_components(*data):
53 | '''
54 | test the performance with different N_components
55 | :param data: data, target
56 | :return: None
57 | '''
58 | X,labels_true=data
59 | nums=range(1,50)
60 | ARIs=[]
61 | for num in nums:
62 | clst=mixture.GaussianMixture(n_components=num)
63 | clst.fit(X)
64 | predicted_labels=clst.predict(X)
65 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
66 |
67 | ## graph
68 | fig=plt.figure()
69 | ax=fig.add_subplot(1,1,1)
70 | ax.plot(nums,ARIs,marker="+")
71 | ax.set_xlabel("n_components")
72 | ax.set_ylabel("ARI")
73 | fig.suptitle("GMM")
74 | plt.show()
75 | def test_GMM_cov_type(*data):
76 | '''
77 | test the performance with different cov_type
78 | :param data: data, target
79 | :return: None
80 | '''
81 | X,labels_true=data
82 | nums=range(1,50)
83 |
84 | cov_types=['spherical','tied','diag','full']
85 | markers="+o*s"
86 | fig=plt.figure()
87 | ax=fig.add_subplot(1,1,1)
88 |
89 | for i ,cov_type in enumerate(cov_types):
90 | ARIs=[]
91 | for num in nums:
92 | clst=mixture.GaussianMixture(n_components=num,covariance_type=cov_type)
93 | clst.fit(X)
94 | predicted_labels=clst.predict(X)
95 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
96 | ax.plot(nums,ARIs,marker=markers[i],label="covariance_type:{0}".format(cov_type))
97 |
98 | ax.set_xlabel("n_components")
99 | ax.legend(loc="best")
100 | ax.set_ylabel("ARI")
101 | fig.suptitle("GMM")
102 | plt.show()
103 |
104 | if __name__=='__main__':
105 | centers=[[1,1],[2,2],[1,2],[10,20]]
106 | X,labels_true=create_data(centers,1000,0.5)
107 | plot_data(X,labels_true)
108 | test_GMM(X,labels_true)
109 | test_GMM_n_components(X,labels_true)
110 | test_GMM_cov_type(X,labels_true)
111 |
--------------------------------------------------------------------------------
/6. Clustering/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Overview
3 |
4 | Clustering is unsupervised learning: it groups data sets that carry no target labels. It is an exploratory technique used to study the intrinsic characteristics of the data and to discover its distribution patterns.
5 |
6 | # Clustering validity indices
7 |
8 | There are two main kinds of clustering validity indices: external indices and internal indices.
9 |
10 | External indices are obtained by comparing the clustering result with a reference model: 1. Jaccard coefficient; 2. FM index; 3. Rand index; 4. ARI (see the sketch below).
11 |
12 | Internal indices are obtained directly from the clustering result itself: 1. DB index; 2. Dunn index.
13 |
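A small illustrative sketch (not part of the repository scripts) computing two of the external indices above with scikit-learn; the toy label lists are assumptions.

```python
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score

labels_true = [0, 0, 1, 1, 2, 2]   # reference partition
labels_pred = [0, 0, 1, 2, 2, 2]   # clustering result

# ARI is chance-corrected; 1.0 means a perfect match with the reference partition
print('ARI: {0}'.format(adjusted_rand_score(labels_true, labels_pred)))
# FM index: geometric mean of pairwise precision and recall
print('FMI: {0}'.format(fowlkes_mallows_score(labels_true, labels_pred)))
```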
14 | # Distance metrics
15 |
16 | Euclidean distance, Manhattan distance, Hamming distance, VDM distance, and so on (a small sketch of the first three follows).
17 |
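A minimal sketch of the first three distances using SciPy; the two toy vectors are assumptions made only for illustration.

```python
import numpy as np
from scipy.spatial.distance import euclidean, cityblock, hamming

a = np.array([1.0, 0.0, 2.0])
b = np.array([0.0, 1.0, 2.0])

print('Euclidean: {0}'.format(euclidean(a, b)))   # sqrt(2)
print('Manhattan: {0}'.format(cityblock(a, b)))   # 2.0
print('Hamming  : {0}'.format(hamming(a, b)))     # fraction of differing positions: 2/3
```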
18 | # Prototype-based clustering
19 |
20 | Commonly used prototype-based methods include K-means and Gaussian mixture clustering. The K-means objective is the minimum sum of squared errors (written out below); Gaussian mixture clustering assumes that each cluster follows a Gaussian distribution.
21 |
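For reference, a standard way to write that K-means objective, where $\mu_k$ is the mean of cluster $C_k$ (the notation is added here only for clarity):

$$
J=\sum_{k=1}^{K}\sum_{x_i\in C_k}\lVert x_i-\mu_k\rVert^2,\qquad
\mu_k=\frac{1}{|C_k|}\sum_{x_i\in C_k}x_i .
$$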
22 | # Density-based clustering
23 |
24 | Density-based clustering assumes that the cluster structure can be determined from how tightly the samples are distributed. A commonly used algorithm is DBSCAN.
25 |
26 | # Hierarchical clustering
27 |
28 | Hierarchical clustering partitions the data set at different levels, producing a tree-like cluster structure.
29 |
30 | # EM algorithm
31 |
32 | The Expectation-Maximization algorithm is an iterative method mainly used for parameter estimation in probabilistic models with latent variables. It alternates two steps: the E-step computes an expectation and the M-step maximizes it. It is used in methods such as Gaussian mixture clustering (see the short sketch below).
33 |
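A short sketch (assumed, not repository code) that runs EM through scikit-learn's `GaussianMixture` and reports how many EM iterations were needed.

```python
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=500, centers=3, random_state=0)

# EM alternates E-steps (posterior responsibilities) with M-steps (parameter updates)
gmm = GaussianMixture(n_components=3, max_iter=100, random_state=0)
gmm.fit(X)
print('converged: {0} after {1} EM iterations'.format(gmm.converged_, gmm.n_iter_))
print('final lower bound on the log-likelihood: {0}'.format(gmm.lower_bound_))
```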
34 | # Requirements for clustering in real-world tasks:
35 |
36 | 1. Scalability: changes in data volume should not affect the accuracy of the clustering. 2. Ability to handle different data types. 3. Ability to handle mixed clusters of different shapes. 4. Robustness to the choice of initialization parameters. 5. Noise tolerance. 6. Support for incremental clustering. 7. Insensitivity to the order of the input. 8. Ability to handle high-dimensional data. 9. Readability, visualizability, interpretability, and applicability of the results.
37 |
38 | # Practice code: GitHub
39 |
40 | 1.Kmeans:
41 |
42 | https://github.com/JasonK93/ML-note/blob/master/6.%20Clustering/6.1%20Kmeans.py
43 |
44 | 2.DBSCAN:
45 |
46 | https://github.com/JasonK93/ML-note/blob/master/6.%20Clustering/6.2%20DBSCAN.py
47 |
48 | 3. Agglomerative Clustering:
49 |
50 | https://github.com/JasonK93/ML-note/blob/master/6.%20Clustering/6.3%20Agglomerative%20Clustering.py
51 |
52 | 4.GaussianMixture:
53 |
54 | https://github.com/JasonK93/ML-note/blob/master/6.%20Clustering/6.4%20GaussianMixture.py
55 |
--------------------------------------------------------------------------------
/7. Support Vector Machine/7.1 SVM-liner_SVC.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, model_selection, svm  # cross_validation was replaced by model_selection
4 |
5 | def load_data_classfication():
6 | '''
7 | load iris data set
8 | :return: train_data,test_data, train_target, test_target
9 | '''
10 | iris=datasets.load_iris()
11 | X_train=iris.data
12 | y_train=iris.target
13 | return model_selection.train_test_split(X_train, y_train,test_size=0.25,
14 | random_state=0,stratify=y_train)
15 |
16 | def test_SVC_linear(*data):
17 | '''
18 | test method of SVC
19 | :param data: train_data,test_data, train_target, test_target
20 | :return: None
21 | '''
22 | X_train,X_test,y_train,y_test=data
23 | cls=svm.SVC(kernel='linear')
24 | cls.fit(X_train,y_train)
25 | print('Coefficients:{0}, intercept {1}'.format(cls.coef_,cls.intercept_))
26 | print('Score: {0}' .format(cls.score(X_test, y_test)))
27 | def test_SVC_poly(*data):
28 | '''
29 | test the performance with different degree, gamma, codf0
30 | :param data: train_data,test_data, train_target, test_target
31 | :return: None
32 | '''
33 | X_train,X_test,y_train,y_test=data
34 | fig=plt.figure()
35 | ### test degree ####
36 | degrees=range(1,20)
37 | train_scores=[]
38 | test_scores=[]
39 | for degree in degrees:
40 | cls=svm.SVC(kernel='poly',degree=degree)
41 | cls.fit(X_train,y_train)
42 | train_scores.append(cls.score(X_train,y_train))
43 | test_scores.append(cls.score(X_test, y_test))
44 | ax=fig.add_subplot(1,3,1)
45 | ax.plot(degrees,train_scores,label="Training score ",marker='+' )
46 | ax.plot(degrees,test_scores,label= " Testing score ",marker='o' )
47 | ax.set_title( "SVC_poly_degree ")
48 | ax.set_xlabel("p")
49 | ax.set_ylabel("score")
50 | ax.set_ylim(0,1.05)
51 | ax.legend(loc="best",framealpha=0.5)
52 |
53 | ### test gamma , degree fixed with 3####
54 | gammas=range(1,20)
55 | train_scores=[]
56 | test_scores=[]
57 | for gamma in gammas:
58 | cls=svm.SVC(kernel='poly',gamma=gamma,degree=3)
59 | cls.fit(X_train,y_train)
60 | train_scores.append(cls.score(X_train,y_train))
61 | test_scores.append(cls.score(X_test, y_test))
62 | ax=fig.add_subplot(1,3,2)
63 | ax.plot(gammas,train_scores,label="Training score ",marker='+' )
64 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
65 | ax.set_title( "SVC_poly_gamma ")
66 | ax.set_xlabel(r"$\gamma$")
67 | ax.set_ylabel("score")
68 | ax.set_ylim(0,1.05)
69 | ax.legend(loc="best",framealpha=0.5)
70 | ### test r , gamma fixed with 10 , degree fixed with 3######
71 | rs=range(0,20)
72 | train_scores=[]
73 | test_scores=[]
74 | for r in rs:
75 | cls=svm.SVC(kernel='poly',gamma=10,degree=3,coef0=r)
76 | cls.fit(X_train,y_train)
77 | train_scores.append(cls.score(X_train,y_train))
78 | test_scores.append(cls.score(X_test, y_test))
79 | ax=fig.add_subplot(1,3,3)
80 | ax.plot(rs,train_scores,label="Training score ",marker='+' )
81 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' )
82 | ax.set_title( "SVC_poly_r ")
83 | ax.set_xlabel(r"r")
84 | ax.set_ylabel("score")
85 | ax.set_ylim(0,1.05)
86 | ax.legend(loc="best",framealpha=0.5)
87 | plt.show()
88 | def test_SVC_rbf(*data):
89 | '''
90 | test SVC with Gaussian kernel and different gamma
91 | :param data: train_data,test_data, train_target, test_target
92 | :return: None
93 | '''
94 | X_train,X_test,y_train,y_test=data
95 | gammas=range(1,20)
96 | train_scores=[]
97 | test_scores=[]
98 | for gamma in gammas:
99 | cls=svm.SVC(kernel='rbf',gamma=gamma)
100 | cls.fit(X_train,y_train)
101 | train_scores.append(cls.score(X_train,y_train))
102 | test_scores.append(cls.score(X_test, y_test))
103 | fig=plt.figure()
104 | ax=fig.add_subplot(1,1,1)
105 | ax.plot(gammas,train_scores,label="Training score ",marker='+' )
106 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
107 | ax.set_title( "SVC_rbf")
108 | ax.set_xlabel(r"$\gamma$")
109 | ax.set_ylabel("score")
110 | ax.set_ylim(0,1.05)
111 | ax.legend(loc="best",framealpha=0.5)
112 | plt.show()
113 | def test_SVC_sigmoid(*data):
114 | '''
115 | test SVC with sigmoid kernel with different gamma and coef0
116 | :param data: train_data,test_data, train_target, test_target
117 | :return: None
118 | '''
119 | X_train,X_test,y_train,y_test=data
120 | fig=plt.figure()
121 |
122 | ### test gamma ,fixed coef0 with 0 ####
123 | gammas=np.logspace(-2,1)
124 | train_scores=[]
125 | test_scores=[]
126 |
127 | for gamma in gammas:
128 | cls=svm.SVC(kernel='sigmoid',gamma=gamma,coef0=0)
129 | cls.fit(X_train,y_train)
130 | train_scores.append(cls.score(X_train,y_train))
131 | test_scores.append(cls.score(X_test, y_test))
132 | ax=fig.add_subplot(1,2,1)
133 | ax.plot(gammas,train_scores,label="Training score ",marker='+' )
134 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
135 | ax.set_title( "SVC_sigmoid_gamma ")
136 | ax.set_xscale("log")
137 | ax.set_xlabel(r"$\gamma$")
138 | ax.set_ylabel("score")
139 | ax.set_ylim(0,1.05)
140 | ax.legend(loc="best",framealpha=0.5)
141 | ### test r,fixed gamma with 0.01 ######
142 | rs=np.linspace(0,5)
143 | train_scores=[]
144 | test_scores=[]
145 |
146 | for r in rs:
147 | cls=svm.SVC(kernel='sigmoid',coef0=r,gamma=0.01)
148 | cls.fit(X_train,y_train)
149 | train_scores.append(cls.score(X_train,y_train))
150 | test_scores.append(cls.score(X_test, y_test))
151 | ax=fig.add_subplot(1,2,2)
152 | ax.plot(rs,train_scores,label="Training score ",marker='+' )
153 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' )
154 | ax.set_title( "SVC_sigmoid_r ")
155 | ax.set_xlabel(r"r")
156 | ax.set_ylabel("score")
157 | ax.set_ylim(0,1.05)
158 | ax.legend(loc="best",framealpha=0.5)
159 | plt.show()
160 | if __name__=="__main__":
161 | X_train,X_test,y_train,y_test=load_data_classfication()
162 | test_SVC_linear(X_train,X_test,y_train,y_test)
163 | test_SVC_poly(X_train,X_test,y_train,y_test)
164 | test_SVC_rbf(X_train,X_test,y_train,y_test)
165 | test_SVC_sigmoid(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/7. Support Vector Machine/7.2 SVM-unliner_SVC.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, model_selection, svm  # cross_validation was replaced by model_selection
4 |
5 | def load_data_classfication():
6 | '''
7 | load iris data set
8 | :return: train_data,test_data, train_target, test_target
9 | '''
10 | iris=datasets.load_iris()
11 | X_train=iris.data
12 | y_train=iris.target
13 | return model_selection.train_test_split(X_train, y_train,test_size=0.25,
14 | random_state=0,stratify=y_train)
15 |
16 | def test_SVC_linear(*data):
17 | '''
18 | test method of SVC
19 | :param data: train_data,test_data, train_target, test_target
20 | :return: None
21 | '''
22 | X_train,X_test,y_train,y_test=data
23 | cls=svm.SVC(kernel='linear')
24 | cls.fit(X_train,y_train)
25 | print('Coefficients:{0}, intercept {1}'.format(cls.coef_,cls.intercept_))
26 | print('Score: {0}' .format(cls.score(X_test, y_test)))
27 | def test_SVC_poly(*data):
28 | '''
29 | test SVC with poly kernel and different degree, gamma, coef0
30 | :param data: train_data,test_data, train_target, test_target
31 | :return: None
32 | '''
33 | X_train,X_test,y_train,y_test=data
34 | fig=plt.figure()
35 | ### test degree ####
36 | degrees=range(1,20)
37 | train_scores=[]
38 | test_scores=[]
39 | for degree in degrees:
40 | cls=svm.SVC(kernel='poly',degree=degree)
41 | cls.fit(X_train,y_train)
42 | train_scores.append(cls.score(X_train,y_train))
43 | test_scores.append(cls.score(X_test, y_test))
44 | ax=fig.add_subplot(1,3,1)
45 | ax.plot(degrees,train_scores,label="Training score ",marker='+' )
46 | ax.plot(degrees,test_scores,label= " Testing score ",marker='o' )
47 | ax.set_title( "SVC_poly_degree ")
48 | ax.set_xlabel("p")
49 | ax.set_ylabel("score")
50 | ax.set_ylim(0,1.05)
51 | ax.legend(loc="best",framealpha=0.5)
52 |
53 | ### test gamma ,fix degree with 3####
54 | gammas=range(1,20)
55 | train_scores=[]
56 | test_scores=[]
57 | for gamma in gammas:
58 | cls=svm.SVC(kernel='poly',gamma=gamma,degree=3)
59 | cls.fit(X_train,y_train)
60 | train_scores.append(cls.score(X_train,y_train))
61 | test_scores.append(cls.score(X_test, y_test))
62 | ax=fig.add_subplot(1,3,2)
63 | ax.plot(gammas,train_scores,label="Training score ",marker='+' )
64 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
65 | ax.set_title( "SVC_poly_gamma ")
66 | ax.set_xlabel(r"$\gamma$")
67 | ax.set_ylabel("score")
68 | ax.set_ylim(0,1.05)
69 | ax.legend(loc="best",framealpha=0.5)
70 | ### test r ,fix gamma with 10 , degree fixed with 3######
71 | rs=range(0,20)
72 | train_scores=[]
73 | test_scores=[]
74 | for r in rs:
75 | cls=svm.SVC(kernel='poly',gamma=10,degree=3,coef0=r)
76 | cls.fit(X_train,y_train)
77 | train_scores.append(cls.score(X_train,y_train))
78 | test_scores.append(cls.score(X_test, y_test))
79 | ax=fig.add_subplot(1,3,3)
80 | ax.plot(rs,train_scores,label="Training score ",marker='+' )
81 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' )
82 | ax.set_title( "SVC_poly_r ")
83 | ax.set_xlabel(r"r")
84 | ax.set_ylabel("score")
85 | ax.set_ylim(0,1.05)
86 | ax.legend(loc="best",framealpha=0.5)
87 | plt.show()
88 | def test_SVC_rbf(*data):
89 | '''
90 | test SVC with gaussian kernel(rbf) and different gamma
91 | :param data: train_data,test_data, train_target, test_target
92 | :return: None
93 | '''
94 | X_train,X_test,y_train,y_test=data
95 | gammas=range(1,20)
96 | train_scores=[]
97 | test_scores=[]
98 | for gamma in gammas:
99 | cls=svm.SVC(kernel='rbf',gamma=gamma)
100 | cls.fit(X_train,y_train)
101 | train_scores.append(cls.score(X_train,y_train))
102 | test_scores.append(cls.score(X_test, y_test))
103 | fig=plt.figure()
104 | ax=fig.add_subplot(1,1,1)
105 | ax.plot(gammas,train_scores,label="Training score ",marker='+' )
106 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
107 | ax.set_title( "SVC_rbf")
108 | ax.set_xlabel(r"$\gamma$")
109 | ax.set_ylabel("score")
110 | ax.set_ylim(0,1.05)
111 | ax.legend(loc="best",framealpha=0.5)
112 | plt.show()
113 | def test_SVC_sigmoid(*data):
114 | '''
115 | test SVC with sigmoid kernel and different gamma and coef0
116 | :param data: train_data,test_data, train_target, test_target
117 | :return: None
118 | '''
119 | X_train,X_test,y_train,y_test=data
120 | fig=plt.figure()
121 |
122 | ### test gamma ,fix coef0 with 0 ####
123 | gammas=np.logspace(-2,1)
124 | train_scores=[]
125 | test_scores=[]
126 |
127 | for gamma in gammas:
128 | cls=svm.SVC(kernel='sigmoid',gamma=gamma,coef0=0)
129 | cls.fit(X_train,y_train)
130 | train_scores.append(cls.score(X_train,y_train))
131 | test_scores.append(cls.score(X_test, y_test))
132 | ax=fig.add_subplot(1,2,1)
133 | ax.plot(gammas,train_scores,label="Training score ",marker='+' )
134 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
135 | ax.set_title( "SVC_sigmoid_gamma ")
136 | ax.set_xscale("log")
137 | ax.set_xlabel(r"$\gamma$")
138 | ax.set_ylabel("score")
139 | ax.set_ylim(0,1.05)
140 | ax.legend(loc="best",framealpha=0.5)
141 | ### test r,fix gamma with 0.01 ######
142 | rs=np.linspace(0,5)
143 | train_scores=[]
144 | test_scores=[]
145 |
146 | for r in rs:
147 | cls=svm.SVC(kernel='sigmoid',coef0=r,gamma=0.01)
148 | cls.fit(X_train,y_train)
149 | train_scores.append(cls.score(X_train,y_train))
150 | test_scores.append(cls.score(X_test, y_test))
151 | ax=fig.add_subplot(1,2,2)
152 | ax.plot(rs,train_scores,label="Training score ",marker='+' )
153 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' )
154 | ax.set_title( "SVC_sigmoid_r ")
155 | ax.set_xlabel(r"r")
156 | ax.set_ylabel("score")
157 | ax.set_ylim(0,1.05)
158 | ax.legend(loc="best",framealpha=0.5)
159 | plt.show()
160 | if __name__=="__main__":
161 | X_train,X_test,y_train,y_test=load_data_classfication()
162 | test_SVC_linear(X_train,X_test,y_train,y_test)
163 | test_SVC_poly(X_train,X_test,y_train,y_test)
164 | test_SVC_rbf(X_train,X_test,y_train,y_test)
165 | test_SVC_sigmoid(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/7. Support Vector Machine/7.3 liner_SVR.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, model_selection, svm  # cross_validation was replaced by model_selection
4 | def load_data_regression():
5 | '''
6 | load dataset for regression
7 | :return: train_data,test_data, train_target, test_target
8 | '''
9 | diabetes = datasets.load_diabetes()
10 | return model_selection.train_test_split(diabetes.data,diabetes.target,
11 | test_size=0.25,random_state=0)
12 |
13 | def test_LinearSVR(*data):
14 | '''
15 | test Liner SVR
16 | :param data: train_data,test_data, train_target, test_target
17 | :return: None
18 | '''
19 | X_train,X_test,y_train,y_test=data
20 | regr=svm.LinearSVR()
21 | regr.fit(X_train,y_train)
22 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_))
23 | print('Score: {0}' .format(regr.score(X_test, y_test)))
24 | def test_LinearSVR_loss(*data):
25 | '''
26 | test SVr with different loss function
27 | :param data: train_data,test_data, train_target, test_target
28 | :return:
29 | '''
30 | X_train,X_test,y_train,y_test=data
31 | losses=['epsilon_insensitive','squared_epsilon_insensitive']
32 | for loss in losses:
33 | regr=svm.LinearSVR(loss=loss)
34 | regr.fit(X_train,y_train)
35 | print("loss:{0}".format(loss))
36 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_))
37 | print('Score: {0}' .format(regr.score(X_test, y_test)))
38 | def test_LinearSVR_epsilon(*data):
39 | '''
40 | test the performance with different epsilon
41 | :param data: train_data,test_data, train_target, test_target
42 | :return: None
43 | '''
44 | X_train,X_test,y_train,y_test=data
45 | epsilons=np.logspace(-2,2)
46 | train_scores=[]
47 | test_scores=[]
48 | for epsilon in epsilons:
49 | regr=svm.LinearSVR(epsilon=epsilon,loss='squared_epsilon_insensitive')
50 | regr.fit(X_train,y_train)
51 | train_scores.append(regr.score(X_train, y_train))
52 | test_scores.append(regr.score(X_test, y_test))
53 | fig=plt.figure()
54 | ax=fig.add_subplot(1,1,1)
55 | ax.plot(epsilons,train_scores,label="Training score ",marker='+' )
56 | ax.plot(epsilons,test_scores,label= " Testing score ",marker='o' )
57 | ax.set_title( "LinearSVR_epsilon ")
58 | ax.set_xscale("log")
59 | ax.set_xlabel(r"$\epsilon$")
60 | ax.set_ylabel("score")
61 | ax.set_ylim(-1,1.05)
62 | ax.legend(loc="best",framealpha=0.5)
63 | plt.show()
64 | def test_LinearSVR_C(*data):
65 | '''
66 | test the performance with different C
67 | :param data: train_data,test_data, train_target, test_target
68 | :return: None
69 | '''
70 | X_train,X_test,y_train,y_test=data
71 | Cs=np.logspace(-1,2)
72 | train_scores=[]
73 | test_scores=[]
74 | for C in Cs:
75 | regr=svm.LinearSVR(epsilon=0.1,loss='squared_epsilon_insensitive',C=C)
76 | regr.fit(X_train,y_train)
77 | train_scores.append(regr.score(X_train, y_train))
78 | test_scores.append(regr.score(X_test, y_test))
79 | fig=plt.figure()
80 | ax=fig.add_subplot(1,1,1)
81 | ax.plot(Cs,train_scores,label="Training score ",marker='+' )
82 | ax.plot(Cs,test_scores,label= " Testing score ",marker='o' )
83 | ax.set_title( "LinearSVR_C ")
84 | ax.set_xscale("log")
85 | ax.set_xlabel(r"C")
86 | ax.set_ylabel("score")
87 | ax.set_ylim(-1,1.05)
88 | ax.legend(loc="best",framealpha=0.5)
89 | plt.show()
90 | if __name__=="__main__":
91 | X_train,X_test,y_train,y_test=load_data_regression()
92 | test_LinearSVR(X_train,X_test,y_train,y_test)
93 | test_LinearSVR_loss(X_train,X_test,y_train,y_test)
94 | test_LinearSVR_epsilon(X_train,X_test,y_train,y_test)
95 | test_LinearSVR_C(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/7. Support Vector Machine/7.4 unliner_SVR.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, model_selection, svm  # cross_validation was replaced by model_selection
4 | def load_data_regression():
5 | '''
6 | load dataset for regression
7 | :return: train_data,test_data, train_target, test_target
8 | '''
9 | diabetes = datasets.load_diabetes()
10 | return model_selection.train_test_split(diabetes.data,diabetes.target,
11 | test_size=0.25,random_state=0)
12 |
13 | def test_SVR_linear(*data):
14 | '''
15 | test SVR with liner kernel
16 | :param data: train_data,test_data, train_target, test_target
17 | :return: None
18 | '''
19 | X_train,X_test,y_train,y_test=data
20 | regr=svm.SVR(kernel='linear')
21 | regr.fit(X_train,y_train)
22 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_))
23 | print('Score: {0}' .format(regr.score(X_test, y_test)))
24 |
25 | def test_SVR_poly(*data):
26 | '''
27 | test SVR with poly kernel, and different degree, gamma, coef0
28 | :param data: train_data,test_data, train_target, test_target
29 | :return: None
30 | '''
31 | X_train,X_test,y_train,y_test=data
32 | fig=plt.figure()
33 | ### test degree ####
34 | degrees=range(1,20)
35 | train_scores=[]
36 | test_scores=[]
37 | for degree in degrees:
38 | regr=svm.SVR(kernel='poly',degree=degree,coef0=1)
39 | regr.fit(X_train,y_train)
40 | train_scores.append(regr.score(X_train,y_train))
41 | test_scores.append(regr.score(X_test, y_test))
42 | ax=fig.add_subplot(1,3,1)
43 | ax.plot(degrees,train_scores,label="Training score ",marker='+' )
44 | ax.plot(degrees,test_scores,label= " Testing score ",marker='o' )
45 | ax.set_title( "SVR_poly_degree r=1")
46 | ax.set_xlabel("p")
47 | ax.set_ylabel("score")
48 | ax.set_ylim(-1,1.)
49 | ax.legend(loc="best",framealpha=0.5)
50 |
51 | ### test gamma,fix degree with 3, fix coef0 with 1 ####
52 | gammas=range(1,40)
53 | train_scores=[]
54 | test_scores=[]
55 | for gamma in gammas:
56 | regr=svm.SVR(kernel='poly',gamma=gamma,degree=3,coef0=1)
57 | regr.fit(X_train,y_train)
58 | train_scores.append(regr.score(X_train,y_train))
59 | test_scores.append(regr.score(X_test, y_test))
60 | ax=fig.add_subplot(1,3,2)
61 | ax.plot(gammas,train_scores,label="Training score ",marker='+' )
62 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
63 | ax.set_title( "SVR_poly_gamma r=1")
64 | ax.set_xlabel(r"$\gamma$")
65 | ax.set_ylabel("score")
66 | ax.set_ylim(-1,1)
67 | ax.legend(loc="best",framealpha=0.5)
68 | ### test r,fix gamma with 20,fix degree with 3 ######
69 | rs=range(0,20)
70 | train_scores=[]
71 | test_scores=[]
72 | for r in rs:
73 | regr=svm.SVR(kernel='poly',gamma=20,degree=3,coef0=r)
74 | regr.fit(X_train,y_train)
75 | train_scores.append(regr.score(X_train,y_train))
76 | test_scores.append(regr.score(X_test, y_test))
77 | ax=fig.add_subplot(1,3,3)
78 | ax.plot(rs,train_scores,label="Training score ",marker='+' )
79 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' )
80 | ax.set_title( "SVR_poly_r gamma=20 degree=3")
81 | ax.set_xlabel(r"r")
82 | ax.set_ylabel("score")
83 | ax.set_ylim(-1,1.)
84 | ax.legend(loc="best",framealpha=0.5)
85 | plt.show()
86 | def test_SVR_rbf(*data):
87 | '''
88 | test SVR with RBF kernel and different gamma
89 | :param data: train_data,test_data, train_target, test_target
90 | :return: None
91 | '''
92 | X_train,X_test,y_train,y_test=data
93 | gammas=range(1,20)
94 | train_scores=[]
95 | test_scores=[]
96 | for gamma in gammas:
97 | regr=svm.SVR(kernel='rbf',gamma=gamma)
98 | regr.fit(X_train,y_train)
99 | train_scores.append(regr.score(X_train,y_train))
100 | test_scores.append(regr.score(X_test, y_test))
101 | fig=plt.figure()
102 | ax=fig.add_subplot(1,1,1)
103 | ax.plot(gammas,train_scores,label="Training score ",marker='+' )
104 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
105 | ax.set_title( "SVR_rbf")
106 | ax.set_xlabel(r"$\gamma$")
107 | ax.set_ylabel("score")
108 | ax.set_ylim(-1,1)
109 | ax.legend(loc="best",framealpha=0.5)
110 | plt.show()
111 | def test_SVR_sigmoid(*data):
112 | '''
113 | test SVR with sigmoid kernel and different gamma, coef0
114 | :param data: train_data,test_data, train_target, test_target
115 | :return: None
116 | '''
117 | X_train,X_test,y_train,y_test=data
118 | fig=plt.figure()
119 |
120 | ### test gammam,fix coef0 with 0.01 ####
121 | gammas=np.logspace(-1,3)
122 | train_scores=[]
123 | test_scores=[]
124 |
125 | for gamma in gammas:
126 | regr=svm.SVR(kernel='sigmoid',gamma=gamma,coef0=0.01)
127 | regr.fit(X_train,y_train)
128 | train_scores.append(regr.score(X_train,y_train))
129 | test_scores.append(regr.score(X_test, y_test))
130 | ax=fig.add_subplot(1,2,1)
131 | ax.plot(gammas,train_scores,label="Training score ",marker='+' )
132 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' )
133 | ax.set_title( "SVR_sigmoid_gamma r=0.01")
134 | ax.set_xscale("log")
135 | ax.set_xlabel(r"$\gamma$")
136 | ax.set_ylabel("score")
137 | ax.set_ylim(-1,1)
138 | ax.legend(loc="best",framealpha=0.5)
139 | ### test r ,fix gamma with 10 ######
140 | rs=np.linspace(0,5)
141 | train_scores=[]
142 | test_scores=[]
143 |
144 | for r in rs:
145 | regr=svm.SVR(kernel='sigmoid',coef0=r,gamma=10)
146 | regr.fit(X_train,y_train)
147 | train_scores.append(regr.score(X_train,y_train))
148 | test_scores.append(regr.score(X_test, y_test))
149 | ax=fig.add_subplot(1,2,2)
150 | ax.plot(rs,train_scores,label="Training score ",marker='+' )
151 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' )
152 | ax.set_title( "SVR_sigmoid_r gamma=10")
153 | ax.set_xlabel(r"r")
154 | ax.set_ylabel("score")
155 | ax.set_ylim(-1,1)
156 | ax.legend(loc="best",framealpha=0.5)
157 | plt.show()
158 | if __name__=="__main__":
159 | X_train,X_test,y_train,y_test=load_data_regression()
160 | test_SVR_linear(X_train,X_test,y_train,y_test)
161 | test_SVR_poly(X_train,X_test,y_train,y_test)
162 | test_SVR_rbf(X_train,X_test,y_train,y_test)
163 | test_SVR_sigmoid(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/7. Support Vector Machine/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Overview
3 |
4 | With the kernel trick, the Support Vector Machine (SVM) can perform non-linear classification. The model is defined so that the classification margin in the feature space is maximized; the separation is done by a hyperplane, and the objective is the maximum margin subject to the KKT conditions, or equivalently the minimum of its reciprocal (written out below).
5 |
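For reference, the soft-margin primal problem behind this margin-maximization idea, in its standard form (added here only for clarity):

$$
\min_{w,b,\xi}\;\frac{1}{2}\lVert w\rVert^{2}+C\sum_{i=1}^{n}\xi_i
\quad\text{s.t.}\quad y_i\,(w^{\top}x_i+b)\ge 1-\xi_i,\;\;\xi_i\ge 0 .
$$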
6 | # Related concepts:
7 |
8 | Decision function, KKT conditions, dual problem, Lagrangian, penalty parameter.
9 |
10 | # Common kernel functions:
11 |
12 | Polynomial kernel, Gaussian (RBF) kernel, sigmoid kernel, and so on.
13 |
14 | # Support vector regression
15 |
16 | In Support Vector Regression (SVR) the loss is an L1-type norm, but a tolerance parameter is usually set so that the loss only counts when the deviation exceeds that parameter (see the sketch below).
17 |
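A minimal sketch of that epsilon-insensitive idea (my own illustration, not repository code): deviations smaller than epsilon cost nothing, larger deviations are charged linearly.

```python
import numpy as np

def epsilon_insensitive_loss(y_true, y_pred, epsilon=0.1):
    """Zero loss inside the epsilon tube, linear loss outside of it."""
    residual = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    return np.maximum(0.0, residual - epsilon)

print(epsilon_insensitive_loss([1.0, 2.0, 3.0], [1.05, 2.5, 2.0]))  # -> [0.  0.4 0.9]
```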
18 | # Pros and cons of SVM:
19 |
20 | Pros: it can solve non-linear optimization problems, and it avoids the structure-selection and local-minimum problems of neural networks.
21 |
22 | Cons: it is sensitive to missing data; for non-linear problems it depends heavily on the choice of kernel and there is no universal recipe. The mainstream algorithms have time complexity around O(n^2), so large-scale data requires enormous computation. The results also depend strongly on the hyperparameters (for example, the RBF kernel's gamma and the penalty term C).
23 |
24 | # Practice code: GitHub
25 |
26 | 1. SVM linear classification - SVC
27 |
28 | https://github.com/JasonK93/ML-note/blob/master/7.%20Support%20Vector%20Machine/7.1%20SVM-liner_SVC.py
29 |
30 | 2. SVM non-linear classification - SVC
31 |
32 | https://github.com/JasonK93/ML-note/blob/master/7.%20Support%20Vector%20Machine/7.2%20SVM-unliner_SVC.py
33 |
34 | 3. SVM linear regression - SVR
35 |
36 | https://github.com/JasonK93/ML-note/blob/master/7.%20Support%20Vector%20Machine/7.3%20liner_SVR.py
37 |
38 | 4. SVM non-linear regression - SVR
39 |
40 | https://github.com/JasonK93/ML-note/blob/master/7.%20Support%20Vector%20Machine/7.4%20unliner_SVR.py
41 |
42 |
--------------------------------------------------------------------------------
/8. Artificial Neural Network/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Inspired by biology, an artificial neural network is composed of a collection of simple units. ANN is a huge branch of machine learning with hundreds of major algorithms, for example the Perceptron Neural Network, back-propagation (BP) networks, the Hopfield Neural Network, the Self-Organizing Map (SOM), Learning Vector Quantization (LVQ), and so on.
4 |
5 | # Related concepts
6 |
7 | Perceptron, loss function, dual form, multilayer perceptron, feed-forward neural network, back-propagation network, activation function, hidden layer, learning rate, convergence speed.
8 |
9 | # Practice code: GitHub
10 |
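The practice-code section of this chapter is still empty (test.py is only a placeholder), so the following is a minimal, assumed sketch of a feed-forward network trained with back-propagation, using scikit-learn's `MLPClassifier` on the iris data.

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# one hidden layer with 10 units; weights are learned by back-propagating the log-loss gradient
clf = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=0)
clf.fit(X_train, y_train)
print('Test accuracy: {0}'.format(clf.score(X_test, y_test)))
```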
--------------------------------------------------------------------------------
/8. Artificial Neural Network/test.py:
--------------------------------------------------------------------------------
1 | # Placeholder for ANN example code (to be added).
--------------------------------------------------------------------------------
/9. Semi-Supervised Learning/9.1 labelpropogation.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn import metrics
5 | from sklearn import datasets
6 | from sklearn.semi_supervised import LabelPropagation
7 |
8 | def load_data():
9 | '''
10 | load digit data set
11 | :return: data( have target), data_target, data( not have target)
12 | '''
13 | digits = datasets.load_digits()
14 | ###### shuffle ########
15 | rng = np.random.RandomState(0)
16 | indices = np.arange(len(digits.data))
17 | rng.shuffle(indices)
18 | X = digits.data[indices]
19 | y = digits.target[indices]
20 |
21 | n_labeled_points = int(len(y)/10)
22 | unlabeled_indices = np.arange(len(y))[n_labeled_points:]
23 |
24 | return X,y,unlabeled_indices
25 |
26 | def test_LabelPropagation(*data):
27 | '''
28 | test the usage of LabelPropagation
29 | :param data: a tuple of (sample set, sample labels, indices of the unlabeled samples)
30 | :return: None
31 | '''
32 | X,y,unlabeled_indices=data
33 | y_train=np.copy(y) # must copy: the original y is needed later for evaluation
34 | y_train[unlabeled_indices]=-1 # unlabeled samples are marked with -1
35 | clf=LabelPropagation(max_iter=100,kernel='rbf',gamma=0.1)
36 | clf.fit(X,y_train)
37 | ### compute the prediction accuracy
38 | predicted_labels = clf.transduction_[unlabeled_indices] # predicted labels
39 | true_labels = y[unlabeled_indices] # true labels
40 | print("Accuracy:%f"%metrics.accuracy_score(true_labels,predicted_labels))
41 | # alternatively: print("Accuracy:%f"%clf.score(X[unlabeled_indices],true_labels))
42 | def test_LabelPropagation_rbf(*data):
43 | '''
44 | test LabelPropagation with rbf kernel, and different alpha and gamma
45 | :param data: data( have target), data_target, data( not have target)
46 | :return: None
47 | '''
48 | X,y,unlabeled_indices=data
49 | y_train=np.copy(y)
50 | y_train[unlabeled_indices]=-1
51 |
52 | fig=plt.figure()
53 | ax=fig.add_subplot(1,1,1)
54 | alphas=np.linspace(0.01,1,num=10,endpoint=True)
55 | gammas=np.logspace(-2,2,num=50)
56 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
57 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
58 | ## train and graph
59 | for alpha,color in zip(alphas,colors):
60 | scores=[]
61 | for gamma in gammas:
62 | clf=LabelPropagation(max_iter=100,gamma=gamma,alpha=alpha,kernel='rbf')
63 | clf.fit(X,y_train)
64 | scores.append(clf.score(X[unlabeled_indices],y[unlabeled_indices]))
65 | ax.plot(gammas,scores,label=r"$\alpha=%s$"%alpha,color=color)
66 |
67 |
68 | ax.set_xlabel(r"$\gamma$")
69 | ax.set_ylabel("score")
70 | ax.set_xscale("log")
71 | ax.legend(loc="best")
72 | ax.set_title("LabelPropagation rbf kernel")
73 | plt.show()
74 | def test_LabelPropagation_knn(*data):
75 | '''
76 | test LabelPropagation with knn kernel, and different alpha , n_neighbors
77 | :param data: data( have target), data_target, data( not have target)
78 | :return: None
79 | '''
80 | X,y,unlabeled_indices=data
81 | y_train=np.copy(y)
82 | y_train[unlabeled_indices]=-1
83 |
84 | fig=plt.figure()
85 | ax=fig.add_subplot(1,1,1)
86 | alphas=np.linspace(0.01,1,num=10,endpoint=True)
87 | Ks=[1,2,3,4,5,8,10,15,20,25,30,35,40,50]
88 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
89 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
90 |
91 | for alpha,color in zip(alphas,colors):
92 | scores=[]
93 | for K in Ks:
94 | clf=LabelPropagation(max_iter=100,n_neighbors=K,alpha=alpha,kernel='knn')
95 | clf.fit(X,y_train)
96 | scores.append(clf.score(X[unlabeled_indices],y[unlabeled_indices]))
97 | ax.plot(Ks,scores,label=r"$\alpha=%s$"%alpha,color=color)
98 |
99 |
100 | ax.set_xlabel(r"$k$")
101 | ax.set_ylabel("score")
102 | ax.legend(loc="best")
103 | ax.set_title("LabelPropagation knn kernel")
104 | plt.show()
105 | if __name__=='__main__':
106 | data=load_data()
107 | test_LabelPropagation(*data)
108 | test_LabelPropagation_rbf(*data)
109 | test_LabelPropagation_knn(*data)
--------------------------------------------------------------------------------
/9. Semi-Supervised Learning/9.2LabelSpreading.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn import metrics
4 | from sklearn import datasets
5 | from sklearn.semi_supervised import LabelSpreading
6 |
7 | def load_data():
8 | '''
9 | load data
10 | :return: data( have target), data_target, data( not have target)
11 | '''
12 | digits = datasets.load_digits()
13 |
14 | rng = np.random.RandomState(0)
15 | indices = np.arange(len(digits.data))
16 | rng.shuffle(indices)
17 | X = digits.data[indices]
18 | y = digits.target[indices]
19 |
20 | n_labeled_points = int(len(y)/10)
21 | unlabeled_indices = np.arange(len(y))[n_labeled_points:]
22 |
23 | return X,y,unlabeled_indices
24 |
25 | def test_LabelSpreading(*data):
26 | '''
27 | test LabelSpreading
28 | :param data: data( have target), data_target, data( not have target)
29 | :return: None
30 | '''
31 | X,y,unlabeled_indices=data
32 | y_train=np.copy(y)
33 | y_train[unlabeled_indices]=-1
34 | clf=LabelSpreading(max_iter=100,kernel='rbf',gamma=0.1)
35 | clf.fit(X,y_train)
36 |
37 | predicted_labels = clf.transduction_[unlabeled_indices]
38 | true_labels = y[unlabeled_indices]
39 | print("Accuracy:%f"%metrics.accuracy_score(true_labels,predicted_labels))
40 |
41 | def test_LabelSpreading_rbf(*data):
42 | '''
43 | test LabelSpreading with rbf kernel and different alpha, gamma
44 | :param data: data( have target), data_target, data( not have target)
45 | :return: None
46 | '''
47 | X,y,unlabeled_indices=data
48 | y_train=np.copy(y)
49 | y_train[unlabeled_indices]=-1
50 |
51 | fig=plt.figure()
52 | ax=fig.add_subplot(1,1,1)
53 | alphas=np.linspace(0.01,1,num=10,endpoint=True)
54 | gammas=np.logspace(-2,2,num=50)
55 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
56 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
57 |
58 | for alpha,color in zip(alphas,colors):
59 | scores=[]
60 | for gamma in gammas:
61 | clf=LabelSpreading(max_iter=100,gamma=gamma,alpha=alpha,kernel='rbf')
62 | clf.fit(X,y_train)
63 | scores.append(clf.score(X[unlabeled_indices],y[unlabeled_indices]))
64 | ax.plot(gammas,scores,label=r"$\alpha=%s$"%alpha,color=color)
65 |
66 |
67 | ax.set_xlabel(r"$\gamma$")
68 | ax.set_ylabel("score")
69 | ax.set_xscale("log")
70 | ax.legend(loc="best")
71 | ax.set_title("LabelSpreading rbf kernel")
72 | plt.show()
73 | def test_LabelSpreading_knn(*data):
74 | '''
75 | test LabelSpreading with knn kernel, and different alpha , n_neighbors
76 | :param data: data( have target), data_target, data( not have target)
77 | :return: None
78 | '''
79 | X,y,unlabeled_indices=data
80 | y_train=np.copy(y)
81 | y_train[unlabeled_indices]=-1
82 |
83 | fig=plt.figure()
84 | ax=fig.add_subplot(1,1,1)
85 | alphas=np.linspace(0.01,1,num=10,endpoint=True)
86 | Ks=[1,2,3,4,5,8,10,15,20,25,30,35,40,50]
87 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
88 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
89 |
90 | for alpha,color in zip(alphas,colors):
91 | scores=[]
92 | for K in Ks:
93 | clf=LabelSpreading(kernel='knn',max_iter=100,n_neighbors=K,alpha=alpha)
94 | clf.fit(X,y_train)
95 | scores.append(clf.score(X[unlabeled_indices],y[unlabeled_indices]))
96 | ax.plot(Ks,scores,label=r"$\alpha=%s$"%alpha,color=color)
97 |
98 |
99 | ax.set_xlabel(r"$k$")
100 | ax.set_ylabel("score")
101 | ax.legend(loc="best")
102 | ax.set_title("LabelSpreading knn kernel")
103 | plt.show()
104 | if __name__=='__main__':
105 | data=load_data()
106 | test_LabelSpreading(*data)
107 | test_LabelSpreading_rbf(*data)
108 | test_LabelSpreading_knn(*data)
--------------------------------------------------------------------------------
/9. Semi-Supervised Learning/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Semi-supervised learning uses both labeled and unlabeled data together to build a suitable classification model.
4 |
5 | # Methods
6 |
7 | Generative semi-supervised methods, for example those based on generative Gaussian mixture models; graph-based semi-supervised learning; and so on (see the sketch below).
8 |
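As a sketch of the convention used by the graph-based estimators in scikit-learn (both scripts in this folder follow it), unlabeled samples are marked with -1 before fitting; the cutoff of 100 labeled samples below is an arbitrary assumption for illustration.

```python
import numpy as np
from sklearn.datasets import load_digits
from sklearn.semi_supervised import LabelSpreading

X, y = load_digits(return_X_y=True)
y_train = np.copy(y)
y_train[100:] = -1   # keep only the first 100 labels; -1 marks a sample as unlabeled

clf = LabelSpreading(kernel='knn', n_neighbors=7, max_iter=100)
clf.fit(X, y_train)
print('Accuracy on the unlabeled part: {0}'.format(clf.score(X[100:], y[100:])))
```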
9 | # Notes
10 |
11 | For semi-supervised learning to be worthwhile, it must first outperform using only the labeled part of the data. At the same time, although its goal is better generalization, an improvement is not guaranteed: semi-supervised learning relies on using domain knowledge well when designing the model.
12 |
13 | # Practice code: GitHub
14 |
15 | 1. LabelPropagation:
16 |
17 | https://github.com/JasonK93/ML-note/blob/master/9.%20Semi-Supervised%20Learning/9.1%20labelpropogation.py
18 |
19 | 2. LabelSpreading:
20 |
21 | https://github.com/JasonK93/ML-note/blob/master/9.%20Semi-Supervised%20Learning/9.2LabelSpreading.py
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ML-note
2 | ML Learning
3 |
4 | This repo includes implementations of common Machine Learning methods. The scripts can be run directly once the dependencies are installed. A short description is provided in each directory.
5 |
6 |
--------------------------------------------------------------------------------