├── .idea
│   ├── ML note.iml
│   ├── dictionaries
│   │   └── Vam.xml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── 1.liner model
│   ├── 1.basic liner model.py
│   ├── 2.ridge analysis.py
│   ├── 3.Lasso regression.py
│   ├── 4. ElasticNet.py
│   ├── 5.logistic regression.py
│   ├── 6.LDA.py
│   └── README.md
├── 10.Ensemble
│   ├── 10.1 Adaboost classifer.py
│   ├── 10.2 adaboost regression.py
│   ├── 10.3 RF_classifier.py
│   ├── 10.4 RF_regression.py
│   ├── 10.5 Gradient_Classifier.py
│   ├── 10.6 Gradient_regresion.py
│   └── README.md
├── 11. Preprocessing
│   ├── 11.1 Binarize.py
│   ├── 11.2 One-hot encoder.py
│   ├── 11.3 normalize.py
│   ├── 11.4 standardize.py
│   ├── 11.5 feature_seleticon_filter.py
│   ├── 11.6 feature_selection_bagging.py
│   ├── 11.7 feature_selection_embeded.py
│   ├── 11.8 pipeline.py
│   ├── 11.9 dictionary learning.py
│   └── README.md
├── 12. Model evaluation
│   ├── 12.1 Loss function.py
│   ├── 12.2 data split.py
│   ├── 12.3 validation_curve.py
│   ├── 12.4 grid_search.py
│   ├── 12.5 classification_metrics.py
│   ├── 12.6 learning curve.py
│   ├── 12.7 regression_metrics.py
│   └── README.md
├── 2.decision tree(DT)
│   ├── 2.1 Decision Tree-Classifier.py
│   ├── 2.2 Decision Tree- Regression.py
│   └── README.md
├── 3.Bayes
│   ├── 3.1 Gaussian Bayes.py
│   ├── 3.2 Multinomial NB.py
│   ├── 3.3 Bernoulli NB.py
│   └── README.md
├── 4. KNN
│   ├── 4.1 KNN classification.py
│   ├── 4.2 KNN regressor.py
│   └── README.md
├── 5.Dimension_Reduction
│   ├── 5.1 PCA.py
│   ├── 5.2 KPCA.py
│   ├── 5.3 MDS.py
│   ├── 5.4 Isomap.py
│   ├── 5.5 LLE.py
│   └── README.md
├── 6. Clustering
│   ├── 6.1 Kmeans.py
│   ├── 6.2 DBSCAN.py
│   ├── 6.3 Agglomerative Clustering.py
│   ├── 6.4 GaussianMixture.py
│   └── README.md
├── 7. Support Vector Machine
│   ├── 7.1 SVM-liner_SVC.py
│   ├── 7.2 SVM-unliner_SVC.py
│   ├── 7.3 liner_SVR.py
│   ├── 7.4 unliner_SVR.py
│   └── README.md
├── 8. Artificial Neural Network
│   ├── README.md
│   └── test.py
├── 9. Semi-Supervised Learning
│   ├── 9.1 labelpropogation.py
│   ├── 9.2LabelSpreading.py
│   └── README.md
└── README.md
/1.liner model/1.basic liner model.py:
--------------------------------------------------------------------------------
1 | # import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, linear_model, discriminant_analysis, cross_validation
4 | 
5 | 
6 | '''
7 | load_data : get the diabetes data from the pkg of sklearn
8 | return:
9 |     1 array for the regression problem.
10 |     train_data, test_data, train_value, test_value
11 | '''
12 | 
13 | def load_data():
14 | 
15 |     diabetes = datasets.load_diabetes()
16 |     return cross_validation.train_test_split(diabetes.data,diabetes.target,
17 |         test_size=0.25,random_state=0)
18 | 
19 | 
20 | '''
21 | test_LinearRegression: the code that trains and evaluates the model
22 | param data: *data holds train_data, test_data, train_value, test_value
23 | Return: None
24 | '''
25 | def test_LinearRegression(*data):
26 | 
27 |     X_train,X_test,y_train,y_test=data
28 |     regr = linear_model.LinearRegression()
29 |     regr.fit(X_train, y_train)
30 |     print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_))
31 |     print("Residual sum of squares: {0}".format(np.mean((regr.predict(X_test) - y_test) ** 2)))
32 |     print('Score: {0}'.format(regr.score(X_test, y_test)))
33 | # the main function
34 | if __name__=='__main__':
35 |     X_train,X_test,y_train,y_test=load_data()
36 |     test_LinearRegression(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/1.liner model/2.ridge analysis.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, linear_model,cross_validation
4 | 
5 | 
6 | def load_data():
7 |     '''
8 |     load for the dataset
9 |     return:
10 |     1 array for the regression problem.
11 | train_data, test_data, train_value, test_value 12 | ''' 13 | 14 | diabetes = datasets.load_diabetes() 15 | return cross_validation.train_test_split(diabetes.data,diabetes.target, 16 | test_size=0.25,random_state=0) 17 | 18 | def test_Ridge(*data): 19 | ''' 20 | test the ridge analysis 21 | :param data: train_data, test_data, train_value, test_value 22 | :return: None 23 | ''' 24 | 25 | X_train,X_test,y_train,y_test=data 26 | regr = linear_model.Ridge() 27 | regr.fit(X_train, y_train) 28 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_)) 29 | print("Residual sum of squares: {0}".format(np.mean((regr.predict(X_test) - y_test) ** 2))) 30 | print('Score: {0}' .format(regr.score(X_test, y_test))) 31 | def test_Ridge_alpha(*data): 32 | ''' 33 | test the score with different alpha param 34 | :param data: train_data, test_data, train_value, test_value 35 | :return: None 36 | ''' 37 | 38 | X_train,X_test,y_train,y_test=data 39 | alphas=[0.01,0.02,0.05,0.1,0.2,0.5,1,2,5,10,20,50,100,200,500,1000] 40 | ''' 41 | actually, smaller alpha means a better score. But consider the calculation power, we need to trade off. 42 | ''' 43 | scores=[] 44 | for i,alpha in enumerate(alphas): 45 | regr = linear_model.Ridge(alpha=alpha) 46 | regr.fit(X_train, y_train) 47 | scores.append(regr.score(X_test, y_test)) 48 | ## graph 49 | fig=plt.figure() 50 | ax=fig.add_subplot(1,1,1) 51 | ax.plot(alphas,scores) 52 | ax.set_xlabel(r"$\alpha$") 53 | ax.set_ylabel(r"score") 54 | ax.set_xscale('log') 55 | ax.set_title("Ridge") 56 | plt.show() 57 | if __name__=='__main__': 58 | X_train,X_test,y_train,y_test=load_data() 59 | test_Ridge(X_train,X_test,y_train,y_test) 60 | test_Ridge_alpha(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /1.liner model/3.Lasso regression.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn import datasets, linear_model,cross_validation 4 | def load_data(): 5 | ''' 6 | load for the dataset 7 | return: 8 | 1 array for the regression problem. 
9 | train_data, test_data, train_value, test_value 10 | ''' 11 | diabetes = datasets.load_diabetes() 12 | return cross_validation.train_test_split(diabetes.data,diabetes.target, 13 | test_size=0.25,random_state=0) 14 | def test_Lasso(*data): 15 | ''' 16 | test for lasso 17 | :param data: train_data, test_data, train_value, test_value 18 | :return: None 19 | ''' 20 | 21 | X_train,X_test,y_train,y_test=data 22 | regr = linear_model.Lasso() 23 | regr.fit(X_train, y_train) 24 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_, regr.intercept_)) 25 | print("Residual sum of squares: {0}".format(np.mean((regr.predict(X_test) - y_test) ** 2))) 26 | print('Score: {0}'.format(regr.score(X_test, y_test))) 27 | def test_Lasso_alpha(*data): 28 | ''' 29 | test the score with different alpha 30 | :param data: train_data, test_data, train_value, test_value 31 | :return: None 32 | ''' 33 | 34 | X_train,X_test,y_train,y_test=data 35 | alphas=[0.01,0.02,0.05,0.1,0.2,0.5,1,2,5,10,20,50,100,200,500,1000] 36 | scores=[] 37 | for i,alpha in enumerate(alphas): 38 | regr = linear_model.Lasso(alpha=alpha) 39 | regr.fit(X_train, y_train) 40 | scores.append(regr.score(X_test, y_test)) 41 | ## graph 42 | fig=plt.figure() 43 | ax=fig.add_subplot(1,1,1) 44 | ax.plot(alphas,scores) 45 | ax.set_xlabel(r"$\alpha$") 46 | ax.set_ylabel(r"score") 47 | ax.set_xscale('log') 48 | ax.set_title("Lasso") 49 | plt.show() 50 | if __name__=='__main__': 51 | X_train,X_test,y_train,y_test=load_data() 52 | test_Lasso(X_train,X_test,y_train,y_test) 53 | test_Lasso_alpha(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /1.liner model/4. ElasticNet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from sklearn import datasets, linear_model,cross_validation 5 | 6 | def load_data(): 7 | ''' 8 | load for the dataset 9 | return: 10 | 1 array for the regression problem. 
11 | train_data, test_data, train_value, test_value 12 | ''' 13 | diabetes = datasets.load_diabetes() 14 | return cross_validation.train_test_split(diabetes.data,diabetes.target, 15 | test_size=0.25,random_state=0) 16 | 17 | def test_ElasticNet(*data): 18 | ''' 19 | test for Elastic Net 20 | :param data: train_data, test_data, train_value, test_value 21 | :return: None 22 | ''' 23 | X_train,X_test,y_train,y_test=data 24 | regr = linear_model.ElasticNet() 25 | regr.fit(X_train, y_train) 26 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_, regr.intercept_)) 27 | print("Residual sum of squares: {0}".format(np.mean((regr.predict(X_test) - y_test) ** 2))) 28 | print('Score: {0}'.format(regr.score(X_test, y_test))) 29 | def test_ElasticNet_alpha_rho(*data): 30 | ''' 31 | test score with different alpha and l1_ratio 32 | :param data: train_data, test_data, train_value, test_value 33 | :return: None 34 | ''' 35 | X_train,X_test,y_train,y_test=data 36 | alphas=np.logspace(-2,2) 37 | rhos=np.linspace(0.01,1) 38 | scores=[] 39 | for alpha in alphas: 40 | for rho in rhos: 41 | regr = linear_model.ElasticNet(alpha=alpha,l1_ratio=rho) 42 | regr.fit(X_train, y_train) 43 | scores.append(regr.score(X_test, y_test)) 44 | ## graph 45 | alphas, rhos = np.meshgrid(alphas, rhos) 46 | scores=np.array(scores).reshape(alphas.shape) 47 | from mpl_toolkits.mplot3d import Axes3D # this part works well in py3 48 | from matplotlib import cm 49 | fig=plt.figure() 50 | ax=Axes3D(fig) 51 | surf = ax.plot_surface(alphas, rhos, scores, rstride=1, cstride=1, cmap=cm.jet, 52 | linewidth=0, antialiased=False) 53 | fig.colorbar(surf, shrink=0.5, aspect=5) 54 | ax.set_xlabel(r"$\alpha$") 55 | ax.set_ylabel(r"$\rho$") 56 | ax.set_zlabel("score") 57 | ax.set_title("ElasticNet") 58 | plt.show() 59 | if __name__=='__main__': 60 | X_train,X_test,y_train,y_test=load_data() 61 | test_ElasticNet(X_train,X_test,y_train,y_test) 62 | test_ElasticNet_alpha_rho(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /1.liner model/5.logistic regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from sklearn import datasets, linear_model,cross_validation 6 | 7 | def load_data(): 8 | ''' 9 | load for the dataset 10 | return: 11 | 1 array for the classification problem. 12 | train_data, test_data, train_value, test_value 13 | ''' 14 | iris=datasets.load_iris() # Use the IRIS data. 
This data has 3 class and 150 examples 15 | X_train=iris.data 16 | y_train=iris.target 17 | return cross_validation.train_test_split(X_train, y_train,test_size=0.25, 18 | random_state=0,stratify=y_train) 19 | def test_LogisticRegression(*data): 20 | ''' 21 | test of LR 22 | :param data: train_data, test_data, train_value, test_value 23 | :return: None 24 | ''' 25 | X_train,X_test,y_train,y_test=data 26 | regr = linear_model.LogisticRegression() 27 | regr.fit(X_train, y_train) 28 | print('Coefficients: {0}, intercept {1}'.format(regr.coef_,regr.intercept_)) 29 | print('Score: {0}' .format(regr.score(X_test, y_test))) 30 | def test_LogisticRegression_multinomial(*data): 31 | ''' 32 | Test with different multi_class 33 | :param data: train_data, test_data, train_value, test_value 34 | :return: None 35 | ''' 36 | X_train,X_test,y_train,y_test=data 37 | regr = linear_model.LogisticRegression(multi_class='multinomial',solver='lbfgs') 38 | regr.fit(X_train, y_train) 39 | print('Coefficients: {0}, intercept {1}'.format(regr.coef_,regr.intercept_)) 40 | print('Score: {0}' .format(regr.score(X_test, y_test))) 41 | def test_LogisticRegression_C(*data): 42 | ''' 43 | test score with different C 44 | :param data: train_data, test_data, train_value, test_value 45 | :return: None 46 | ''' 47 | X_train,X_test,y_train,y_test=data 48 | Cs=np.logspace(-2,4,num=100) 49 | scores=[] 50 | for C in Cs: 51 | regr = linear_model.LogisticRegression(C=C) 52 | regr.fit(X_train, y_train) 53 | scores.append(regr.score(X_test, y_test)) 54 | ## graph 55 | fig=plt.figure() 56 | ax=fig.add_subplot(1,1,1) 57 | ax.plot(Cs,scores) 58 | ax.set_xlabel(r"C") 59 | ax.set_ylabel(r"score") 60 | ax.set_xscale('log') 61 | ax.set_title("LogisticRegression") 62 | plt.show() 63 | 64 | if __name__=='__main__': 65 | X_train,X_test,y_train,y_test=load_data() 66 | test_LogisticRegression(X_train,X_test,y_train,y_test) 67 | test_LogisticRegression_multinomial(X_train,X_test,y_train,y_test) 68 | test_LogisticRegression_C(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /1.liner model/6.LDA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from sklearn import datasets, discriminant_analysis,cross_validation 6 | 7 | def load_data(): 8 | ''' 9 | load for the dataset 10 | return: 11 | 1 array for the classification problem. 
12 | train_data, test_data, train_value, test_value 13 | ''' 14 | iris=datasets.load_iris() 15 | X_train=iris.data 16 | y_train=iris.target 17 | return cross_validation.train_test_split(X_train, y_train,test_size=0.25, 18 | random_state=0,stratify=y_train) 19 | def test_LinearDiscriminantAnalysis(*data): 20 | ''' 21 | test of LDA 22 | :param data: train_data, test_data, train_value, test_value 23 | :return: None 24 | ''' 25 | X_train,X_test,y_train,y_test=data 26 | lda = discriminant_analysis.LinearDiscriminantAnalysis() 27 | lda.fit(X_train, y_train) 28 | print('Coefficients: {0}, intercept {1}'.format(lda.coef_,lda.intercept_)) 29 | print('Score: {0}' .format( lda.score(X_test, y_test))) 30 | def plot_LDA(converted_X,y): 31 | ''' 32 | plot the graph after transfer 33 | :param converted_X: train data after transfer 34 | :param y: train_value 35 | :return: None 36 | ''' 37 | from mpl_toolkits.mplot3d import Axes3D 38 | fig=plt.figure() 39 | ax=Axes3D(fig) 40 | colors='rgb' 41 | markers='o*s' 42 | for target,color,marker in zip([0,1,2],colors,markers): 43 | pos=(y==target).ravel() 44 | X=converted_X[pos,:] 45 | ax.scatter(X[:,0], X[:,1], X[:,2],color=color,marker=marker, 46 | label="Label {0}".format(target)) 47 | ax.legend(loc="best") 48 | fig.suptitle("Iris After LDA") 49 | plt.show() 50 | def run_plot_LDA(): 51 | ''' 52 | run LDA 53 | :return: None 54 | ''' 55 | X_train,X_test,y_train,y_test=load_data() 56 | X=np.vstack((X_train,X_test)) 57 | Y=np.vstack((y_train.reshape(y_train.size,1),y_test.reshape(y_test.size,1))) 58 | lda = discriminant_analysis.LinearDiscriminantAnalysis() 59 | lda.fit(X, Y) 60 | converted_X=np.dot(X,np.transpose(lda.coef_))+lda.intercept_ 61 | plot_LDA(converted_X,Y) 62 | def test_LinearDiscriminantAnalysis_solver(*data): 63 | ''' 64 | test score with different solver 65 | :param data: train_data, test_data, train_value, test_value 66 | :return: None 67 | ''' 68 | X_train,X_test,y_train,y_test=data 69 | solvers=['svd','lsqr','eigen'] 70 | for solver in solvers: 71 | if(solver=='svd'): 72 | lda = discriminant_analysis.LinearDiscriminantAnalysis(solver=solver) 73 | else: 74 | lda = discriminant_analysis.LinearDiscriminantAnalysis(solver=solver, 75 | shrinkage=None) 76 | lda.fit(X_train, y_train) 77 | print('Score at solver={0}: {1}'.format(solver, lda.score(X_test, y_test))) 78 | def test_LinearDiscriminantAnalysis_shrinkage(*data): 79 | ''' 80 | test score with different shrinkage 81 | :param data: train_data, test_data, train_value, test_value 82 | :return: None 83 | ''' 84 | X_train,X_test,y_train,y_test=data 85 | shrinkages=np.linspace(0.0,1.0,num=20) 86 | scores=[] 87 | for shrinkage in shrinkages: 88 | lda = discriminant_analysis.LinearDiscriminantAnalysis(solver='lsqr', 89 | shrinkage=shrinkage) 90 | lda.fit(X_train, y_train) 91 | scores.append(lda.score(X_test, y_test)) 92 | ## graph 93 | fig=plt.figure() 94 | ax=fig.add_subplot(1,1,1) 95 | ax.plot(shrinkages,scores) 96 | ax.set_xlabel(r"shrinkage") 97 | ax.set_ylabel(r"score") 98 | ax.set_ylim(0,1.05) 99 | ax.set_title("LinearDiscriminantAnalysis") 100 | plt.show() 101 | 102 | if __name__=='__main__': 103 | X_train,X_test,y_train,y_test=load_data() 104 | test_LinearDiscriminantAnalysis(X_train,X_test,y_train,y_test) 105 | run_plot_LDA() 106 | test_LinearDiscriminantAnalysis_solver(X_train,X_test,y_train,y_test) 107 | test_LinearDiscriminantAnalysis_shrinkage(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /1.liner model/README.md: 
--------------------------------------------------------------------------------

# The general form of a linear model

f(x) = W*X + b

where W = (w1, w2, w3, …, wn)^T is called the weight vector. The weight vector directly reflects how important each feature is to the prediction. A generalized linear model can be understood as an N-dimensional linear model.

Linear regression is a form of supervised learning: when the target is continuous, the task is regression analysis; when the target is discrete, it becomes a classification problem.

# Common related models

Ridge regression, Lasso regression, Elastic Net, logistic regression, linear discriminant analysis (LDA), and so on.

# The loss function of a linear model is usually the squared loss

The sum of squared differences between the predicted and true values; the goal is to minimize this loss.

# Feature scaling

Transform the features so that the feature space is better conditioned for training. Benefits: 1) faster model convergence; 2) better model accuracy.

# Regularization

Overfitting caused by too many variables has two main remedies. The first is dimensionality reduction, which also avoids the curse of dimensionality. The second is regularization, which keeps all the variables but shrinks the magnitude of their coefficients to improve model performance (a minimal ridge-regression sketch is given after the links below).

# Logistic regression

Compared with plain linear regression, logistic regression adds an activation function on top of the linear model.

# Linear discriminant analysis (LDA)

The idea of LDA: during training, project the training samples onto a line so that samples of the same class lie as close together as possible and samples of different classes lie as far apart as possible. At prediction time, the class is determined by where the sample's projection falls.

The LDA objective: make the within-class variance as small as possible (J1) and the distance between class centers as large as possible (J2), i.e. minimize J1/J2.

# Code on GitHub

1. Linear model

https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/1.basic%20liner%20model.py

2. Ridge analysis

https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/2.ridge%20analysis.py

3. Lasso regression

https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/3.Lasso%20regression.py

4. Elastic Net

https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/4.%20ElasticNet.py

5. Logistic regression

https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/5.logistic%20regression.py

6. LDA

https://github.com/JasonK93/ML-note/blob/master/1.liner%20model/6.LDA.py
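As a small illustration of the regularization idea above (this sketch is not one of the repository files; the data is synthetic and the helper name ridge_fit is an assumption made for the example), ridge regression can be written in a few lines of NumPy using its closed-form solution w = (X^T X + alpha*I)^(-1) X^T y. Increasing alpha visibly shrinks the learned coefficients, which is the effect described in the note.

```python
import numpy as np

def ridge_fit(X, y, alpha):
    """Return the ridge weights w minimizing ||Xw - y||^2 + alpha * ||w||^2."""
    n_features = X.shape[1]
    # Closed form: w = (X^T X + alpha * I)^(-1) X^T y
    return np.linalg.solve(X.T @ X + alpha * np.eye(n_features), X.T @ y)

rng = np.random.RandomState(0)
X = rng.randn(50, 5)
true_w = np.array([3.0, -2.0, 0.0, 0.0, 1.0])
y = X @ true_w + 0.1 * rng.randn(50)

for alpha in (0.0, 1.0, 100.0):
    w = ridge_fit(X, y, alpha)
    print(alpha, np.round(w, 2))  # larger alpha -> smaller coefficients
```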
--------------------------------------------------------------------------------
/10.Ensemble/10.1 Adaboost classifer.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets,cross_validation,ensemble
4 | def load_data_classification():
5 |     '''
6 |     load data set for classification
7 |     :return: train_data, test_data, train_value, test_value
8 |     '''
9 |     digits=datasets.load_digits()
10 |     return cross_validation.train_test_split(digits.data,digits.target,
11 |         test_size=0.25,random_state=0,stratify=digits.target)
12 | def test_AdaBoostClassifier(*data):
13 |     '''
14 |     test Ada score with different number of classifiers
15 |     :param data: train_data, test_data, train_value, test_value
16 |     :return: None
17 |     '''
18 |     X_train,X_test,y_train,y_test=data
19 |     clf=ensemble.AdaBoostClassifier(learning_rate=0.1)
20 |     clf.fit(X_train,y_train)
21 |     ## graph
22 |     fig=plt.figure()
23 |     ax=fig.add_subplot(1,1,1)
24 |     estimators_num=len(clf.estimators_)
25 |     X=range(1,estimators_num+1)
26 |     ax.plot(list(X),list(clf.staged_score(X_train,y_train)),label="Training score")
27 |     ax.plot(list(X),list(clf.staged_score(X_test,y_test)),label="Testing score")
28 |     ax.set_xlabel("estimator num")
29 |     ax.set_ylabel("score")
30 |     ax.legend(loc="best")
31 |     ax.set_title("AdaBoostClassifier")
32 |     plt.show()
33 | def test_AdaBoostClassifier_base_classifier(*data):
34 |     '''
35 |     test Adaboost classifier with different numbers and categories of base classifier
36 |     :param data: train_data, test_data, train_value, test_value
37 |     :return: None
38 |     '''
39 |     from sklearn.naive_bayes import GaussianNB
40 |     X_train,X_test,y_train,y_test=data
41 |     fig=plt.figure()
42 |     ax=fig.add_subplot(2,1,1)
43 | 
44 |     clf=ensemble.AdaBoostClassifier(learning_rate=0.1)
45 |     clf.fit(X_train,y_train)
46 |     ## graph
47 |     estimators_num=len(clf.estimators_)
48 |     X=range(1,estimators_num+1)
49 |     ax.plot(list(X),list(clf.staged_score(X_train,y_train)),label="Training score")
50 |     ax.plot(list(X),list(clf.staged_score(X_test,y_test)),label="Testing score")
51 |     ax.set_xlabel("estimator num")
52 |     ax.set_ylabel("score")
53 |     ax.legend(loc="lower right")
54 |     ax.set_ylim(0,1)
55 |     ax.set_title("AdaBoostClassifier with Decision Tree")
56 | 
57 |     ax=fig.add_subplot(2,1,2)
58 |     clf=ensemble.AdaBoostClassifier(learning_rate=0.1,base_estimator=GaussianNB())
59 |     clf.fit(X_train,y_train)
60 |     ## graph
61 |     estimators_num=len(clf.estimators_)
62 |     X=range(1,estimators_num+1)
63 |     ax.plot(list(X),list(clf.staged_score(X_train,y_train)),label="Training score")
64 |     ax.plot(list(X),list(clf.staged_score(X_test,y_test)),label="Testing score")
65 |     ax.set_xlabel("estimator num")
66 |     ax.set_ylabel("score")
67 |     ax.legend(loc="lower right")
68 |     ax.set_ylim(0,1)
69 |     ax.set_title("AdaBoostClassifier with Gaussian Naive Bayes")
70 |     plt.show()
71 | def test_AdaBoostClassifier_learning_rate(*data):
72 |     '''
73 |     test performance with different learning rate
74 |     :param data: train_data, test_data, train_value, test_value
75 |     :return: None
76 |     '''
77 |     X_train,X_test,y_train,y_test=data
78 |     learning_rates=np.linspace(0.01,1)
79 |     fig=plt.figure()
80 |     ax=fig.add_subplot(1,1,1)
81 |     traing_scores=[]
82 |     testing_scores=[]
83 |     for learning_rate in learning_rates:
84 |         clf=ensemble.AdaBoostClassifier(learning_rate=learning_rate,n_estimators=500)
85 |         clf.fit(X_train,y_train)
86 |         traing_scores.append(clf.score(X_train,y_train))
87 |         testing_scores.append(clf.score(X_test,y_test))
88 |     ax.plot(learning_rates,traing_scores,label="Training score")
89 |     ax.plot(learning_rates,testing_scores,label="Testing score")
90 |     ax.set_xlabel("learning rate")
91 |     ax.set_ylabel("score")
92 |     ax.legend(loc="best")
93 |     ax.set_title("AdaBoostClassifier")
94 |     plt.show()
95 | def test_AdaBoostClassifier_algorithm(*data):
96 |     '''
97 |     test performance with different algorithm
98 |     :param data: train_data, test_data, train_value, test_value
99 |     :return: None
100 |     '''
101 |     X_train,X_test,y_train,y_test=data
102 |     algorithms=['SAMME.R','SAMME']
103 |     fig=plt.figure()
104 |     learning_rates=[0.05,0.1,0.5,0.9]
105 |     for i,learning_rate in enumerate(learning_rates):
106 |         ax=fig.add_subplot(2,2,i+1)
107 |         for i ,algorithm in enumerate(algorithms):
108 |             clf=ensemble.AdaBoostClassifier(learning_rate=learning_rate,
109 |                 algorithm=algorithm)
110 |             clf.fit(X_train,y_train)
111 |             ## graph
112 |             estimators_num=len(clf.estimators_)
113 |             X=range(1,estimators_num+1)
114 |             ax.plot(list(X),list(clf.staged_score(X_train,y_train)),
115 |                 label="%s:Training score"%algorithms[i])
116 |             ax.plot(list(X),list(clf.staged_score(X_test,y_test)),
117 |                 label="%s:Testing score"%algorithms[i])
118 |         ax.set_xlabel("estimator num")
119 |         ax.set_ylabel("score")
120 |         ax.legend(loc="lower right")
121 |         ax.set_title("learning rate:%f"%learning_rate)
122 |     fig.suptitle("AdaBoostClassifier")
123 |     plt.show()
124 | if __name__=='__main__':
125 |     X_train,X_test,y_train,y_test=load_data_classification()
126 |     test_AdaBoostClassifier(X_train,X_test,y_train,y_test)
127 |     test_AdaBoostClassifier_base_classifier(X_train,X_test,y_train,y_test)
128 |     test_AdaBoostClassifier_learning_rate(X_train,X_test,y_train,y_test)
129 |     test_AdaBoostClassifier_algorithm(X_train,X_test,y_train,y_test)
--------------------------------------------------------------------------------
/10.Ensemble/10.2 adaboost regression.py:
-------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn import datasets,cross_validation,ensemble 4 | 5 | def load_data_regression(): 6 | ''' 7 | load the date set for regression (diabetes) 8 | :return: train_data, test_data, train_value, test_value 9 | ''' 10 | diabetes = datasets.load_diabetes() 11 | return cross_validation.train_test_split(diabetes.data,diabetes.target, 12 | test_size=0.25,random_state=0) 13 | 14 | def test_AdaBoostRegressor(*data): 15 | ''' 16 | test the regression with different number of regression model 17 | :param data: train_data, test_data, train_value, test_value 18 | :return: None 19 | ''' 20 | X_train,X_test,y_train,y_test=data 21 | regr=ensemble.AdaBoostRegressor() 22 | regr.fit(X_train,y_train) 23 | ## graph 24 | fig=plt.figure() 25 | ax=fig.add_subplot(1,1,1) 26 | estimators_num=len(regr.estimators_) 27 | X=range(1,estimators_num+1) 28 | ax.plot(list(X),list(regr.staged_score(X_train,y_train)),label="Traing score") 29 | ax.plot(list(X),list(regr.staged_score(X_test,y_test)),label="Testing score") 30 | ax.set_xlabel("estimator num") 31 | ax.set_ylabel("score") 32 | ax.legend(loc="best") 33 | ax.set_title("AdaBoostRegressor") 34 | plt.show() 35 | def test_AdaBoostRegressor_base_regr(*data): 36 | ''' 37 | test the regression with different number of model and regression method 38 | :param data: train_data, test_data, train_value, test_value 39 | :return: None 40 | ''' 41 | from sklearn.svm import LinearSVR 42 | X_train,X_test,y_train,y_test=data 43 | fig=plt.figure() 44 | regrs=[ensemble.AdaBoostRegressor(), 45 | ensemble.AdaBoostRegressor(base_estimator=LinearSVR(epsilon=0.01,C=100))] 46 | labels=["Decision Tree Regressor","Linear SVM Regressor"] 47 | for i ,regr in enumerate(regrs): 48 | ax=fig.add_subplot(2,1,i+1) 49 | regr.fit(X_train,y_train) 50 | ## graph 51 | estimators_num=len(regr.estimators_) 52 | X=range(1,estimators_num+1) 53 | ax.plot(list(X),list(regr.staged_score(X_train,y_train)),label="Traing score") 54 | ax.plot(list(X),list(regr.staged_score(X_test,y_test)),label="Testing score") 55 | ax.set_xlabel("estimator num") 56 | ax.set_ylabel("score") 57 | ax.legend(loc="lower right") 58 | ax.set_ylim(-1,1) 59 | ax.set_title("Base_Estimator:%s"%labels[i]) 60 | plt.suptitle("AdaBoostRegressor") 61 | plt.show() 62 | def test_AdaBoostRegressor_learning_rate(*data): 63 | ''' 64 | test the performance with different learning rate 65 | :param data: train_data, test_data, train_value, test_value 66 | :return: None 67 | ''' 68 | X_train,X_test,y_train,y_test=data 69 | learning_rates=np.linspace(0.01,1) 70 | fig=plt.figure() 71 | ax=fig.add_subplot(1,1,1) 72 | traing_scores=[] 73 | testing_scores=[] 74 | for learning_rate in learning_rates: 75 | regr=ensemble.AdaBoostRegressor(learning_rate=learning_rate,n_estimators=500) 76 | regr.fit(X_train,y_train) 77 | traing_scores.append(regr.score(X_train,y_train)) 78 | testing_scores.append(regr.score(X_test,y_test)) 79 | ax.plot(learning_rates,traing_scores,label="Traing score") 80 | ax.plot(learning_rates,testing_scores,label="Testing score") 81 | ax.set_xlabel("learning rate") 82 | ax.set_ylabel("score") 83 | ax.legend(loc="best") 84 | ax.set_title("AdaBoostRegressor") 85 | plt.show() 86 | def test_AdaBoostRegressor_loss(*data): 87 | ''' 88 | test the method with different loss function 89 | :param data: train_data, test_data, train_value, test_value 90 | :return: None 91 | ''' 92 | 
X_train,X_test,y_train,y_test=data 93 | losses=['linear','square','exponential'] 94 | fig=plt.figure() 95 | ax=fig.add_subplot(1,1,1) 96 | for i ,loss in enumerate(losses): 97 | regr=ensemble.AdaBoostRegressor(loss=loss,n_estimators=30) 98 | regr.fit(X_train,y_train) 99 | ## graph 100 | estimators_num=len(regr.estimators_) 101 | X=range(1,estimators_num+1) 102 | ax.plot(list(X),list(regr.staged_score(X_train,y_train)), 103 | label="Traing score:loss=%s"%loss) 104 | ax.plot(list(X),list(regr.staged_score(X_test,y_test)), 105 | label="Testing score:loss=%s"%loss) 106 | ax.set_xlabel("estimator num") 107 | ax.set_ylabel("score") 108 | ax.legend(loc="lower right") 109 | ax.set_ylim(-1,1) 110 | plt.suptitle("AdaBoostRegressor") 111 | plt.show() 112 | 113 | if __name__=='__main__': 114 | X_train,X_test,y_train,y_test=load_data_regression() 115 | test_AdaBoostRegressor(X_train,X_test,y_train,y_test) 116 | test_AdaBoostRegressor_base_regr(X_train,X_test,y_train,y_test) 117 | test_AdaBoostRegressor_learning_rate(X_train,X_test,y_train,y_test) 118 | test_AdaBoostRegressor_loss(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /10.Ensemble/10.3 RF_classifier.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn import datasets,cross_validation,ensemble 4 | def load_data_classification(): 5 | ''' 6 | load the data set for classifier 7 | :return: train_data, test_data, train_value, test_value 8 | ''' 9 | digits=datasets.load_digits() 10 | return cross_validation.train_test_split(digits.data,digits.target, 11 | test_size=0.25,random_state=0,stratify=digits.target) 12 | def test_RandomForestClassifier(*data): 13 | ''' 14 | test the RF method 15 | :param data: train_data, test_data, train_value, test_value 16 | :return: None 17 | ''' 18 | X_train,X_test,y_train,y_test=data 19 | clf=ensemble.RandomForestClassifier() 20 | clf.fit(X_train,y_train) 21 | print("Traing Score:%f"%clf.score(X_train,y_train)) 22 | print("Testing Score:%f"%clf.score(X_test,y_test)) 23 | def test_RandomForestClassifier_num(*data): 24 | ''' 25 | test the performance with different n_estimators 26 | :param data: train_data, test_data, train_value, test_value 27 | :return: None 28 | ''' 29 | X_train,X_test,y_train,y_test=data 30 | nums=np.arange(1,100,step=2) 31 | fig=plt.figure() 32 | ax=fig.add_subplot(1,1,1) 33 | testing_scores=[] 34 | training_scores=[] 35 | for num in nums: 36 | clf=ensemble.RandomForestClassifier(n_estimators=num) 37 | clf.fit(X_train,y_train) 38 | training_scores.append(clf.score(X_train,y_train)) 39 | testing_scores.append(clf.score(X_test,y_test)) 40 | ax.plot(nums,training_scores,label="Training Score") 41 | ax.plot(nums,testing_scores,label="Testing Score") 42 | ax.set_xlabel("estimator num") 43 | ax.set_ylabel("score") 44 | ax.legend(loc="lower right") 45 | ax.set_ylim(0,1.05) 46 | plt.suptitle("RandomForestClassifier") 47 | plt.show() 48 | def test_RandomForestClassifier_max_depth(*data): 49 | ''' 50 | test the performance with different max_depth 51 | :param data: train_data, test_data, train_value, test_value 52 | :return: None 53 | ''' 54 | X_train,X_test,y_train,y_test=data 55 | maxdepths=range(1,20) 56 | fig=plt.figure() 57 | ax=fig.add_subplot(1,1,1) 58 | testing_scores=[] 59 | training_scores=[] 60 | for max_depth in maxdepths: 61 | clf=ensemble.RandomForestClassifier(max_depth=max_depth) 62 | clf.fit(X_train,y_train) 63 | 
training_scores.append(clf.score(X_train,y_train)) 64 | testing_scores.append(clf.score(X_test,y_test)) 65 | ax.plot(maxdepths,training_scores,label="Training Score") 66 | ax.plot(maxdepths,testing_scores,label="Testing Score") 67 | ax.set_xlabel("max_depth") 68 | ax.set_ylabel("score") 69 | ax.legend(loc="lower right") 70 | ax.set_ylim(0,1.05) 71 | plt.suptitle("RandomForestClassifier") 72 | plt.show() 73 | def test_RandomForestClassifier_max_features(*data): 74 | ''' 75 | test the performance with different max_features 76 | :param data: train_data, test_data, train_value, test_value 77 | :return: None 78 | ''' 79 | X_train,X_test,y_train,y_test=data 80 | max_features=np.linspace(0.01,1.0) 81 | fig=plt.figure() 82 | ax=fig.add_subplot(1,1,1) 83 | testing_scores=[] 84 | training_scores=[] 85 | for max_feature in max_features: 86 | clf=ensemble.RandomForestClassifier(max_features=max_feature) 87 | clf.fit(X_train,y_train) 88 | training_scores.append(clf.score(X_train,y_train)) 89 | testing_scores.append(clf.score(X_test,y_test)) 90 | ax.plot(max_features,training_scores,label="Training Score") 91 | ax.plot(max_features,testing_scores,label="Testing Score") 92 | ax.set_xlabel("max_feature") 93 | ax.set_ylabel("score") 94 | ax.legend(loc="lower right") 95 | ax.set_ylim(0,1.05) 96 | plt.suptitle("RandomForestClassifier") 97 | plt.show() 98 | if __name__=='__main__': 99 | X_train,X_test,y_train,y_test=load_data_classification() 100 | test_RandomForestClassifier(X_train,X_test,y_train,y_test) 101 | test_RandomForestClassifier_num(X_train,X_test,y_train,y_test) 102 | test_RandomForestClassifier_max_depth(X_train,X_test,y_train,y_test) 103 | test_RandomForestClassifier_max_features(X_train,X_test,y_train,y_test) 104 | -------------------------------------------------------------------------------- /10.Ensemble/10.4 RF_regression.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn import datasets,cross_validation,ensemble 4 | def load_data_regression(): 5 | ''' 6 | load the diabetes for regression 7 | :return: train_data, test_data, train_value, test_value 8 | ''' 9 | diabetes = datasets.load_diabetes() 10 | return cross_validation.train_test_split(diabetes.data,diabetes.target, 11 | test_size=0.25,random_state=0) 12 | def test_RandomForestRegressor(*data): 13 | ''' 14 | test the RF method 15 | :param data: train_data, test_data, train_value, test_value 16 | :return: None 17 | ''' 18 | X_train,X_test,y_train,y_test=data 19 | regr=ensemble.RandomForestRegressor() 20 | regr.fit(X_train,y_train) 21 | print("Traing Score:%f"%regr.score(X_train,y_train)) 22 | print("Testing Score:%f"%regr.score(X_test,y_test)) 23 | def test_RandomForestRegressor_num(*data): 24 | ''' 25 | test the performance with different n_estimators 26 | :param data: train_data, test_data, train_value, test_value 27 | :return: None 28 | ''' 29 | X_train,X_test,y_train,y_test=data 30 | nums=np.arange(1,100,step=2) 31 | fig=plt.figure() 32 | ax=fig.add_subplot(1,1,1) 33 | testing_scores=[] 34 | training_scores=[] 35 | for num in nums: 36 | regr=ensemble.RandomForestRegressor(n_estimators=num) 37 | regr.fit(X_train,y_train) 38 | training_scores.append(regr.score(X_train,y_train)) 39 | testing_scores.append(regr.score(X_test,y_test)) 40 | ax.plot(nums,training_scores,label="Training Score") 41 | ax.plot(nums,testing_scores,label="Testing Score") 42 | ax.set_xlabel("estimator num") 43 | ax.set_ylabel("score") 44 | 
ax.legend(loc="lower right") 45 | ax.set_ylim(-1,1) 46 | plt.suptitle("RandomForestRegressor") 47 | plt.show() 48 | def test_RandomForestRegressor_max_depth(*data): 49 | ''' 50 | test the performance with different max_depth 51 | :param data: train_data, test_data, train_value, test_value 52 | :return: None 53 | ''' 54 | X_train,X_test,y_train,y_test=data 55 | maxdepths=range(1,20) 56 | fig=plt.figure() 57 | ax=fig.add_subplot(1,1,1) 58 | testing_scores=[] 59 | training_scores=[] 60 | for max_depth in maxdepths: 61 | regr=ensemble.RandomForestRegressor(max_depth=max_depth) 62 | regr.fit(X_train,y_train) 63 | training_scores.append(regr.score(X_train,y_train)) 64 | testing_scores.append(regr.score(X_test,y_test)) 65 | ax.plot(maxdepths,training_scores,label="Training Score") 66 | ax.plot(maxdepths,testing_scores,label="Testing Score") 67 | ax.set_xlabel("max_depth") 68 | ax.set_ylabel("score") 69 | ax.legend(loc="lower right") 70 | ax.set_ylim(0,1.05) 71 | plt.suptitle("RandomForestRegressor") 72 | plt.show() 73 | def test_RandomForestRegressor_max_features(*data): 74 | ''' 75 | test the performance with different max_features 76 | :param data: train_data, test_data, train_value, test_value 77 | :return: None 78 | ''' 79 | X_train,X_test,y_train,y_test=data 80 | max_features=np.linspace(0.01,1.0) 81 | fig=plt.figure() 82 | ax=fig.add_subplot(1,1,1) 83 | testing_scores=[] 84 | training_scores=[] 85 | for max_feature in max_features: 86 | regr=ensemble.RandomForestRegressor(max_features=max_feature) 87 | regr.fit(X_train,y_train) 88 | training_scores.append(regr.score(X_train,y_train)) 89 | testing_scores.append(regr.score(X_test,y_test)) 90 | ax.plot(max_features,training_scores,label="Training Score") 91 | ax.plot(max_features,testing_scores,label="Testing Score") 92 | ax.set_xlabel("max_feature") 93 | ax.set_ylabel("score") 94 | ax.legend(loc="lower right") 95 | ax.set_ylim(0,1.05) 96 | plt.suptitle("RandomForestRegressor") 97 | plt.show() 98 | if __name__=='__main__': 99 | X_train,X_test,y_train,y_test=load_data_regression() 100 | test_RandomForestRegressor(X_train,X_test,y_train,y_test) 101 | test_RandomForestRegressor_num(X_train,X_test,y_train,y_test) 102 | test_RandomForestRegressor_max_depth(X_train,X_test,y_train,y_test) 103 | test_RandomForestRegressor_max_features(X_train,X_test,y_train,y_test) 104 | -------------------------------------------------------------------------------- /10.Ensemble/10.5 Gradient_Classifier.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn import datasets,cross_validation,ensemble 4 | 5 | def load_data_classification(): 6 | ''' 7 | load the digits data set for classification 8 | :return: train_data, test_data, train_value, test_value 9 | ''' 10 | digits=datasets.load_digits() 11 | return cross_validation.train_test_split(digits.data,digits.target, 12 | test_size=0.25,random_state=0,stratify=digits.target) 13 | def test_GradientBoostingClassifier(*data): 14 | ''' 15 | test the method 16 | :param data: train_data, test_data, train_value, test_value 17 | :return: None 18 | ''' 19 | X_train,X_test,y_train,y_test=data 20 | clf=ensemble.GradientBoostingClassifier() 21 | clf.fit(X_train,y_train) 22 | print("Traing Score:%f"%clf.score(X_train,y_train)) 23 | print("Testing Score:%f"%clf.score(X_test,y_test)) 24 | def test_GradientBoostingClassifier_num(*data): 25 | ''' 26 | test the performance with different n_estimators 27 | :param data: train_data, 
test_data, train_value, test_value 28 | :return: None 29 | ''' 30 | X_train,X_test,y_train,y_test=data 31 | nums=np.arange(1,100,step=2) 32 | fig=plt.figure() 33 | ax=fig.add_subplot(1,1,1) 34 | testing_scores=[] 35 | training_scores=[] 36 | for num in nums: 37 | clf=ensemble.GradientBoostingClassifier(n_estimators=num) 38 | clf.fit(X_train,y_train) 39 | training_scores.append(clf.score(X_train,y_train)) 40 | testing_scores.append(clf.score(X_test,y_test)) 41 | ax.plot(nums,training_scores,label="Training Score") 42 | ax.plot(nums,testing_scores,label="Testing Score") 43 | ax.set_xlabel("estimator num") 44 | ax.set_ylabel("score") 45 | ax.legend(loc="lower right") 46 | ax.set_ylim(0,1.05) 47 | plt.suptitle("GradientBoostingClassifier") 48 | plt.show() 49 | def test_GradientBoostingClassifier_maxdepth(*data): 50 | ''' 51 | test the performance with different max_depth 52 | :param data: train_data, test_data, train_value, test_value 53 | :return: None 54 | ''' 55 | X_train,X_test,y_train,y_test=data 56 | maxdepths=np.arange(1,20) 57 | fig=plt.figure() 58 | ax=fig.add_subplot(1,1,1) 59 | testing_scores=[] 60 | training_scores=[] 61 | for maxdepth in maxdepths: 62 | clf=ensemble.GradientBoostingClassifier(max_depth=maxdepth,max_leaf_nodes=None) 63 | clf.fit(X_train,y_train) 64 | training_scores.append(clf.score(X_train,y_train)) 65 | testing_scores.append(clf.score(X_test,y_test)) 66 | ax.plot(maxdepths,training_scores,label="Training Score") 67 | ax.plot(maxdepths,testing_scores,label="Testing Score") 68 | ax.set_xlabel("max_depth") 69 | ax.set_ylabel("score") 70 | ax.legend(loc="lower right") 71 | ax.set_ylim(0,1.05) 72 | plt.suptitle("GradientBoostingClassifier") 73 | plt.show() 74 | def test_GradientBoostingClassifier_learning(*data): 75 | ''' 76 | test the performance with different learning rate 77 | :param data: train_data, test_data, train_value, test_value 78 | :return: None 79 | ''' 80 | X_train,X_test,y_train,y_test=data 81 | learnings=np.linspace(0.01,1.0) 82 | fig=plt.figure() 83 | ax=fig.add_subplot(1,1,1) 84 | testing_scores=[] 85 | training_scores=[] 86 | for learning in learnings: 87 | clf=ensemble.GradientBoostingClassifier(learning_rate=learning) 88 | clf.fit(X_train,y_train) 89 | training_scores.append(clf.score(X_train,y_train)) 90 | testing_scores.append(clf.score(X_test,y_test)) 91 | ax.plot(learnings,training_scores,label="Training Score") 92 | ax.plot(learnings,testing_scores,label="Testing Score") 93 | ax.set_xlabel("learning_rate") 94 | ax.set_ylabel("score") 95 | ax.legend(loc="lower right") 96 | ax.set_ylim(0,1.05) 97 | plt.suptitle("GradientBoostingClassifier") 98 | plt.show() 99 | def test_GradientBoostingClassifier_subsample(*data): 100 | ''' 101 | test the performance with different subsample 102 | :param data: train_data, test_data, train_value, test_value 103 | :return: None 104 | ''' 105 | X_train,X_test,y_train,y_test=data 106 | fig=plt.figure() 107 | ax=fig.add_subplot(1,1,1) 108 | subsamples=np.linspace(0.01,1.0) 109 | testing_scores=[] 110 | training_scores=[] 111 | for subsample in subsamples: 112 | clf=ensemble.GradientBoostingClassifier(subsample=subsample) 113 | clf.fit(X_train,y_train) 114 | training_scores.append(clf.score(X_train,y_train)) 115 | testing_scores.append(clf.score(X_test,y_test)) 116 | ax.plot(subsamples,training_scores,label="Training Score") 117 | ax.plot(subsamples,testing_scores,label="Training Score") 118 | ax.set_xlabel("subsample") 119 | ax.set_ylabel("score") 120 | ax.legend(loc="lower right") 121 | ax.set_ylim(0,1.05) 122 | 
plt.suptitle("GradientBoostingClassifier") 123 | plt.show() 124 | def test_GradientBoostingClassifier_max_features(*data): 125 | ''' 126 | test the performance with different max_features 127 | :param data: train_data, test_data, train_value, test_value 128 | :return: None 129 | ''' 130 | X_train,X_test,y_train,y_test=data 131 | fig=plt.figure() 132 | ax=fig.add_subplot(1,1,1) 133 | max_features=np.linspace(0.01,1.0) 134 | testing_scores=[] 135 | training_scores=[] 136 | for features in max_features: 137 | clf=ensemble.GradientBoostingClassifier(max_features=features) 138 | clf.fit(X_train,y_train) 139 | training_scores.append(clf.score(X_train,y_train)) 140 | testing_scores.append(clf.score(X_test,y_test)) 141 | ax.plot(max_features,training_scores,label="Training Score") 142 | ax.plot(max_features,testing_scores,label="Training Score") 143 | ax.set_xlabel("max_features") 144 | ax.set_ylabel("score") 145 | ax.legend(loc="lower right") 146 | ax.set_ylim(0,1.05) 147 | plt.suptitle("GradientBoostingClassifier") 148 | plt.show() 149 | if __name__=='__main__': 150 | X_train,X_test,y_train,y_test=load_data_classification() 151 | test_GradientBoostingClassifier(X_train,X_test,y_train,y_test) 152 | test_GradientBoostingClassifier_num(X_train,X_test,y_train,y_test) 153 | test_GradientBoostingClassifier_maxdepth(X_train,X_test,y_train,y_test) 154 | test_GradientBoostingClassifier_learning(X_train,X_test,y_train,y_test) 155 | test_GradientBoostingClassifier_subsample(X_train,X_test,y_train,y_test) 156 | test_GradientBoostingClassifier_max_features(X_train,X_test,y_train,y_test) 157 | -------------------------------------------------------------------------------- /10.Ensemble/10.6 Gradient_regresion.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn import datasets,cross_validation,ensemble 4 | 5 | def load_data_regression(): 6 | ''' 7 | load the diabetes data for regression 8 | :return: train_data, test_data, train_value, test_value 9 | ''' 10 | diabetes = datasets.load_diabetes() 11 | return cross_validation.train_test_split(diabetes.data,diabetes.target, 12 | test_size=0.25,random_state=0) 13 | def test_GradientBoostingRegressor(*data): 14 | ''' 15 | test the method 16 | :param data: train_data, test_data, train_value, test_value 17 | :return: None 18 | ''' 19 | X_train,X_test,y_train,y_test=data 20 | regr=ensemble.GradientBoostingRegressor() 21 | regr.fit(X_train,y_train) 22 | print("Training score:%f"%regr.score(X_train,y_train)) 23 | print("Testing score:%f"%regr.score(X_test,y_test)) 24 | def test_GradientBoostingRegressor_num(*data): 25 | ''' 26 | test the performance with different n_estimators 27 | :param data: train_data, test_data, train_value, test_value 28 | :return: None 29 | ''' 30 | X_train,X_test,y_train,y_test=data 31 | nums=np.arange(1,200,step=2) 32 | fig=plt.figure() 33 | ax=fig.add_subplot(1,1,1) 34 | testing_scores=[] 35 | training_scores=[] 36 | for num in nums: 37 | regr=ensemble.GradientBoostingRegressor(n_estimators=num) 38 | regr.fit(X_train,y_train) 39 | training_scores.append(regr.score(X_train,y_train)) 40 | testing_scores.append(regr.score(X_test,y_test)) 41 | ax.plot(nums,training_scores,label="Training Score") 42 | ax.plot(nums,testing_scores,label="Testing Score") 43 | ax.set_xlabel("estimator num") 44 | ax.set_ylabel("score") 45 | ax.legend(loc="lower right") 46 | ax.set_ylim(0,1.05) 47 | plt.suptitle("GradientBoostingRegressor") 48 | plt.show() 49 | def 
test_GradientBoostingRegressor_maxdepth(*data): 50 | ''' 51 | test the performance with different max_depth 52 | :param data: train_data, test_data, train_value, test_value 53 | :return: None 54 | ''' 55 | X_train,X_test,y_train,y_test=data 56 | maxdepths=np.arange(1,20) 57 | fig=plt.figure() 58 | ax=fig.add_subplot(1,1,1) 59 | testing_scores=[] 60 | training_scores=[] 61 | for maxdepth in maxdepths: 62 | regr=ensemble.GradientBoostingRegressor(max_depth=maxdepth,max_leaf_nodes=None) 63 | regr.fit(X_train,y_train) 64 | training_scores.append(regr.score(X_train,y_train)) 65 | testing_scores.append(regr.score(X_test,y_test)) 66 | ax.plot(maxdepths,training_scores,label="Training Score") 67 | ax.plot(maxdepths,testing_scores,label="Testing Score") 68 | ax.set_xlabel("max_depth") 69 | ax.set_ylabel("score") 70 | ax.legend(loc="lower right") 71 | ax.set_ylim(-1,1.05) 72 | plt.suptitle("GradientBoostingRegressor") 73 | plt.show() 74 | def test_GradientBoostingRegressor_learning(*data): 75 | ''' 76 | test the performance with different learning rate 77 | :param data: train_data, test_data, train_value, test_value 78 | :return: None 79 | ''' 80 | X_train,X_test,y_train,y_test=data 81 | learnings=np.linspace(0.01,1.0) 82 | fig=plt.figure() 83 | ax=fig.add_subplot(1,1,1) 84 | testing_scores=[] 85 | training_scores=[] 86 | for learning in learnings: 87 | regr=ensemble.GradientBoostingRegressor(learning_rate=learning) 88 | regr.fit(X_train,y_train) 89 | training_scores.append(regr.score(X_train,y_train)) 90 | testing_scores.append(regr.score(X_test,y_test)) 91 | ax.plot(learnings,training_scores,label="Training Score") 92 | ax.plot(learnings,testing_scores,label="Testing Score") 93 | ax.set_xlabel("learning_rate") 94 | ax.set_ylabel("score") 95 | ax.legend(loc="lower right") 96 | ax.set_ylim(-1,1.05) 97 | plt.suptitle("GradientBoostingRegressor") 98 | plt.show() 99 | def test_GradientBoostingRegressor_subsample(*data): 100 | ''' 101 | test the performance with different subsample 102 | :param data: train_data, test_data, train_value, test_value 103 | :return: None 104 | ''' 105 | X_train,X_test,y_train,y_test=data 106 | fig=plt.figure() 107 | ax=fig.add_subplot(1,1,1) 108 | subsamples=np.linspace(0.01,1.0,num=20) 109 | testing_scores=[] 110 | training_scores=[] 111 | for subsample in subsamples: 112 | regr=ensemble.GradientBoostingRegressor(subsample=subsample) 113 | regr.fit(X_train,y_train) 114 | training_scores.append(regr.score(X_train,y_train)) 115 | testing_scores.append(regr.score(X_test,y_test)) 116 | ax.plot(subsamples,training_scores,label="Training Score") 117 | ax.plot(subsamples,testing_scores,label="Training Score") 118 | ax.set_xlabel("subsample") 119 | ax.set_ylabel("score") 120 | ax.legend(loc="lower right") 121 | ax.set_ylim(-1,1.05) 122 | plt.suptitle("GradientBoostingRegressor") 123 | plt.show() 124 | def test_GradientBoostingRegressor_loss(*data): 125 | ''' 126 | test the performance with differnt loss function and alpha 127 | :param data: train_data, test_data, train_value, test_value 128 | :return: None 129 | ''' 130 | X_train,X_test,y_train,y_test=data 131 | fig=plt.figure() 132 | nums=np.arange(1,200,step=2) 133 | ########## graph huber ###### 134 | ax=fig.add_subplot(2,1,1) 135 | alphas=np.linspace(0.01,1.0,endpoint=False,num=5) 136 | for alpha in alphas: 137 | testing_scores=[] 138 | training_scores=[] 139 | for num in nums: 140 | regr=ensemble.GradientBoostingRegressor(n_estimators=num, 141 | loss='huber',alpha=alpha) 142 | regr.fit(X_train,y_train) 143 | 
training_scores.append(regr.score(X_train,y_train))
144 |             testing_scores.append(regr.score(X_test,y_test))
145 |         ax.plot(nums,training_scores,label="Training Score:alpha=%f"%alpha)
146 |         ax.plot(nums,testing_scores,label="Testing Score:alpha=%f"%alpha)
147 |     ax.set_xlabel("estimator num")
148 |     ax.set_ylabel("score")
149 |     ax.legend(loc="lower right",framealpha=0.4)
150 |     ax.set_ylim(0,1.05)
151 |     ax.set_title("loss=huber")
152 |     plt.suptitle("GradientBoostingRegressor")
153 |     #### graph ls and lad
154 |     ax=fig.add_subplot(2,1,2)
155 |     for loss in ['ls','lad']:
156 |         testing_scores=[]
157 |         training_scores=[]
158 |         for num in nums:
159 |             regr=ensemble.GradientBoostingRegressor(n_estimators=num,loss=loss)
160 |             regr.fit(X_train,y_train)
161 |             training_scores.append(regr.score(X_train,y_train))
162 |             testing_scores.append(regr.score(X_test,y_test))
163 |         ax.plot(nums,training_scores,label="Training Score:loss=%s"%loss)
164 |         ax.plot(nums,testing_scores,label="Testing Score:loss=%s"%loss)
165 |     ax.set_xlabel("estimator num")
166 |     ax.set_ylabel("score")
167 |     ax.legend(loc="lower right",framealpha=0.4)
168 |     ax.set_ylim(0,1.05)
169 |     ax.set_title("loss=ls,lad")
170 |     plt.suptitle("GradientBoostingRegressor")
171 |     plt.show()
172 | def test_GradientBoostingRegressor_max_features(*data):
173 |     '''
174 |     test the performance with different max_features
175 |     :param data: train_data, test_data, train_value, test_value
176 |     :return: None
177 |     '''
178 |     X_train,X_test,y_train,y_test=data
179 |     fig=plt.figure()
180 |     ax=fig.add_subplot(1,1,1)
181 |     max_features=np.linspace(0.01,1.0)
182 |     testing_scores=[]
183 |     training_scores=[]
184 |     for features in max_features:
185 |         regr=ensemble.GradientBoostingRegressor(max_features=features)
186 |         regr.fit(X_train,y_train)
187 |         training_scores.append(regr.score(X_train,y_train))
188 |         testing_scores.append(regr.score(X_test,y_test))
189 |     ax.plot(max_features,training_scores,label="Training Score")
190 |     ax.plot(max_features,testing_scores,label="Testing Score")
191 |     ax.set_xlabel("max_features")
192 |     ax.set_ylabel("score")
193 |     ax.legend(loc="lower right")
194 |     ax.set_ylim(0,1.05)
195 |     plt.suptitle("GradientBoostingRegressor")
196 |     plt.show()
197 | 
198 | if __name__=='__main__':
199 |     X_train,X_test,y_train,y_test=load_data_regression()
200 |     test_GradientBoostingRegressor(X_train,X_test,y_train,y_test)
201 |     test_GradientBoostingRegressor_num(X_train,X_test,y_train,y_test)
202 |     test_GradientBoostingRegressor_maxdepth(X_train,X_test,y_train,y_test)
203 |     test_GradientBoostingRegressor_learning(X_train,X_test,y_train,y_test)
204 |     test_GradientBoostingRegressor_subsample(X_train,X_test,y_train,y_test)
205 |     test_GradientBoostingRegressor_loss(X_train,X_test,y_train,y_test)
206 |     test_GradientBoostingRegressor_max_features(X_train,X_test,y_train,y_test)
207 | 
--------------------------------------------------------------------------------
/10.Ensemble/README.md:
--------------------------------------------------------------------------------

# Overview

Ensemble learning amounts to fusing multiple algorithms, or in other words a more sophisticated voting mechanism.

# Two main families

Boosting: the individual models depend strongly on one another and are generated sequentially.

Bagging: the individual models have no strong dependence on one another and can be generated in parallel.

# Boosting

A common family of statistical learning methods. The representative algorithm is AdaBoost, which turns weak learners into a strong learner by repeatedly re-weighting the training samples (a minimal sketch of this re-weighting loop is given below, before the code links).

# Bagging

Based on bootstrap sampling; the representative method is the Random Forest.
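The re-weighting idea behind AdaBoost can be shown in a few lines. The sketch below is not one of the repository files; it assumes binary labels in {-1, +1}, uses depth-1 decision trees ("stumps") as the weak learners, and the helper names adaboost_fit / adaboost_predict are made up for this example. Each round fits a stump on the current sample weights, computes its weighted error, derives the learner weight alpha, and then increases the weights of the misclassified samples.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

def adaboost_fit(X, y, n_rounds=20):
    n = len(y)
    w = np.full(n, 1.0 / n)                     # start from uniform sample weights
    stumps, alphas = [], []
    for _ in range(n_rounds):
        stump = DecisionTreeClassifier(max_depth=1)
        stump.fit(X, y, sample_weight=w)
        pred = stump.predict(X)
        err = np.clip(np.sum(w * (pred != y)), 1e-10, 1 - 1e-10)  # weighted error
        alpha = 0.5 * np.log((1 - err) / err)   # weight of this weak learner
        w *= np.exp(-alpha * y * pred)          # misclassified samples get larger weights
        w /= w.sum()
        stumps.append(stump)
        alphas.append(alpha)
    return stumps, alphas

def adaboost_predict(stumps, alphas, X):
    agg = sum(a * s.predict(X) for s, a in zip(stumps, alphas))
    return np.sign(agg)

X, y = make_classification(n_samples=300, random_state=0)
y = 2 * y - 1                                   # map labels {0, 1} -> {-1, +1}
stumps, alphas = adaboost_fit(X, y)
print("train accuracy:", np.mean(adaboost_predict(stumps, alphas, X) == y))
```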
# Code on GitHub

1. AdaBoost classifier

https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.1%20Adaboost%20classifer.py

2. AdaBoost regression

https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.2%20adaboost%20regression.py

3. RF classifier

https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.3%20RF_classifier.py

4. RF regression

https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.4%20RF_regression.py

5. Gradient classifier

https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.5%20Gradient_Classifier.py

6. Gradient regression

https://github.com/JasonK93/ML-note/blob/master/10.Ensemble/10.6%20Gradient_regresion.py

--------------------------------------------------------------------------------
/11. Preprocessing/11.1 Binarize.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import Binarizer
2 | def test_Binarizer():
3 |     '''
4 |     test the Binarizer method
5 |     :return: None
6 |     '''
7 |     X=[ [1,2,3,4,5],
8 |         [5,4,3,2,1],
9 |         [3,3,3,3,3,],
10 |         [1,1,1,1,1] ]
11 |     print("before transform:",X)
12 |     binarizer=Binarizer(threshold=2.5)
13 |     print("after transform:",binarizer.transform(X))
14 | 
15 | if __name__=='__main__':
16 |     test_Binarizer()
--------------------------------------------------------------------------------
/11. Preprocessing/11.2 One-hot encoder.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import OneHotEncoder
2 | def test_OneHotEncoder():
3 |     '''
4 |     test the OneHotEncoder method
5 |     :return: None
6 |     '''
7 |     X=[ [1,2,3,4,5],
8 |         [5,4,3,2,1],
9 |         [3,3,3,3,3,],
10 |         [1,1,1,1,1] ]
11 |     print("before transform:",X)
12 |     encoder=OneHotEncoder(sparse=False)
13 |     encoder.fit(X)
14 |     print("active_features_:",encoder.active_features_)
15 |     print("feature_indices_:",encoder.feature_indices_)
16 |     print("n_values_:",encoder.n_values_)
17 |     print("after transform:",encoder.transform( [[1,2,3,4,5]]))
18 | if __name__=='__main__':
19 |     test_OneHotEncoder()
--------------------------------------------------------------------------------
/11. Preprocessing/11.3 normalize.py:
--------------------------------------------------------------------------------
1 | from sklearn.preprocessing import Normalizer
2 | def test_Normalizer():
3 |     '''
4 |     test the Normalizer method (L2 normalization of each sample)
5 |     :return: None
6 |     '''
7 |     X=[ [1,2,3,4,5],
8 |         [5,4,3,2,1],
9 |         [1,3,5,2,4,],
10 |         [2,4,1,3,5] ]
11 |     print("before transform:",X)
12 |     normalizer=Normalizer(norm='l2')
13 |     print("after transform:",normalizer.transform(X))
14 | 
15 | if __name__=='__main__':
16 |     test_Normalizer()
--------------------------------------------------------------------------------
/11.
Preprocessing/11.4 standardize.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler,StandardScaler 2 | 3 | def test_MinMaxScaler(): 4 | ''' 5 | test the method of MinMax Scaler 6 | :return: None 7 | ''' 8 | X=[ [1,5,1,2,10], 9 | [2,6,3,2,7], 10 | [3,7,5,6,4,], 11 | [4,8,7,8,1] ] 12 | print("before transform:",X) 13 | scaler=MinMaxScaler(feature_range=(0,2)) 14 | scaler.fit(X) 15 | print("min_ is :",scaler.min_) 16 | print("scale_ is :",scaler.scale_) 17 | print("data_max_ is :",scaler.data_max_) 18 | print("data_min_ is :",scaler.data_min_) 19 | print("data_range_ is :",scaler.data_range_) 20 | print("after transform:",scaler.transform(X)) 21 | def test_MaxAbsScaler(): 22 | ''' 23 | test the method of MaxAbs Scaler 24 | 25 | :return: None 26 | ''' 27 | X=[ [1,5,1,2,10], 28 | [2,6,3,2,7], 29 | [3,7,5,6,4,], 30 | [4,8,7,8,1] ] 31 | print("before transform:",X) 32 | scaler=MaxAbsScaler() 33 | scaler.fit(X) 34 | print("scale_ is :",scaler.scale_) 35 | print("max_abs_ is :",scaler.max_abs_) 36 | print("after transform:",scaler.transform(X)) 37 | def test_StandardScaler(): 38 | ''' 39 | test the method of Standard Scaler 40 | :return: None 41 | ''' 42 | X=[ [1,5,1,2,10], 43 | [2,6,3,2,7], 44 | [3,7,5,6,4,], 45 | [4,8,7,8,1] ] 46 | print("before transform:",X) 47 | scaler=StandardScaler() 48 | scaler.fit(X) 49 | print("scale_ is :",scaler.scale_) 50 | print("mean_ is :",scaler.mean_) 51 | print("var_ is :",scaler.var_) 52 | print("after transform:",scaler.transform(X)) 53 | 54 | if __name__=='__main__': 55 | test_MinMaxScaler() 56 | test_MaxAbsScaler() 57 | test_MaxAbsScaler() -------------------------------------------------------------------------------- /11. Preprocessing/11.5 feature_seleticon_filter.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_selection import VarianceThreshold,SelectKBest,f_classif 2 | 3 | def test_VarianceThreshold(): 4 | ''' 5 | test the method of VarianceThreshold 6 | :return: None 7 | ''' 8 | X=[[100,1,2,3], 9 | [100,4,5,6], 10 | [100,7,8,9], 11 | [101,11,12,13]] 12 | selector=VarianceThreshold(1) 13 | selector.fit(X) 14 | print("Variances is %s"%selector.variances_) 15 | print("After transform is %s"%selector.transform(X)) 16 | print("The surport is %s"%selector.get_support(True)) 17 | print("After reverse transform is %s"% 18 | selector.inverse_transform(selector.transform(X))) 19 | def test_SelectKBest(): 20 | ''' 21 | test the method of SelectKBert 22 | :return: None 23 | ''' 24 | X=[ [1,2,3,4,5], 25 | [5,4,3,2,1], 26 | [3,3,3,3,3,], 27 | [1,1,1,1,1] ] 28 | y=[0,1,0,1] 29 | print("before transform:",X) 30 | selector=SelectKBest(score_func=f_classif,k=3) 31 | selector.fit(X,y) 32 | print("scores_:",selector.scores_) 33 | print("pvalues_:",selector.pvalues_) 34 | print("selected index:",selector.get_support(True)) 35 | print("after transform:",selector.transform(X)) 36 | if __name__=='__main__': 37 | test_VarianceThreshold() 38 | test_SelectKBest() -------------------------------------------------------------------------------- /11. 
Preprocessing/11.6 feature_selection_bagging.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_selection import RFE,RFECV 2 | from sklearn.svm import LinearSVC 3 | from sklearn.datasets import load_iris 4 | from sklearn import cross_validation 5 | 6 | def test_RFE(): 7 | ''' 8 | test the method of RFE, the number of feature aim to 2 9 | :return: None 10 | ''' 11 | iris=load_iris() 12 | X=iris.data 13 | y=iris.target 14 | estimator=LinearSVC() 15 | selector=RFE(estimator=estimator,n_features_to_select=2) 16 | selector.fit(X,y) 17 | print("N_features %s"%selector.n_features_) 18 | print("Support is %s"%selector.support_) 19 | print("Ranking %s"%selector.ranking_) 20 | def test_RFECV(): 21 | ''' 22 | test the method of RFECV 23 | :return: None 24 | ''' 25 | iris=load_iris() 26 | X=iris.data 27 | y=iris.target 28 | estimator=LinearSVC() 29 | selector=RFECV(estimator=estimator,cv=3) 30 | selector.fit(X,y) 31 | print("N_features %s"%selector.n_features_) 32 | print("Support is %s"%selector.support_) 33 | print("Ranking %s"%selector.ranking_) 34 | print("Grid Scores %s"%selector.grid_scores_) 35 | def test_compare_with_no_feature_selection(): 36 | ''' 37 | compare the result before the selection and after 38 | :return: None 39 | ''' 40 | iris=load_iris() 41 | X,y=iris.data,iris.target 42 | estimator=LinearSVC() 43 | selector=RFE(estimator=estimator,n_features_to_select=2) 44 | X_t=selector.fit_transform(X,y) 45 | X_train,X_test,y_train,y_test=cross_validation.train_test_split(X, y, 46 | test_size=0.25,random_state=0,stratify=y) 47 | X_train_t,X_test_t,y_train_t,y_test_t=cross_validation.train_test_split(X_t, y, 48 | test_size=0.25,random_state=0,stratify=y) 49 | clf=LinearSVC() 50 | clf_t=LinearSVC() 51 | clf.fit(X_train,y_train) 52 | clf_t.fit(X_train_t,y_train_t) 53 | print("Original DataSet: test score=%s"%(clf.score(X_test,y_test))) 54 | print("Selected DataSet: test score=%s"%(clf_t.score(X_test_t,y_test_t))) 55 | if __name__=='__main__': 56 | test_RFE() 57 | test_compare_with_no_feature_selection() 58 | test_RFECV() -------------------------------------------------------------------------------- /11. 
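The wrapper-style selection above (RFE / RFECV is usually described as a wrapper method rather than bagging, despite the file name) works unchanged in current scikit-learn, but the sklearn.cross_validation module it imports was removed in version 0.20. A small sketch of the same select-then-classify idea using the model_selection import path (iris data and LinearSVC are kept from the original; the rest is illustrative):

from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
X_sel = RFE(estimator=LinearSVC(), n_features_to_select=2).fit_transform(X, y)  # keep 2 features
X_train, X_test, y_train, y_test = train_test_split(
    X_sel, y, test_size=0.25, random_state=0, stratify=y)
clf = LinearSVC().fit(X_train, y_train)
print("test score with the 2 selected features:", clf.score(X_test, y_test))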
Preprocessing/11.7 feature_selection_embeded.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_selection import SelectFromModel 2 | from sklearn.svm import LinearSVC 3 | from sklearn.datasets import load_digits,load_diabetes 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from sklearn.linear_model import Lasso 7 | 8 | def test_SelectFromModel(): 9 | ''' 10 | test the method of SelectFromModel 11 | :return: None 12 | ''' 13 | digits=load_digits() 14 | X=digits.data 15 | y=digits.target 16 | estimator=LinearSVC(penalty='l1',dual=False) 17 | selector=SelectFromModel(estimator=estimator,threshold='mean') 18 | selector.fit(X,y) 19 | selector.transform(X) 20 | print("Threshold %s"%selector.threshold_) 21 | print("Support is %s"%selector.get_support(indices=True)) 22 | def test_Lasso(*data): 23 | ''' 24 | test the correlation between alpha and sparse condition 25 | :param data: train_data, test_data, train_value, test_value 26 | :return: None 27 | ''' 28 | X,y=data 29 | alphas=np.logspace(-2,2) 30 | zeros=[] 31 | for alpha in alphas: 32 | regr=Lasso(alpha=alpha) 33 | regr.fit(X,y) 34 | num=0 35 | for ele in regr.coef_: 36 | if abs(ele) < 1e-5:num+=1 37 | zeros.append(num) 38 | fig=plt.figure() 39 | ax=fig.add_subplot(1,1,1) 40 | ax.plot(alphas,zeros) 41 | ax.set_xlabel(r"$\alpha$") 42 | ax.set_xscale("log") 43 | ax.set_ylim(0,X.shape[1]+1) 44 | ax.set_ylabel("zeros in coef") 45 | ax.set_title("Sparsity In Lasso") 46 | plt.show() 47 | def test_LinearSVC(*data): 48 | ''' 49 | test the correlation between C and sparse condition 50 | :param data: train_data, test_data, train_value, test_value 51 | :return: None 52 | ''' 53 | X,y=data 54 | Cs=np.logspace(-2,2) 55 | zeros=[] 56 | for C in Cs: 57 | clf=LinearSVC(C=C,penalty='l1',dual=False) 58 | clf.fit(X,y) 59 | 60 | num=0 61 | for row in clf.coef_: 62 | for ele in row: 63 | if abs(ele) < 1e-5:num+=1 64 | zeros.append(num) 65 | 66 | fig=plt.figure() 67 | ax=fig.add_subplot(1,1,1) 68 | ax.plot(Cs,zeros) 69 | ax.set_xlabel("C") 70 | ax.set_xscale("log") 71 | ax.set_ylabel("zeros in coef") 72 | ax.set_title("Sparsity In SVM") 73 | plt.show() 74 | if __name__=='__main__': 75 | test_SelectFromModel() 76 | data=load_diabetes() 77 | test_Lasso(data.data,data.target) 78 | data=load_digits() 79 | test_LinearSVC(data.data,data.target) -------------------------------------------------------------------------------- /11. 
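SelectFromModel above relies on the fitted model's coefficients, so the same embedded approach works with any estimator exposing coef_ or feature_importances_. A short sketch with Lasso on the diabetes data (alpha=0.1 and threshold='mean' are arbitrary illustrative choices):

from sklearn.datasets import load_diabetes
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

data = load_diabetes()
selector = SelectFromModel(Lasso(alpha=0.1), threshold='mean')
X_reduced = selector.fit_transform(data.data, data.target)
print("kept feature indices:", selector.get_support(indices=True))
print("shape before/after:", data.data.shape, X_reduced.shape)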
Preprocessing/11.8 pipeline.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from sklearn.svm import LinearSVC 4 | from sklearn.datasets import load_digits 5 | from sklearn import cross_validation 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.pipeline import Pipeline 8 | def test_Pipeline(data): 9 | ''' 10 | test the pipeline 11 | :param data: train_data, test_data, train_value, test_value 12 | :return: None 13 | ''' 14 | X_train,X_test,y_train,y_test=data 15 | steps=[("Linear_SVM",LinearSVC(C=1,penalty='l1',dual=False)), 16 | ("LogisticRegression",LogisticRegression(C=1))] 17 | pipeline=Pipeline(steps) 18 | pipeline.fit(X_train,y_train) 19 | print("Named steps:",pipeline.named_steps) 20 | print("Pipeline Score:",pipeline.score(X_test,y_test)) 21 | if __name__=='__main__': 22 | data=load_digits() 23 | test_Pipeline(cross_validation.train_test_split(data.data, data.target,test_size=0.25 24 | ,random_state=0,stratify=data.target)) -------------------------------------------------------------------------------- /11. Preprocessing/11.9 dictionary learning.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import DictionaryLearning 2 | 3 | def test_DictionaryLearning(): 4 | ''' 5 | test the DictionaryLearning method 6 | :return: None 7 | ''' 8 | X=[[1,2,3,4,5], 9 | [6,7,8,9,10], 10 | [10,9,8,7,6,], 11 | [5,4,3,2,1] ] 12 | print("before transform:",X) 13 | dct=DictionaryLearning(n_components=3) 14 | dct.fit(X) 15 | print("components is :",dct.components_) 16 | print("after transform:",dct.transform(X)) 17 | 18 | if __name__=='__main__': 19 | test_DictionaryLearning() -------------------------------------------------------------------------------- /11. Preprocessing/README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Data preprocessing plays a very important role in big-data analysis, because the data we obtain is often dirty, that is, raw data. The first step is therefore to turn this raw data into structured data that can be processed in a uniform way. 4 | 5 | # Typical workflow 6 | 7 | Remove unique (identifier) attributes; handle missing values; encode categorical attributes; standardize and normalize the data; select features; apply principal component analysis and dimensionality reduction. 8 | 9 | # Example code: GitHub: 10 | 11 | Preprocessing: 12 | 13 | https://github.com/JasonK93/ML-note/tree/master/11.%20Preprocessing 14 | 15 | -------------------------------------------------------------------------------- /12. Model evaluation/12.1 Loss function.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import zero_one_loss,log_loss 2 | 3 | 4 | def test_zero_one_loss(): 5 | ''' 6 | test the 0-1 loss function 7 | :return: None 8 | ''' 9 | y_true=[1,1,1,1,1,0,0,0,0,0] 10 | y_pred=[0,0,0,1,1,1,1,1,0,0] 11 | print("zero_one_loss:",zero_one_loss(y_true,y_pred,normalize=True)) 12 | print("zero_one_loss:",zero_one_loss(y_true,y_pred,normalize=False)) 13 | def test_log_loss(): 14 | ''' 15 | test the log loss function 16 | :return: None 17 | ''' 18 | y_true=[1, 1, 1, 0, 0, 0] 19 | y_pred=[[0.1, 0.9], 20 | [0.2, 0.8], 21 | [0.3, 0.7], 22 | [0.7, 0.3], 23 | [0.8, 0.2], 24 | [0.9, 0.1]] 25 | print("log_loss:",log_loss(y_true,y_pred,normalize=True)) 26 | print("log_loss:",log_loss(y_true,y_pred,normalize=False)) 27 | 28 | if __name__=="__main__": 29 | test_zero_one_loss() 30 | test_log_loss() -------------------------------------------------------------------------------- /12. 
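The two normalize settings in 12.1 above only decide whether the 0-1 loss is averaged or summed; a plain NumPy restatement of what zero_one_loss computes for that example:

import numpy as np

y_true = np.array([1,1,1,1,1,0,0,0,0,0])
y_pred = np.array([0,0,0,1,1,1,1,1,0,0])
errors = (y_true != y_pred)
print(errors.mean())   # 0.6 -> zero_one_loss(..., normalize=True), the error rate
print(errors.sum())    # 6   -> zero_one_loss(..., normalize=False), the error count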
Model evaluation/12.2 data split.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,LeaveOneOut\ 2 | ,cross_val_score 3 | import numpy as np 4 | def test_train_test_split(): 5 | ''' 6 | test train_test_split method 7 | :return: None 8 | ''' 9 | X=[[1,2,3,4], 10 | [11,12,13,14], 11 | [21,22,23,24], 12 | [31,32,33,34], 13 | [41,42,43,44], 14 | [51,52,53,54], 15 | [61,62,63,64], 16 | [71,72,73,74]] 17 | y=[1,1,0,0,1,1,0,0] 18 | 19 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.4, random_state=0) 20 | print("X_train=",X_train) 21 | print("X_test=",X_test) 22 | print("y_train=",y_train) 23 | print("y_test=",y_test) 24 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.4, 25 | random_state=0,stratify=y) 26 | print("Stratify:X_train=",X_train) 27 | print("Stratify:X_test=",X_test) 28 | print("Stratify:y_train=",y_train) 29 | print("Stratify:y_test=",y_test) 30 | def test_KFold(): 31 | ''' 32 | test Kfold 33 | :return: None 34 | ''' 35 | X=np.array([[1,2,3,4], 36 | [11,12,13,14], 37 | [21,22,23,24], 38 | [31,32,33,34], 39 | [41,42,43,44], 40 | [51,52,53,54], 41 | [61,62,63,64], 42 | [71,72,73,74], 43 | [81,82,83,84]]) 44 | y=np.array([1,1,0,0,1,1,0,0,1]) 45 | 46 | folder=KFold(n_splits=3,random_state=0,shuffle=False) 47 | for train_index,test_index in folder.split(X,y): 48 | print("Train Index:",train_index) 49 | print("Test Index:",test_index) 50 | print("X_train:",X[train_index]) 51 | print("X_test:",X[test_index]) 52 | print("") 53 | 54 | shuffle_folder=KFold(n_splits=3,random_state=0,shuffle=True) 55 | for train_index,test_index in shuffle_folder.split(X,y): 56 | print("Shuffled Train Index:",train_index) 57 | print("Shuffled Test Index:",test_index) 58 | print("Shuffled X_train:",X[train_index]) 59 | print("Shuffled X_test:",X[test_index]) 60 | print("") 61 | def test_StratifiedKFold(): 62 | ''' 63 | test Stratified Kfold 64 | :return: None 65 | ''' 66 | X=np.array([[1,2,3,4], 67 | [11,12,13,14], 68 | [21,22,23,24], 69 | [31,32,33,34], 70 | [41,42,43,44], 71 | [51,52,53,54], 72 | [61,62,63,64], 73 | [71,72,73,74]]) 74 | 75 | y=np.array([1,1,0,0,1,1,0,0]) 76 | 77 | folder=KFold(n_splits=4,random_state=0,shuffle=False) 78 | stratified_folder=StratifiedKFold(n_splits=4,random_state=0,shuffle=False) 79 | for train_index,test_index in folder.split(X,y): 80 | print("Train Index:",train_index) 81 | print("Test Index:",test_index) 82 | print("y_train:",y[train_index]) 83 | print("y_test:",y[test_index]) 84 | print("") 85 | 86 | for train_index,test_index in stratified_folder.split(X,y): 87 | print("Stratified Train Index:",train_index) 88 | print("Stratified Test Index:",test_index) 89 | print("Stratified y_train:",y[train_index]) 90 | print("Stratified y_test:",y[test_index]) 91 | print("") 92 | 93 | def test_cross_val_score(): 94 | ''' 95 | test cross_val_score 96 | :return: None 97 | ''' 98 | from sklearn.datasets import load_digits 99 | from sklearn.svm import LinearSVC 100 | 101 | digits=load_digits() 102 | X=digits.data 103 | y=digits.target 104 | 105 | result=cross_val_score(LinearSVC(),X,y,cv=10) 106 | print("Cross Val Score is:",result) 107 | 108 | 109 | if __name__=='__main__': 110 | test_train_test_split() 111 | test_KFold() 112 | test_StratifiedKFold() 113 | test_cross_val_score() -------------------------------------------------------------------------------- /12. 
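Two remarks on 12.2 above: recent scikit-learn releases reject random_state=0 together with shuffle=False (the seed only has an effect when shuffling), and cross_val_score is essentially the manual fold loop written out. A compact sketch of that equivalence, using 3 stratified folds on iris for brevity (the fold count and max_iter value are arbitrary):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
scores = []
for train_idx, test_idx in StratifiedKFold(n_splits=3).split(X, y):
    clf = LinearSVC(max_iter=5000).fit(X[train_idx], y[train_idx])   # fit on the training folds
    scores.append(clf.score(X[test_idx], y[test_idx]))               # score on the held-out fold
print(np.mean(scores), scores)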
Model evaluation/12.3 validation_curve.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.datasets import load_digits 4 | from sklearn.svm import LinearSVC 5 | from sklearn.learning_curve import validation_curve 6 | 7 | def test_validation_curve(): 8 | ''' 9 | test validation_curve with LinerSVC and different C 10 | :return: None 11 | ''' 12 | digits = load_digits() 13 | X,y=digits.data,digits.target 14 | param_name="C" 15 | param_range = np.logspace(-2, 2) 16 | train_scores, test_scores = validation_curve(LinearSVC(), X, y, param_name=param_name, 17 | param_range=param_range,cv=10, scoring="accuracy") 18 | 19 | train_scores_mean = np.mean(train_scores, axis=1) 20 | train_scores_std = np.std(train_scores, axis=1) 21 | test_scores_mean = np.mean(test_scores, axis=1) 22 | test_scores_std = np.std(test_scores, axis=1) 23 | 24 | fig=plt.figure() 25 | ax=fig.add_subplot(1,1,1) 26 | 27 | ax.semilogx(param_range, train_scores_mean, label="Training Accuracy", color="r") 28 | ax.fill_between(param_range, train_scores_mean - train_scores_std, 29 | train_scores_mean + train_scores_std, alpha=0.2, color="r") 30 | ax.semilogx(param_range, test_scores_mean, label="Testing Accuracy", color="g") 31 | ax.fill_between(param_range, test_scores_mean - test_scores_std, 32 | test_scores_mean + test_scores_std, alpha=0.2, color="g") 33 | 34 | ax.set_title("Validation Curve with LinearSVC") 35 | ax.set_xlabel("C") 36 | ax.set_ylabel("Score") 37 | ax.set_ylim(0,1.1) 38 | ax.legend(loc='best') 39 | plt.show() 40 | 41 | if __name__=='__main__': 42 | test_validation_curve() -------------------------------------------------------------------------------- /12. Model evaluation/12.4 grid_search.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_digits 2 | from sklearn.linear_model import LogisticRegression 3 | from sklearn.model_selection import GridSearchCV,RandomizedSearchCV 4 | from sklearn.metrics import classification_report 5 | from sklearn.model_selection import train_test_split 6 | import scipy 7 | 8 | def test_GridSearchCV(): 9 | ''' 10 | Use the GridSearchCV, and LogisticRegression to improve C, penalty, multi_class 11 | :return: None 12 | ''' 13 | digits = load_digits() 14 | X_train,X_test,y_train,y_test=train_test_split(digits.data, digits.target,test_size=0.25, 15 | random_state=0,stratify=digits.target) 16 | tuned_parameters = [{'penalty': ['l1','l2'], 17 | 'C': [0.01,0.05,0.1,0.5,1,5,10,50,100], 18 | 'solver':['liblinear'], 19 | 'multi_class': ['ovr']}, 20 | 21 | {'penalty': ['l2'], 22 | 'C': [0.01,0.05,0.1,0.5,1,5,10,50,100], 23 | 'solver':['lbfgs'], 24 | 'multi_class': ['ovr','multinomial']}, 25 | ] 26 | clf=GridSearchCV(LogisticRegression(tol=1e-6),tuned_parameters,cv=10) 27 | clf.fit(X_train,y_train) 28 | print("Best parameters set found:",clf.best_params_) 29 | print("Grid scores:") 30 | for params, mean_score, scores in clf.grid_scores_: 31 | print("\t%0.3f (+/-%0.03f) for %s" % (mean_score, scores.std() * 2, params)) 32 | 33 | print("Optimized Score:",clf.score(X_test,y_test)) 34 | print("Detailed classification report:") 35 | y_true, y_pred = y_test, clf.predict(X_test) 36 | print(classification_report(y_true, y_pred)) 37 | 38 | def test_RandomizedSearchCV(): 39 | 40 | ''' 41 | Use RandomizedSearchCV and LogisticRegression, to improve C, multi_class. 
42 | :return: None 43 | ''' 44 | digits = load_digits() 45 | X_train,X_test,y_train,y_test=train_test_split(digits.data, digits.target, 46 | test_size=0.25,random_state=0,stratify=digits.target) 47 | 48 | tuned_parameters ={ 'C': scipy.stats.expon(scale=100), 49 | 'multi_class': ['ovr','multinomial']} 50 | clf=RandomizedSearchCV(LogisticRegression(penalty='l2',solver='lbfgs',tol=1e-6), 51 | tuned_parameters,cv=10,scoring="accuracy",n_iter=100) 52 | clf.fit(X_train,y_train) 53 | print("Best parameters set found:",clf.best_params_) 54 | print("Randomized Grid scores:") 55 | for params, mean_score, scores in clf.grid_scores_: 56 | print("\t%0.3f (+/-%0.03f) for %s" % (mean_score, scores.std() * 2, params)) 57 | 58 | print("Optimized Score:",clf.score(X_test,y_test)) 59 | print("Detailed classification report:") 60 | y_true, y_pred = y_test, clf.predict(X_test) 61 | print(classification_report(y_true, y_pred)) 62 | 63 | if __name__=='__main__': 64 | test_GridSearchCV() 65 | test_RandomizedSearchCV() -------------------------------------------------------------------------------- /12. Model evaluation/12.5 classification_metrics.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score\ 3 | ,fbeta_score,classification_report,confusion_matrix,precision_recall_curve,roc_auc_score\ 4 | ,roc_curve 5 | from sklearn.datasets import load_iris 6 | from sklearn.multiclass import OneVsRestClassifier 7 | from sklearn.svm import SVC 8 | from sklearn.model_selection import train_test_split 9 | import matplotlib.pyplot as plt 10 | from sklearn.preprocessing import label_binarize 11 | import numpy as np 12 | 13 | 14 | def test_accuracy_score(): 15 | 16 | y_true=[1,1,1,1,1,0,0,0,0,0] 17 | y_pred=[0,0,1,1,0,0,1,1,0,0] 18 | print('Accuracy Score(normalize=True):',accuracy_score(y_true,y_pred,normalize=True)) 19 | print('Accuracy Score(normalize=False):',accuracy_score(y_true,y_pred,normalize=False)) 20 | 21 | def test_precision_score(): 22 | 23 | y_true=[1,1,1,1,1,0,0,0,0,0] 24 | y_pred=[0,0,1,1,0,0,0,0,0,0] 25 | print('Accuracy Score:',accuracy_score(y_true,y_pred,normalize=True)) 26 | print('Precision Score:',precision_score(y_true,y_pred)) 27 | def test_recall_score(): 28 | 29 | y_true=[1,1,1,1,1,0,0,0,0,0] 30 | y_pred=[0,0,1,1,0,0,0,0,0,0] 31 | print('Accuracy Score:',accuracy_score(y_true,y_pred,normalize=True)) 32 | print('Precision Score:',precision_score(y_true,y_pred)) 33 | print('Recall Score:',recall_score(y_true,y_pred)) 34 | def test_f1_score(): 35 | 36 | y_true=[1,1,1,1,1,0,0,0,0,0] 37 | y_pred=[0,0,1,1,0,0,0,0,0,0] 38 | print('Accuracy Score:',accuracy_score(y_true,y_pred,normalize=True)) 39 | print('Precision Score:',precision_score(y_true,y_pred)) 40 | print('Recall Score:',recall_score(y_true,y_pred)) 41 | print('F1 Score:',f1_score(y_true,y_pred)) 42 | def test_fbeta_score(): 43 | 44 | y_true=[1,1,1,1,1,0,0,0,0,0] 45 | y_pred=[0,0,1,1,0,0,0,0,0,0] 46 | print('Accuracy Score:',accuracy_score(y_true,y_pred,normalize=True)) 47 | print('Precision Score:',precision_score(y_true,y_pred)) 48 | print('Recall Score:',recall_score(y_true,y_pred)) 49 | print('F1 Score:',f1_score(y_true,y_pred)) 50 | print('Fbeta Score(beta=0.001):',fbeta_score(y_true,y_pred,beta=0.001)) 51 | print('Fbeta Score(beta=1):',fbeta_score(y_true,y_pred,beta=1)) 52 | print('Fbeta Score(beta=10):',fbeta_score(y_true,y_pred,beta=10)) 53 | print('Fbeta Score(beta=10000):',fbeta_score(y_true,y_pred,beta=10000)) 54 | 
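# Background on the fbeta_score calls above: F_beta is the weighted harmonic mean of
# precision (P) and recall (R),
#     F_beta = (1 + beta**2) * P * R / (beta**2 * P + R)
# so beta close to 0 recovers precision and a very large beta approaches recall. For the
# y_true / y_pred used in these examples P = 1.0 and R = 0.4, hence
# F1 = 2 * 1.0 * 0.4 / 1.4 ~= 0.571, Fbeta(0.001) ~= 1.0 and Fbeta(10000) ~= 0.4,
# which matches the printed scores.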
def test_classification_report(): 55 | 56 | y_true=[1,1,1,1,1,0,0,0,0,0] 57 | y_pred=[0,0,1,1,0,0,0,0,0,0] 58 | print('Classification Report:\n',classification_report(y_true,y_pred, 59 | target_names=["class_0","class_1"])) 60 | def test_confusion_matrix(): 61 | 62 | y_true=[1,1,1,1,1,0,0,0,0,0] 63 | y_pred=[0,0,1,1,0,0,0,0,0,0] 64 | print('Confusion Matrix:\n',confusion_matrix(y_true,y_pred,labels=[0,1])) 65 | def test_precision_recall_curve(): 66 | 67 | iris=load_iris() 68 | X=iris.data 69 | y=iris.target 70 | 71 | y = label_binarize(y, classes=[0, 1, 2]) 72 | n_classes = y.shape[1] 73 | 74 | np.random.seed(0) 75 | n_samples, n_features = X.shape 76 | X = np.c_[X, np.random.randn(n_samples, 200 * n_features)] 77 | 78 | X_train,X_test,y_train,y_test=train_test_split(X,y, 79 | test_size=0.5,random_state=0) 80 | 81 | clf=OneVsRestClassifier(SVC(kernel='linear', probability=True,random_state=0)) 82 | clf.fit(X_train,y_train) 83 | y_score = clf.fit(X_train, y_train).decision_function(X_test) 84 | 85 | fig=plt.figure() 86 | ax=fig.add_subplot(1,1,1) 87 | precision = dict() 88 | recall = dict() 89 | for i in range(n_classes): 90 | precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], 91 | y_score[:, i]) 92 | ax.plot(recall[i],precision[i],label="target=%s"%i) 93 | ax.set_xlabel("Recall Score") 94 | ax.set_ylabel("Precision Score") 95 | ax.set_title("P-R") 96 | ax.legend(loc='best') 97 | ax.set_xlim(0,1.1) 98 | ax.set_ylim(0,1.1) 99 | ax.grid() 100 | plt.show() 101 | def test_roc_auc_score(): 102 | 103 | iris=load_iris() 104 | X=iris.data 105 | y=iris.target 106 | 107 | y = label_binarize(y, classes=[0, 1, 2]) 108 | n_classes = y.shape[1] 109 | 110 | np.random.seed(0) 111 | n_samples, n_features = X.shape 112 | X = np.c_[X, np.random.randn(n_samples, 200 * n_features)] 113 | 114 | X_train,X_test,y_train,y_test=train_test_split(X,y, 115 | test_size=0.5,random_state=0) 116 | 117 | clf=OneVsRestClassifier(SVC(kernel='linear', probability=True,random_state=0)) 118 | clf.fit(X_train,y_train) 119 | y_score = clf.fit(X_train, y_train).decision_function(X_test) 120 | 121 | fig=plt.figure() 122 | ax=fig.add_subplot(1,1,1) 123 | fpr = dict() 124 | tpr = dict() 125 | roc_auc=dict() 126 | for i in range(n_classes): 127 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i],y_score[:, i]) 128 | roc_auc[i] = roc_auc_score(fpr[i], tpr[i]) 129 | ax.plot(fpr[i],tpr[i],label="target=%s,auc=%s"%(i,roc_auc[i])) 130 | ax.plot([0, 1], [0, 1], 'k--') 131 | ax.set_xlabel("FPR") 132 | ax.set_ylabel("TPR") 133 | ax.set_title("ROC") 134 | ax.legend(loc="best") 135 | ax.set_xlim(0,1.1) 136 | ax.set_ylim(0,1.1) 137 | ax.grid() 138 | plt.show() 139 | 140 | if __name__=='__main__': 141 | test_accuracy_score() 142 | test_precision_score() 143 | test_recall_score() 144 | test_f1_score() 145 | test_fbeta_score() 146 | test_classification_report() 147 | test_confusion_matrix() 148 | test_precision_recall_curve() 149 | # test_roc_auc_score() -------------------------------------------------------------------------------- /12. 
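One caveat on 12.5 above: test_roc_auc_score passes curve points into roc_auc_score, but that function expects (y_true, y_score) and fails on continuous fpr values; the area under points that already lie on the curve is computed with sklearn.metrics.auc instead, which is presumably why the call is commented out in __main__. A tiny self-contained check of the two equivalent routes (toy labels and scores):

import numpy as np
from sklearn.metrics import auc, roc_auc_score, roc_curve

y_true  = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, _ = roc_curve(y_true, y_score)
print(auc(fpr, tpr))                   # area under the ROC curve built from its points
print(roc_auc_score(y_true, y_score))  # same value (0.75), computed from labels and scores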
Model evaluation/12.6 learning curve.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from sklearn.datasets import load_digits 5 | from sklearn.svm import LinearSVC 6 | from sklearn.learning_curve import learning_curve 7 | 8 | def test_learning_curve(): 9 | 10 | digits = load_digits() 11 | X,y=digits.data,digits.target 12 | 13 | train_sizes=np.linspace(0.1,1.0,endpoint=True,dtype='float') 14 | abs_trains_sizes,train_scores, test_scores = learning_curve(LinearSVC(), 15 | X, y,cv=10, scoring="accuracy",train_sizes=train_sizes) 16 | 17 | train_scores_mean = np.mean(train_scores, axis=1) 18 | train_scores_std = np.std(train_scores, axis=1) 19 | test_scores_mean = np.mean(test_scores, axis=1) 20 | test_scores_std = np.std(test_scores, axis=1) 21 | 22 | fig=plt.figure() 23 | ax=fig.add_subplot(1,1,1) 24 | 25 | ax.plot(abs_trains_sizes, train_scores_mean, label="Training Accuracy", color="r") 26 | ax.fill_between(abs_trains_sizes, train_scores_mean - train_scores_std, 27 | train_scores_mean + train_scores_std, alpha=0.2, color="r") 28 | ax.plot(abs_trains_sizes, test_scores_mean, label="Testing Accuracy", color="g") 29 | ax.fill_between(abs_trains_sizes, test_scores_mean - test_scores_std, 30 | test_scores_mean + test_scores_std, alpha=0.2, color="g") 31 | 32 | ax.set_title("Learning Curve with LinearSVC") 33 | ax.set_xlabel("Sample Nums") 34 | ax.set_ylabel("Score") 35 | ax.set_ylim(0,1.1) 36 | ax.legend(loc='best') 37 | plt.show() 38 | 39 | if __name__=="__main__": 40 | test_learning_curve() -------------------------------------------------------------------------------- /12. Model evaluation/12.7 regression_metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import mean_absolute_error,mean_squared_error 2 | 3 | def test_mean_absolute_error(): 4 | 5 | y_true=[1,1,1,1,1,2,2,2,0,0] 6 | y_pred=[0,0,0,1,1,1,0,0,0,0] 7 | 8 | print("Mean Absolute Error:",mean_absolute_error(y_true,y_pred)) 9 | def test_mean_squared_error(): 10 | 11 | y_true=[1,1,1,1,1,2,2,2,0,0] 12 | y_pred=[0,0,0,1,1,1,0,0,0,0] 13 | 14 | print("Mean Absolute Error:",mean_absolute_error(y_true,y_pred)) 15 | print("Mean Square Error:",mean_squared_error(y_true,y_pred)) 16 | 17 | if __name__=="__main__": 18 | test_mean_absolute_error() 19 | test_mean_squared_error() -------------------------------------------------------------------------------- /12. Model evaluation/README.md: -------------------------------------------------------------------------------- 1 | # 综述 2 | 3 | 模型的评估在机器学习中扮演着很重要的角色,用于分别什么才是好的预测模型。机器学习一般包含两个方面,原型设计和应用。原型设计方面会通过验证和离线评估来选择一个较好的模型。评估方法一般有在线评估和离线评估等。在线评估一般是在应用阶段使用新生成的数据来进行评估并更新模型的过程。 4 | 5 | # 离线评估,在线评估 6 | 7 | 离线评估中,我们一般会使用到 准确率(accuracy), 精确率(precision), 召回率(recall)。而在线评估有用户生命周期价值(Customer Lifetime Value), 广告点击率( Click Through Rate), 用户流失率(Customer Churn Rate) 等等。 8 | 9 | # 损失函数 10 | 11 | 损失函数一般用于度量错误的程度。 常用的有:0-1损失函数, 平方损失函数,绝对损失函数, 对数损失函数。 风险函数定义为损失函数的期望,所以学习的目标也可以是风险函数最小的模型。 12 | 13 | # 模型评估 14 | 15 | 度量因素:训练误差,测试误差。根据这两个因素可以推论是否有过拟合或者欠拟合的情况。评估方法常用有:1.留出法:也可以说是三分法( train data, valid data, test data). 2 交叉验证法(Cross- Validation) 3. 留一法( Leave-One-Out) 4. 
自助法(Boostrapping) 16 | 17 | # 性能度量 18 | 19 | 准确率,错误率, 混淆矩阵,precision, recall, P-R curve( Precision-Recall 曲线, 被包住的性能好), ROC曲线 20 | 21 | 实战代码:GitHub: 22 | 23 | https://github.com/JasonK93/ML-note/tree/master/12.%20Model%20evaluation 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /2.decision tree(DT)/2.1 Decision Tree-Classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | from sklearn.tree import DecisionTreeClassifier 5 | from sklearn import datasets 6 | from sklearn import cross_validation 7 | import matplotlib.pyplot as plt 8 | def load_data(): 9 | ''' 10 | load iris data from sk-learn. this data has 150 samples and 3 class. 11 | return: 12 | 1 array for the classification problem. 13 | train_data, test_data, train_value, test_value 14 | ''' 15 | iris=datasets.load_iris() 16 | X_train=iris.data 17 | y_train=iris.target 18 | return cross_validation.train_test_split(X_train, y_train,test_size=0.25, 19 | random_state=0,stratify=y_train) 20 | def test_DecisionTreeClassifier(*data): 21 | ''' 22 | test decision tree 23 | :param data: train_data, test_data, train_value, test_value 24 | :return: None 25 | ''' 26 | X_train,X_test,y_train,y_test=data 27 | clf = DecisionTreeClassifier() 28 | clf.fit(X_train, y_train) 29 | 30 | print("Training score: {0}".format(clf.score(X_train,y_train))) 31 | print("Testing score: {0}".format(clf.score(X_test,y_test))) 32 | def test_DecisionTreeClassifier_criterion(*data): 33 | ''' 34 | test the performance with different criterion 35 | :param data: train_data, test_data, train_value, test_value 36 | :return: None 37 | ''' 38 | X_train,X_test,y_train,y_test=data 39 | criterions=['gini','entropy'] 40 | for criterion in criterions: 41 | clf = DecisionTreeClassifier(criterion=criterion) 42 | clf.fit(X_train, y_train) 43 | print("criterion:{0}".format(criterion)) 44 | print("Training score: {0}".format(clf.score(X_train,y_train))) 45 | print("Testing score: {0}".format(clf.score(X_test,y_test))) 46 | def test_DecisionTreeClassifier_splitter(*data): 47 | ''' 48 | test the performance with different splitters 49 | :param data: train_data, test_data, train_value, test_value 50 | :return: None 51 | ''' 52 | X_train,X_test,y_train,y_test=data 53 | splitters=['best','random'] 54 | for splitter in splitters: 55 | clf = DecisionTreeClassifier(splitter=splitter) 56 | clf.fit(X_train, y_train) 57 | print("splitter: {0}".format(splitter)) 58 | print("Training score:{0}".format(clf.score(X_train,y_train))) 59 | print("Testing score: {0}".format(clf.score(X_test,y_test))) 60 | def test_DecisionTreeClassifier_depth(*data,maxdepth): 61 | ''' 62 | test the score with different max_depth 63 | :param data: train_data, test_data, train_value, test_value 64 | :param maxdepth: an integer 65 | :return: None 66 | ''' 67 | X_train,X_test,y_train,y_test=data 68 | depths=np.arange(1,maxdepth) 69 | training_scores=[] 70 | testing_scores=[] 71 | for depth in depths: 72 | clf = DecisionTreeClassifier(max_depth=depth) 73 | clf.fit(X_train, y_train) 74 | training_scores.append(clf.score(X_train,y_train)) 75 | testing_scores.append(clf.score(X_test,y_test)) 76 | 77 | ## graph 78 | fig=plt.figure() 79 | ax=fig.add_subplot(1,1,1) 80 | ax.plot(depths,training_scores,label="traing score",marker='o') 81 | ax.plot(depths,testing_scores,label="testing score",marker='*') 82 | ax.set_xlabel("maxdepth") 83 | ax.set_ylabel("score") 84 | 
ax.set_title("Decision Tree Classification") 85 | ax.legend(framealpha=0.5,loc='best') 86 | plt.show() 87 | if __name__=='__main__': 88 | X_train,X_test,y_train,y_test=load_data() 89 | test_DecisionTreeClassifier(X_train,X_test,y_train,y_test) 90 | test_DecisionTreeClassifier_criterion(X_train,X_test,y_train,y_test) 91 | test_DecisionTreeClassifier_splitter(X_train,X_test,y_train,y_test) 92 | test_DecisionTreeClassifier_depth(X_train,X_test,y_train,y_test,maxdepth=100) -------------------------------------------------------------------------------- /2.decision tree(DT)/2.2 Decision Tree- Regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | from sklearn.tree import DecisionTreeRegressor 5 | from sklearn import cross_validation 6 | import matplotlib.pyplot as plt 7 | def creat_data(n): 8 | ''' 9 | generate data from random 10 | :param n: the number of sample 11 | :return: train_data, test_data, train_value, test_value 12 | ''' 13 | np.random.seed(0) 14 | X = 5 * np.random.rand(n, 1) 15 | y = np.sin(X).ravel() 16 | noise_num=(int)(n/5) 17 | y[::5] += 3 * (0.5 - np.random.rand(noise_num)) # add noise every 5 sample 18 | return cross_validation.train_test_split(X, y, 19 | test_size=0.25,random_state=1) 20 | def test_DecisionTreeRegressor(*data): 21 | ''' 22 | test DT regression 23 | :param data: train_data, test_data, train_value, test_value 24 | :return: None 25 | ''' 26 | X_train,X_test,y_train,y_test=data 27 | regr = DecisionTreeRegressor() 28 | regr.fit(X_train, y_train) 29 | print("Training score:{0}".format(regr.score(X_train,y_train))) 30 | print("Testing score:{0}".format(regr.score(X_test,y_test))) 31 | ##graph 32 | fig=plt.figure() 33 | ax=fig.add_subplot(1,1,1) 34 | X = np.arange(0.0, 5.0, 0.01)[:, np.newaxis] 35 | Y = regr.predict(X) 36 | ax.scatter(X_train, y_train, label="train sample",c='g') 37 | ax.scatter(X_test, y_test, label="test sample",c='r') 38 | ax.plot(X, Y, label="predict_value", linewidth=2,alpha=0.5) 39 | ax.set_xlabel("data") 40 | ax.set_ylabel("target") 41 | ax.set_title("Decision Tree Regression") 42 | ax.legend(framealpha=0.5) 43 | plt.show() 44 | def test_DecisionTreeRegressor_splitter(*data): 45 | ''' 46 | test the performance with different splitters 47 | :param data: train_data, test_data, train_value, test_value 48 | :return: None 49 | ''' 50 | X_train,X_test,y_train,y_test=data 51 | splitters=['best','random'] 52 | for splitter in splitters: 53 | regr = DecisionTreeRegressor(splitter=splitter) 54 | regr.fit(X_train, y_train) 55 | print("Splitter {0}".format(splitter)) 56 | print("Training score:{0}".format(regr.score(X_train,y_train))) 57 | print("Testing score:{0}".format(regr.score(X_test,y_test))) 58 | def test_DecisionTreeRegressor_depth(*data,maxdepth): 59 | ''' 60 | test the score with different max_depth 61 | :param data: train_data, test_data, train_value, test_value 62 | :param maxdepth: an integer 63 | :return: None 64 | ''' 65 | X_train,X_test,y_train,y_test=data 66 | depths=np.arange(1,maxdepth) 67 | training_scores=[] 68 | testing_scores=[] 69 | for depth in depths: 70 | regr = DecisionTreeRegressor(max_depth=depth) 71 | regr.fit(X_train, y_train) 72 | training_scores.append(regr.score(X_train,y_train)) 73 | testing_scores.append(regr.score(X_test,y_test)) 74 | 75 | ## graph 76 | fig=plt.figure() 77 | ax=fig.add_subplot(1,1,1) 78 | ax.plot(depths,training_scores,label="traing score") 79 | ax.plot(depths,testing_scores,label="testing score") 80 
| ax.set_xlabel("maxdepth") 81 | ax.set_ylabel("score") 82 | ax.set_title("Decision Tree Regression") 83 | ax.legend(framealpha=0.5) 84 | plt.show() 85 | if __name__=='__main__': 86 | X_train,X_test,y_train,y_test=creat_data(100) 87 | test_DecisionTreeRegressor(X_train,X_test,y_train,y_test) 88 | test_DecisionTreeRegressor_splitter(X_train,X_test,y_train,y_test) 89 | test_DecisionTreeRegressor_depth(X_train,X_test,y_train,y_test,maxdepth=20) 90 | -------------------------------------------------------------------------------- /2.decision tree(DT)/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 简介 3 | 4 | 决策树的功能很强大,是一种有监督的学习方法。决策树既可以用来解决回归问题,也可以解决分类问题。 5 | 6 | # 原理 7 | 8 | 在特征空间上执行递归的二元分割. 。有节点和有向边组成。 9 | 10 | # 步骤 11 | 12 | 特征选择;决策树生成;决策树剪枝。 13 | 14 | 特征选择根据:熵,基尼系数,方差等因素决定。生成决策树的方法有很多,典型的有ID3,和C4.5。 ID3 采用的信息增益作为度量。C4.5采用信息增益比。树剪枝简化了模型,并且某种程度上减少了过拟合的发生。同时树剪枝也是预测误差和数据复杂度之间的一个折中。 15 | 16 | # 实战代码:GitHub 17 | 18 | 1.决策树分类: 19 | 20 | https://github.com/JasonK93/ML-note/blob/master/2.decision%20tree(DT)/2.1%20Decision%20Tree-Classifier.py 21 | 22 | 2.决策树回归: 23 | 24 | https://github.com/JasonK93/ML-note/blob/master/2.decision%20tree(DT)/2.2%20Decision%20Tree-%20Regression.py 25 | 26 | # 决策图 27 | 28 | 决策树生成后可以对相关规则进行可视化,使用函数export_graphviz() 29 | -------------------------------------------------------------------------------- /3.Bayes/3.1 Gaussian Bayes.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets,cross_validation,naive_bayes 2 | import matplotlib.pyplot as plt 3 | 4 | def load_data(): 5 | ''' 6 | reload the digits dataset from sklearn 7 | :return: train_data, test_data, train_value, test_value 8 | ''' 9 | digits=datasets.load_digits() 10 | return cross_validation.train_test_split(digits.data,digits.target, 11 | test_size=0.25,random_state=0,stratify=digits.target) 12 | 13 | def test_GaussianNB(*data): 14 | ''' 15 | Test Gaussian NB 16 | :param data: train_data, test_data, train_value, test_value 17 | :return: None 18 | ''' 19 | X_train,X_test,y_train,y_test=data 20 | cls=naive_bayes.GaussianNB() 21 | cls.fit(X_train,y_train) 22 | print('Training Score: {0}' .format( cls.score(X_train,y_train))) 23 | print('Testing Score: {0}' .format( cls.score(X_test, y_test))) 24 | 25 | def show_digits(): 26 | ''' 27 | graph the first 25 samples in the data set 28 | :return: None 29 | ''' 30 | digits=datasets.load_digits() 31 | fig=plt.figure() 32 | print("vector from images 0:",digits.data[0]) 33 | for i in range(25): 34 | ax=fig.add_subplot(5,5,i+1) 35 | ax.imshow(digits.images[i],cmap=plt.cm.gray_r, interpolation='nearest') 36 | plt.show() 37 | 38 | if __name__=='__main__': 39 | show_digits() 40 | X_train,X_test,y_train,y_test=load_data() 41 | test_GaussianNB(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /3.Bayes/3.2 Multinomial NB.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets,cross_validation,naive_bayes 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | def load_data(): 6 | ''' 7 | reload the digits dataset from sklearn 8 | :return: train_data, test_data, train_value, test_value 9 | ''' 10 | digits=datasets.load_digits() 11 | return cross_validation.train_test_split(digits.data,digits.target, 12 | test_size=0.25,random_state=0,stratify=digits.target) 13 | 14 | def test_MultinomialNB(*data): 15 | ''' 16 | test Multinomial NB 17 | 
:param data: train_data, test_data, train_value, test_value 18 | :return: None 19 | ''' 20 | X_train,X_test,y_train,y_test=data 21 | cls=naive_bayes.MultinomialNB() 22 | cls.fit(X_train,y_train) 23 | print('Training Score: {0}' .format( cls.score(X_train,y_train))) 24 | print('Testing Score: {0}'.format(cls.score(X_test, y_test))) 25 | def test_MultinomialNB_alpha(*data): 26 | ''' 27 | test the performance with different alpha 28 | :param data: train_data, test_data, train_value, test_value 29 | :return: None 30 | ''' 31 | X_train,X_test,y_train,y_test=data 32 | alphas=np.logspace(-2,5,num=200) 33 | train_scores=[] 34 | test_scores=[] 35 | for alpha in alphas: 36 | cls=naive_bayes.MultinomialNB(alpha=alpha) 37 | cls.fit(X_train,y_train) 38 | train_scores.append(cls.score(X_train,y_train)) 39 | test_scores.append(cls.score(X_test, y_test)) 40 | 41 | ## graph 42 | fig=plt.figure() 43 | ax=fig.add_subplot(1,1,1) 44 | ax.plot(alphas,train_scores,label="Training Score") 45 | ax.plot(alphas,test_scores,label="Testing Score") 46 | ax.set_xlabel(r"$\alpha$") 47 | ax.set_ylabel("score") 48 | ax.set_ylim(0,1.0) 49 | ax.set_title("MultinomialNB") 50 | ax.set_xscale("log") 51 | plt.show() 52 | def show_digits(): 53 | ''' 54 | graph the first 25 samples in the data set 55 | :return: None 56 | ''' 57 | digits=datasets.load_digits() 58 | fig=plt.figure() 59 | print("vector from images 0:",digits.data[0]) 60 | for i in range(25): 61 | ax=fig.add_subplot(5,5,i+1) 62 | ax.imshow(digits.images[i],cmap=plt.cm.gray_r, interpolation='nearest') 63 | plt.show() 64 | 65 | if __name__=='__main__': 66 | show_digits() 67 | X_train, X_test, y_train, y_test = load_data() 68 | test_MultinomialNB(X_train, X_test, y_train, y_test) 69 | test_MultinomialNB_alpha(X_train, X_test, y_train, y_test) 70 | -------------------------------------------------------------------------------- /3.Bayes/3.3 Bernoulli NB.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets,cross_validation,naive_bayes 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | def load_data(): 6 | ''' 7 | reload the digits dataset from sklearn 8 | :return: train_data, test_data, train_value, test_value 9 | ''' 10 | digits=datasets.load_digits() 11 | return cross_validation.train_test_split(digits.data,digits.target, 12 | test_size=0.25,random_state=0,stratify=digits.target) 13 | 14 | def test_BernoulliNB(*data): 15 | ''' 16 | test BernoulliNB 17 | :param data: train_data, test_data, train_value, test_value 18 | :return: None 19 | ''' 20 | X_train,X_test,y_train,y_test=data 21 | cls=naive_bayes.BernoulliNB() 22 | cls.fit(X_train,y_train) 23 | print('Training Score: {0}'.format(cls.score(X_train,y_train))) 24 | print('Testing Score: {0}'.format(cls.score(X_test, y_test))) 25 | def test_BernoulliNB_alpha(*data): 26 | ''' 27 | test the performance with different alpha 28 | :param data: train_data, test_data, train_value, test_value 29 | :return: None 30 | ''' 31 | X_train,X_test,y_train,y_test=data 32 | alphas=np.logspace(-2,5,num=200) 33 | train_scores=[] 34 | test_scores=[] 35 | for alpha in alphas: 36 | cls=naive_bayes.BernoulliNB(alpha=alpha) 37 | cls.fit(X_train,y_train) 38 | train_scores.append(cls.score(X_train,y_train)) 39 | test_scores.append(cls.score(X_test, y_test)) 40 | 41 | ## graph 42 | fig=plt.figure() 43 | ax=fig.add_subplot(1,1,1) 44 | ax.plot(alphas,train_scores,label="Training Score") 45 | ax.plot(alphas,test_scores,label="Testing Score") 46 | ax.set_xlabel(r"$\alpha$") 
47 | ax.set_ylabel("score") 48 | ax.set_ylim(0,1.0) 49 | ax.set_title("BernoulliNB") 50 | ax.set_xscale("log") 51 | ax.legend(loc="best") 52 | plt.show() 53 | def test_BernoulliNB_binarize(*data): 54 | ''' 55 | test the performance with different binarize 56 | :param data: train_data, test_data, train_value, test_value 57 | :return: None 58 | ''' 59 | X_train,X_test,y_train,y_test=data 60 | min_x=min(np.min(X_train.ravel()),np.min(X_test.ravel()))-0.1 61 | max_x=max(np.max(X_train.ravel()),np.max(X_test.ravel()))+0.1 62 | binarizes=np.linspace(min_x,max_x,endpoint=True,num=100) 63 | train_scores=[] 64 | test_scores=[] 65 | for binarize in binarizes: 66 | cls=naive_bayes.BernoulliNB(binarize=binarize) 67 | cls.fit(X_train,y_train) 68 | train_scores.append(cls.score(X_train,y_train)) 69 | test_scores.append(cls.score(X_test, y_test)) 70 | 71 | ## graph 72 | fig=plt.figure() 73 | ax=fig.add_subplot(1,1,1) 74 | ax.plot(binarizes,train_scores,label="Training Score") 75 | ax.plot(binarizes,test_scores,label="Testing Score") 76 | ax.set_xlabel("binarize") 77 | ax.set_ylabel("score") 78 | ax.set_ylim(0,1.0) 79 | ax.set_xlim(min_x-1,max_x+1) 80 | ax.set_title("BernoulliNB") 81 | ax.legend(loc="best") 82 | plt.show() 83 | def show_digits(): 84 | ''' 85 | graph the first 25 samples in the data set 86 | :return: None 87 | ''' 88 | digits=datasets.load_digits() 89 | fig=plt.figure() 90 | print("vector from images 0:",digits.data[0]) 91 | for i in range(25): 92 | ax=fig.add_subplot(5,5,i+1) 93 | ax.imshow(digits.images[i],cmap=plt.cm.gray_r, interpolation='nearest') 94 | plt.show() 95 | 96 | if __name__=='__main__': 97 | show_digits() 98 | X_train, X_test, y_train, y_test = load_data() 99 | test_BernoulliNB(X_train, X_test, y_train, y_test) 100 | test_BernoulliNB_alpha(X_train, X_test, y_train, y_test) 101 | test_BernoulliNB_binarize(X_train, X_test, y_train, y_test) -------------------------------------------------------------------------------- /3.Bayes/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 综述 3 | 4 | 贝叶斯分类原理是通过对某对象的先验概率,利用贝叶斯公式计算出后验概率,再选取最大的概率的事件作为分类对象。 5 | 6 | # 分类器 7 | 8 | 1.高斯分类器GaussianNB:条件概率分布满足高斯分布 9 | 10 | https://github.com/JasonK93/ML-note/blob/master/3.Bayes/3.1%20Gaussian%20Bayes.py 11 | 12 | 2.多项式贝叶斯分类器(MultinomialNB):条件概率满足多项式分布 13 | 14 | https://github.com/JasonK93/ML-note/blob/master/3.Bayes/3.2%20Multinomial%20NB.py 15 | 16 | 3.伯努利贝叶斯分类器(BernouliNB):条件概率满足伯努利分布 17 | 18 | https://github.com/JasonK93/ML-note/blob/master/3.Bayes/3.3%20Bernoulli%20NB.py 19 | 20 | # Partial_fit 21 | 22 | 贝叶斯可以处理大规模数据,当完整的训练集无法放入内存中的时候,可以动态的增加数据来进行使用—-online classifier。将一个大数据集分割成数个数据集分块训练。 23 | -------------------------------------------------------------------------------- /4. 
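A minimal sketch of the out-of-core partial_fit workflow mentioned in the Bayes README above: the digits data is fed to MultinomialNB in fixed-size chunks, as one would with a dataset too large for memory (the 500-sample chunk size is an arbitrary illustration):

import numpy as np
from sklearn.datasets import load_digits
from sklearn.naive_bayes import MultinomialNB

X, y = load_digits(return_X_y=True)
clf = MultinomialNB()
classes = np.unique(y)                  # every class label must be declared on the first call
for start in range(0, len(X), 500):     # pretend each slice arrives as a separate batch
    batch = slice(start, start + 500)
    clf.partial_fit(X[batch], y[batch], classes=classes)
print("training accuracy:", clf.score(X, y))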
KNN/4.1 KNN classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import neighbors, datasets,cross_validation 4 | 5 | def load_classification_data(): 6 | ''' 7 | load the digit data 8 | :return: train_data, test_data, train_value, test_value 9 | ''' 10 | digits=datasets.load_digits() 11 | X_train=digits.data 12 | y_train=digits.target 13 | return cross_validation.train_test_split(X_train, y_train,test_size=0.25, 14 | random_state=0,stratify=y_train) 15 | def test_KNeighborsClassifier(*data): 16 | ''' 17 | test KNN classifier 18 | :param data: train_data, test_data, train_value, test_value 19 | :return: None 20 | ''' 21 | X_train,X_test,y_train,y_test=data 22 | clf=neighbors.KNeighborsClassifier() 23 | clf.fit(X_train,y_train) 24 | print("Training Score:{0}".format(clf.score(X_train,y_train))) 25 | print("Testing Score:{0}".format(clf.score(X_test,y_test))) 26 | def test_KNeighborsClassifier_k_w(*data): 27 | ''' 28 | test the performance with different n_neighbors and weights 29 | :param data: train_data, test_data, train_value, test_value 30 | :return: None 31 | ''' 32 | X_train,X_test,y_train,y_test=data 33 | Ks=np.linspace(1,y_train.size,num=100,endpoint=False,dtype='int') 34 | weights=['uniform','distance'] 35 | 36 | fig=plt.figure() 37 | ax=fig.add_subplot(1,1,1) 38 | ### graph 39 | for weight in weights: 40 | training_scores=[] 41 | testing_scores=[] 42 | for K in Ks: 43 | clf=neighbors.KNeighborsClassifier(weights=weight,n_neighbors=K) 44 | clf.fit(X_train,y_train) 45 | testing_scores.append(clf.score(X_test,y_test)) 46 | training_scores.append(clf.score(X_train,y_train)) 47 | ax.plot(Ks,testing_scores,label="testing score:weight={0}".format(weight)) 48 | ax.plot(Ks,training_scores,label="training score:weight={0}".format(weight)) 49 | ax.legend(loc='best') 50 | ax.set_xlabel("K") 51 | ax.set_ylabel("score") 52 | ax.set_ylim(0,1.05) 53 | ax.set_title("KNeighborsClassifier") 54 | plt.show() 55 | def test_KNeighborsClassifier_k_p(*data): 56 | ''' 57 | test the performance with different n_neighbors and p 58 | :param data: train_data, test_data, train_value, test_value 59 | :return: None 60 | ''' 61 | X_train,X_test,y_train,y_test=data 62 | Ks=np.linspace(1,y_train.size,endpoint=False,dtype='int') 63 | Ps=[1,2,10] 64 | 65 | fig=plt.figure() 66 | ax=fig.add_subplot(1,1,1) 67 | ### graph 68 | for P in Ps: 69 | training_scores=[] 70 | testing_scores=[] 71 | for K in Ks: 72 | clf=neighbors.KNeighborsClassifier(p=P,n_neighbors=K) 73 | clf.fit(X_train,y_train) 74 | testing_scores.append(clf.score(X_test,y_test)) 75 | training_scores.append(clf.score(X_train,y_train)) 76 | ax.plot(Ks,testing_scores,label="testing score:p={0}".format(P)) 77 | ax.plot(Ks,training_scores,label="training score:p={0}".format(P)) 78 | ax.legend(loc='best') 79 | ax.set_xlabel("K") 80 | ax.set_ylabel("score") 81 | ax.set_ylim(0,1.05) 82 | ax.set_title("KNeighborsClassifier") 83 | plt.show() 84 | 85 | if __name__=='__main__': 86 | X_train,X_test,y_train,y_test=load_classification_data() 87 | test_KNeighborsClassifier(X_train,X_test,y_train,y_test) 88 | test_KNeighborsClassifier_k_w(X_train,X_test,y_train,y_test) 89 | test_KNeighborsClassifier_k_p(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /4. 
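In test_KNeighborsClassifier_k_p above, p is the exponent of the Minkowski metric used by KNeighborsClassifier: p=1 gives Manhattan distance and p=2 Euclidean distance. A quick numerical illustration with two arbitrary points:

import numpy as np

a, b = np.array([1., 2., 3.]), np.array([4., 6., 3.])
for p in (1, 2, 10):
    print(p, np.sum(np.abs(a - b) ** p) ** (1 / p))   # Minkowski distance of order p: 7.0, 5.0, ~4.02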
KNN/4.2 KNN regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import neighbors, cross_validation 4 | 5 | def create_regression_data(n): 6 | ''' 7 | generate data 8 | :param n: the space of data set 9 | :return: train_data, test_data, train_value, test_value 10 | ''' 11 | X =5 * np.random.rand(n, 1) 12 | y = np.sin(X).ravel() 13 | y[::5] += 1 * (0.5 - np.random.rand(int(n/5))) 14 | return cross_validation.train_test_split(X, y,test_size=0.25,random_state=0) 15 | 16 | def test_KNeighborsRegressor(*data): 17 | ''' 18 | test the KNN regressor 19 | :param data: train_data, test_data, train_value, test_value 20 | :return: None 21 | ''' 22 | X_train,X_test,y_train,y_test=data 23 | regr=neighbors.KNeighborsRegressor() 24 | regr.fit(X_train,y_train) 25 | print("Training Score:{0}".format(regr.score(X_train,y_train))) 26 | print("Testing Score:{0}".format(regr.score(X_test,y_test))) 27 | def test_KNeighborsRegressor_k_w(*data): 28 | ''' 29 | test the performance with different n_neighbors and weights 30 | :param data: train_data, test_data, train_value, test_value 31 | :return: None 32 | ''' 33 | X_train,X_test,y_train,y_test=data 34 | Ks=np.linspace(1,y_train.size,num=100,endpoint=False,dtype='int') 35 | weights=['uniform','distance'] 36 | 37 | fig=plt.figure() 38 | ax=fig.add_subplot(1,1,1) 39 | ### graph 40 | for weight in weights: 41 | training_scores=[] 42 | testing_scores=[] 43 | for K in Ks: 44 | regr=neighbors.KNeighborsRegressor(weights=weight,n_neighbors=K) 45 | regr.fit(X_train,y_train) 46 | testing_scores.append(regr.score(X_test,y_test)) 47 | training_scores.append(regr.score(X_train,y_train)) 48 | ax.plot(Ks,testing_scores,label="testing score:weight={0}".format(weight)) 49 | ax.plot(Ks,training_scores,label="training score:weight={0}".format(weight)) 50 | ax.legend(loc='best') 51 | ax.set_xlabel("K") 52 | ax.set_ylabel("score") 53 | ax.set_ylim(0,1.05) 54 | ax.set_title("KNeighborsRegressor") 55 | plt.show() 56 | def test_KNeighborsRegressor_k_p(*data): 57 | ''' 58 | test the performance with different n_neighbors and p 59 | :param data: train_data, test_data, train_value, test_value 60 | :return: None 61 | ''' 62 | X_train,X_test,y_train,y_test=data 63 | Ks=np.linspace(1,y_train.size,endpoint=False,dtype='int') 64 | Ps=[1,2,10] 65 | 66 | fig=plt.figure() 67 | ax=fig.add_subplot(1,1,1) 68 | ### graph 69 | for P in Ps: 70 | training_scores=[] 71 | testing_scores=[] 72 | for K in Ks: 73 | regr=neighbors.KNeighborsRegressor(p=P,n_neighbors=K) 74 | regr.fit(X_train,y_train) 75 | testing_scores.append(regr.score(X_test,y_test)) 76 | training_scores.append(regr.score(X_train,y_train)) 77 | ax.plot(Ks,testing_scores,label="testing score:p={0}".format(P)) 78 | ax.plot(Ks,training_scores,label="training score:p={0}".format(P)) 79 | ax.legend(loc='best') 80 | ax.set_xlabel("K") 81 | ax.set_ylabel("score") 82 | ax.set_ylim(0,1.05) 83 | ax.set_title("KNeighborsRegressor") 84 | plt.show() 85 | 86 | if __name__=='__main__': 87 | X_train,X_test,y_train,y_test=create_regression_data(1000) 88 | test_KNeighborsRegressor(X_train,X_test,y_train,y_test) 89 | test_KNeighborsRegressor_k_w(X_train,X_test,y_train,y_test) 90 | test_KNeighborsRegressor_k_p(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /4. 
KNN/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 综述 3 | 4 | 通过计算数据特征值之间的距离,根据K值,进行分类。 5 | 6 | # 三要素 7 | 8 | K值,距离度量,分类决策规则。 K值通过价差验证平均误差率进行选择,距离度量一般为欧几里得距离,分类决策规则通常采用多数表决法。 9 | 10 | # 实战代码:GitHub 11 | 12 | 1.分类: 13 | 14 | https://github.com/JasonK93/ML-note/blob/master/4.%20KNN/4.1%20KNN%20classification.py 15 | 16 | 2.回归: 17 | 18 | https://github.com/JasonK93/ML-note/blob/master/4.%20KNN/4.2%20KNN%20regressor.py 19 | -------------------------------------------------------------------------------- /5.Dimension_Reduction/5.1 PCA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import datasets,decomposition 4 | 5 | def load_data(): 6 | ''' 7 | load the data 8 | :return: train_data, train_value 9 | ''' 10 | iris=datasets.load_iris() 11 | return iris.data,iris.target 12 | 13 | def test_PCA(*data): 14 | ''' 15 | test the PCA method 16 | :param data: train_data, train_value 17 | :return: None 18 | ''' 19 | X,y=data 20 | pca=decomposition.PCA(n_components=None) 21 | pca.fit(X) 22 | print('explained variance ratio : %s'% str(pca.explained_variance_ratio_)) 23 | def plot_PCA(*data): 24 | ''' 25 | graph the data after PCA 26 | :param data: train_data, train_value 27 | :return: None 28 | ''' 29 | X,y=data 30 | pca=decomposition.PCA(n_components=2) 31 | pca.fit(X) 32 | X_r=pca.transform(X) 33 | ###### graph 2-D data ######## 34 | fig=plt.figure() 35 | ax=fig.add_subplot(1,1,1) 36 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 37 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 38 | for label ,color in zip( np.unique(y),colors): 39 | position=y==label 40 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}".format(label),color=color) 41 | 42 | ax.set_xlabel("X[0]") 43 | ax.set_ylabel("Y[0]") 44 | ax.legend(loc="best") 45 | ax.set_title("PCA") 46 | plt.show() 47 | if __name__=='__main__': 48 | X,y=load_data() 49 | test_PCA(X,y) 50 | plot_PCA(X,y) -------------------------------------------------------------------------------- /5.Dimension_Reduction/5.2 KPCA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import datasets,decomposition 4 | 5 | def load_data(): 6 | ''' 7 | load the iris data 8 | :return: train_data, train_value 9 | ''' 10 | iris=datasets.load_iris()# 使用 scikit-learn 自带的 iris 数据集 11 | return iris.data,iris.target 12 | 13 | def test_KPCA(*data): 14 | ''' 15 | test the KPCA method 16 | :param data: train_data, train_value 17 | :return: None 18 | ''' 19 | X,y=data 20 | kernels=['linear','poly','rbf','sigmoid'] 21 | for kernel in kernels: 22 | kpca=decomposition.KernelPCA(n_components=None,kernel=kernel) # Use 4 different kernel 23 | kpca.fit(X) 24 | print('kernel={0} --> lambdas: {1}'.format (kernel,kpca.lambdas_)) 25 | def plot_KPCA(*data): 26 | ''' 27 | graph after KPCA 28 | :param data: train_data, train_value 29 | :return: None 30 | ''' 31 | X,y=data 32 | kernels=['linear','poly','rbf','sigmoid'] 33 | fig=plt.figure() 34 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 35 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 36 | 37 | for i,kernel in enumerate(kernels): 38 | kpca=decomposition.KernelPCA(n_components=2,kernel=kernel) 39 | kpca.fit(X) 40 | X_r=kpca.transform(X) 41 | ax=fig.add_subplot(2,2,i+1) 42 | for label ,color in zip( 
np.unique(y),colors): 43 | position=y==label 44 | ax.scatter(X_r[position,0],X_r[position,1],label="target= %d"%label, 45 | color=color) 46 | ax.set_xlabel("X[0]") 47 | ax.set_ylabel("X[1]") 48 | ax.legend(loc="best") 49 | ax.set_title("kernel={0}".format(kernel)) 50 | plt.suptitle("KPCA") 51 | plt.show() 52 | def plot_KPCA_poly(*data): 53 | ''' 54 | graph after KPCA with poly kernel 55 | :param data: train_data, train_value 56 | :return: None 57 | ''' 58 | X,y=data 59 | fig=plt.figure() 60 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 61 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 62 | Params=[(3,1,1),(3,10,1),(3,1,10),(3,10,10),(10,1,1),(10,10,1),(10,1,10),(10,10,10)] # parameter of poly 63 | # p , gamma , r ) 64 | # p :3,10 65 | # gamma :1,10 66 | # r :1,10 67 | # 8 combination 68 | for i,(p,gamma,r) in enumerate(Params): 69 | kpca=decomposition.KernelPCA(n_components=2,kernel='poly' 70 | ,gamma=gamma,degree=p,coef0=r) 71 | kpca.fit(X) 72 | X_r=kpca.transform(X) 73 | ax=fig.add_subplot(2,4,i+1) 74 | for label ,color in zip( np.unique(y),colors): 75 | position=y==label 76 | ax.scatter(X_r[position,0],X_r[position,1],label="target= %d"%label, 77 | color=color) 78 | ax.set_xlabel("X[0]") 79 | ax.set_xticks([]) 80 | ax.set_yticks([]) 81 | ax.set_ylabel("X[1]") 82 | ax.legend(loc="best") 83 | ax.set_title(r"$ ({0} (x \cdot z+1)+{1})^{{2}}$".format(gamma,r,p)) 84 | plt.suptitle("KPCA-Poly") 85 | plt.show() 86 | def plot_KPCA_rbf(*data): 87 | ''' 88 | graph with kernel of rbf 89 | :param data: train_data, train_value 90 | :return: None 91 | ''' 92 | X,y=data 93 | fig=plt.figure() 94 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 95 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 96 | Gammas=[0.5,1,4,10] 97 | for i,gamma in enumerate(Gammas): 98 | kpca=decomposition.KernelPCA(n_components=2,kernel='rbf',gamma=gamma) 99 | kpca.fit(X) 100 | X_r=kpca.transform(X) 101 | ax=fig.add_subplot(2,2,i+1) 102 | for label ,color in zip( np.unique(y),colors): 103 | position=y==label 104 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}".format(label), 105 | color=color) 106 | ax.set_xlabel("X[0]") 107 | ax.set_xticks([]) 108 | ax.set_yticks([]) 109 | ax.set_ylabel("X[1]") 110 | ax.legend(loc="best") 111 | ax.set_title(r"$\exp(-{0}||x-z||^2)$".format(gamma)) 112 | plt.suptitle("KPCA-rbf") 113 | plt.show() 114 | def plot_KPCA_sigmoid(*data): 115 | ''' 116 | graph with sigmoid kernel 117 | :param data: train_data, train_value 118 | :return: None 119 | ''' 120 | X,y=data 121 | fig=plt.figure() 122 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 123 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 124 | Params=[(0.01,0.1),(0.01,0.2),(0.1,0.1),(0.1,0.2),(0.2,0.1),(0.2,0.2)]# parameter of sigmoid kernel 125 | # gamma,coef0 126 | # gamma : 0.01,0.1,0.2 127 | # coef0 : 0.1,0.2 128 | # 6 combination 129 | for i,(gamma,r) in enumerate(Params): 130 | kpca=decomposition.KernelPCA(n_components=2,kernel='sigmoid',gamma=gamma,coef0=r) 131 | kpca.fit(X) 132 | X_r=kpca.transform(X) 133 | ax=fig.add_subplot(3,2,i+1) 134 | for label ,color in zip( np.unique(y),colors): 135 | position=y==label 136 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}".format(label), 137 | color=color) 138 | ax.set_xlabel("X[0]") 139 | ax.set_xticks([]) 140 | ax.set_yticks([]) 141 | ax.set_ylabel("X[1]") 142 | ax.legend(loc="best") 143 | ax.set_title(r"$\tanh({0}(x\cdot z)+{1})$".format(gamma,r)) 144 | plt.suptitle("KPCA-sigmoid") 
145 | plt.show() 146 | if __name__=='__main__': 147 | X,y=load_data() 148 | test_KPCA(X,y) 149 | plot_KPCA(X,y) 150 | plot_KPCA_poly(X,y) 151 | plot_KPCA_rbf(X,y) 152 | plot_KPCA_sigmoid(X,y) -------------------------------------------------------------------------------- /5.Dimension_Reduction/5.3 MDS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import datasets,manifold 4 | 5 | def load_data(): 6 | ''' 7 | load the iris data 8 | :return: train_data, train_value 9 | ''' 10 | iris=datasets.load_iris() 11 | return iris.data,iris.target 12 | 13 | def test_MDS(*data): 14 | ''' 15 | test MDS method 16 | :param data: train_data, train_value 17 | :return: None 18 | ''' 19 | X,y=data 20 | for n in [4,3,2,1]: 21 | mds=manifold.MDS(n_components=n) 22 | mds.fit(X) 23 | print('stress(n_components={0}) : {1}'.format (n, str(mds.stress_))) 24 | def plot_MDS(*data): 25 | ''' 26 | graph after MDS 27 | :param data: train_data, train_value 28 | :return: None 29 | ''' 30 | X,y=data 31 | mds=manifold.MDS(n_components=2) 32 | X_r=mds.fit_transform(X) 33 | 34 | ### graph 35 | fig=plt.figure() 36 | ax=fig.add_subplot(1,1,1) 37 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 38 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 39 | for label ,color in zip( np.unique(y),colors): 40 | position=y==label 41 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}".format(label),color=color) 42 | 43 | ax.set_xlabel("X[0]") 44 | ax.set_ylabel("X[1]") 45 | ax.legend(loc="best") 46 | ax.set_title("MDS") 47 | plt.show() 48 | if __name__=='__main__': 49 | X,y=load_data() 50 | test_MDS(X,y) 51 | plot_MDS(X,y) -------------------------------------------------------------------------------- /5.Dimension_Reduction/5.4 Isomap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import datasets,manifold 4 | 5 | def load_data(): 6 | ''' 7 | load the iris data 8 | :return: train_data, train_value 9 | ''' 10 | iris=datasets.load_iris() 11 | return iris.data,iris.target 12 | 13 | def test_Isomap(*data): 14 | ''' 15 | test Isomap method 16 | :param data: train_data, train_value 17 | :return: None 18 | ''' 19 | X,y=data 20 | for n in [4,3,2,1]: 21 | isomap=manifold.Isomap(n_components=n) 22 | isomap.fit(X) 23 | print('reconstruction_error(n_components=%d) : %s'% 24 | (n, isomap.reconstruction_error())) 25 | def plot_Isomap_k(*data): 26 | ''' 27 | test the performance with different n_neighbors and reduce to 2-D 28 | :param data: train_data, train_value 29 | :return: None 30 | ''' 31 | X,y=data 32 | Ks=[1,5,25,y.size-1] 33 | 34 | fig=plt.figure() 35 | for i, k in enumerate(Ks): 36 | isomap=manifold.Isomap(n_components=2,n_neighbors=k) 37 | X_r=isomap.fit_transform(X) 38 | 39 | ax=fig.add_subplot(2,2,i+1) 40 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 41 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 42 | for label ,color in zip( np.unique(y),colors): 43 | position=y==label 44 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}" 45 | .format(label),color=color) 46 | 47 | ax.set_xlabel("X[0]") 48 | ax.set_ylabel("X[1]") 49 | ax.legend(loc="best") 50 | ax.set_title("k={0}".format(k)) 51 | plt.suptitle("Isomap") 52 | plt.show() 53 | def plot_Isomap_k_d1(*data): 54 | ''' 55 | test the performance with different n_neighbors and reduce to 1-D 
56 | :param data: train_data, train_value 57 | :return: None 58 | ''' 59 | X,y=data 60 | Ks=[1,5,25,y.size-1] 61 | 62 | fig=plt.figure() 63 | for i, k in enumerate(Ks): 64 | isomap=manifold.Isomap(n_components=1,n_neighbors=k) 65 | X_r=isomap.fit_transform(X) 66 | 67 | ax=fig.add_subplot(2,2,i+1) 68 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 69 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 70 | for label ,color in zip( np.unique(y),colors): 71 | position=y==label 72 | ax.scatter(X_r[position],np.zeros_like(X_r[position]), 73 | label="target= {0}".format(label),color=color) 74 | 75 | ax.set_xlabel("X") 76 | ax.set_ylabel("Y") 77 | ax.legend(loc="best") 78 | ax.set_title("k={0}".format(k)) 79 | plt.suptitle("Isomap") 80 | plt.show() 81 | if __name__=='__main__': 82 | X,y=load_data() 83 | test_Isomap(X,y) 84 | plot_Isomap_k(X,y) 85 | plot_Isomap_k_d1(X,y) -------------------------------------------------------------------------------- /5.Dimension_Reduction/5.5 LLE.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import datasets,manifold 4 | 5 | def load_data(): 6 | ''' 7 | load the iris data 8 | :return: train_data, train_value 9 | ''' 10 | iris=datasets.load_iris() 11 | return iris.data,iris.target 12 | def test_LocallyLinearEmbedding(*data): 13 | ''' 14 | test the LLE method 15 | :param data: train_data, train_value 16 | :return: None 17 | ''' 18 | X,y=data 19 | for n in [4,3,2,1]: 20 | lle=manifold.LocallyLinearEmbedding(n_components=n) 21 | lle.fit(X) 22 | print('reconstruction_error(n_components=%d) : %s'% 23 | (n, lle.reconstruction_error_)) 24 | def plot_LocallyLinearEmbedding_k(*data): 25 | ''' 26 | test the performance with different n_neighbors and reduce to 2-D 27 | :param data: train_data, train_value 28 | :return: None 29 | ''' 30 | X,y=data 31 | Ks=[1,5,25,y.size-1] 32 | 33 | fig=plt.figure() 34 | for i, k in enumerate(Ks): 35 | lle=manifold.LocallyLinearEmbedding(n_components=2,n_neighbors=k) 36 | X_r=lle.fit_transform(X) 37 | 38 | ax=fig.add_subplot(2,2,i+1) 39 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 40 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 41 | for label ,color in zip( np.unique(y),colors): 42 | position=y==label 43 | ax.scatter(X_r[position,0],X_r[position,1],label="target= {0}" 44 | .format(label),color=color) 45 | 46 | ax.set_xlabel("X[0]") 47 | ax.set_ylabel("X[1]") 48 | ax.legend(loc="best") 49 | ax.set_title("k={0}".format(k)) 50 | plt.suptitle("LocallyLinearEmbedding") 51 | plt.show() 52 | def plot_LocallyLinearEmbedding_k_d1(*data): 53 | ''' 54 | test the performance with different n_neighbors and reduce to 1-D 55 | :param data: train_data, train_value 56 | :return: None 57 | ''' 58 | X,y=data 59 | Ks=[1,5,25,y.size-1] 60 | 61 | fig=plt.figure() 62 | for i, k in enumerate(Ks): 63 | lle=manifold.LocallyLinearEmbedding(n_components=1,n_neighbors=k) 64 | X_r=lle.fit_transform(X) 65 | 66 | ax=fig.add_subplot(2,2,i+1) 67 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 68 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 69 | for label ,color in zip( np.unique(y),colors): 70 | position=y==label 71 | ax.scatter(X_r[position],np.zeros_like(X_r[position]), 72 | label="target= {0}".format(label),color=color) 73 | 74 | ax.set_xlabel("X") 75 | ax.set_ylabel("Y") 76 | ax.legend(loc="best") 77 | ax.set_title("k={0}".format(k)) 78 | 
plt.suptitle("LocallyLinearEmbedding") 79 | plt.show() 80 | if __name__=='__main__': 81 | X,y=load_data() 82 | test_LocallyLinearEmbedding(X,y) 83 | plot_LocallyLinearEmbedding_k(X,y) 84 | plot_LocallyLinearEmbedding_k_d1(X,y) -------------------------------------------------------------------------------- /5.Dimension_Reduction/README.md: -------------------------------------------------------------------------------- 1 | 2 | # 综述 3 | 4 | 针对数据特征的处理,从而避免维度灾难,并且减少噪音数据特征的影响,提高精度。 5 | 6 | # PCA 7 | 8 | 主成分分析法,是一种维度上的压缩变换。但是由于是无监督的压缩,很多的时候是将开始的特征进行了线性组合,从而生成了新的不能合理解释的新的特征。 9 | 10 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.1%20PCA.py 11 | 12 | # SVD 13 | 14 | 奇异值分解降维。该方法等价于PCA主成分分析,核心都是求解XX(T)的特征值以及对应的特征向量。 15 | 16 | # KPCA 17 | 18 | 核主成分分析法。由于主成分分析法是线性的降维,并不能满足现实任务中的要求,所以需要非线性映射的降维。所以有了基于核技术的降维方法,核主成分分析。 19 | 20 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.2%20KPCA.py 21 | 22 | # 流形学习降维 23 | 24 | 流形学习是一种借鉴了拓扑流形概念的降维方法,是一种非线性的降维方法。其特点在于,构造的局部邻域不同,利用这些邻域结构构造全局的低维嵌入方法不同。 25 | 26 | # MDS 27 | 28 | 多维缩放降维,要求原始空间中的样本之间的距离在低维空间中得到保持。 29 | 30 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.3%20MDS.py 31 | 32 | # Isomap 33 | 34 | 等度量映射降维,利用流形在局部上与欧几里得空间同胚的性质,找到每个点在低维流形上的邻近点近邻连接图。计算最短路径问题。利用MDS方法获得低维空间。 35 | 36 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.4%20Isomap.py 37 | 38 | # LLE 39 | 40 | 局部线性嵌入降维的主要目标是,降维的同时保证邻域内样本的线性关系。 41 | 42 | 43 | 44 | https://github.com/JasonK93/ML-note/blob/master/5.Dimension_Reduction/5.5%20LLE.py 45 | -------------------------------------------------------------------------------- /6. Clustering/6.1 Kmeans.py: -------------------------------------------------------------------------------- 1 | from sklearn import cluster 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets.samples_generator import make_blobs 5 | from sklearn.metrics import adjusted_rand_score 6 | 7 | def create_data(centers,num=100,std=0.7): 8 | ''' 9 | generate data 10 | :param centers: dimension of centre 11 | :param num: number of sample 12 | :param std: std of each cluster 13 | :return: data, target 14 | ''' 15 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std) 16 | return X,labels_true 17 | def plot_data(*data): 18 | ''' 19 | graph the dataset 20 | :param data: data, target 21 | :return: None 22 | ''' 23 | X,labels_true=data 24 | labels=np.unique(labels_true) 25 | fig=plt.figure() 26 | ax=fig.add_subplot(1,1,1) 27 | colors='rgbyckm' 28 | for i,label in enumerate(labels): 29 | position=labels_true==label 30 | ax.scatter(X[position,0],X[position,1],label="cluster {0}".format(label), 31 | color=colors[i%len(colors)]) 32 | 33 | ax.legend(loc="best",framealpha=0.5) 34 | ax.set_xlabel("X[0]") 35 | ax.set_ylabel("Y[1]") 36 | ax.set_title("data") 37 | plt.show() 38 | 39 | 40 | 41 | def test_Kmeans(*data): 42 | ''' 43 | test the Kmeans 44 | :param data: data, target 45 | :return: None 46 | ''' 47 | X,labels_true=data 48 | clst=cluster.KMeans() 49 | clst.fit(X) 50 | predicted_labels=clst.predict(X) 51 | print("ARI:{0}".format( adjusted_rand_score(labels_true,predicted_labels))) 52 | print("Sum center distance {0}".format(clst.inertia_)) 53 | def test_Kmeans_nclusters(*data): 54 | ''' 55 | test the performance with different n_clusters 56 | :param data: data, target 57 | :return: None 58 | ''' 59 | X,labels_true=data 60 | nums=range(1,50) 61 | ARIs=[] 62 | Distances=[] 63 | for num in nums: 64 | clst=cluster.KMeans(n_clusters=num) 65 | 
clst.fit(X) 66 | predicted_labels=clst.predict(X) 67 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels)) 68 | Distances.append(clst.inertia_) 69 | 70 | ## graph 71 | fig=plt.figure() 72 | ax=fig.add_subplot(1,2,1) 73 | ax.plot(nums,ARIs,marker="+") 74 | ax.set_xlabel("n_clusters") 75 | ax.set_ylabel("ARI") 76 | ax=fig.add_subplot(1,2,2) 77 | ax.plot(nums,Distances,marker='o') 78 | ax.set_xlabel("n_clusters") 79 | ax.set_ylabel("inertia_") 80 | fig.suptitle("KMeans") 81 | plt.show() 82 | def test_Kmeans_n_init(*data): 83 | ''' 84 | test the performance with different n_init and init paramter 85 | :param data: data, target 86 | :return: None 87 | ''' 88 | X,labels_true=data 89 | nums=range(1,50) 90 | ## graph 91 | fig=plt.figure() 92 | 93 | ARIs_k=[] 94 | Distances_k=[] 95 | ARIs_r=[] 96 | Distances_r=[] 97 | for num in nums: 98 | clst=cluster.KMeans(n_init=num,init='k-means++') 99 | clst.fit(X) 100 | predicted_labels=clst.predict(X) 101 | ARIs_k.append(adjusted_rand_score(labels_true,predicted_labels)) 102 | Distances_k.append(clst.inertia_) 103 | 104 | clst=cluster.KMeans(n_init=num,init='random') 105 | clst.fit(X) 106 | predicted_labels=clst.predict(X) 107 | ARIs_r.append(adjusted_rand_score(labels_true,predicted_labels)) 108 | Distances_r.append(clst.inertia_) 109 | 110 | ax=fig.add_subplot(1,2,1) 111 | ax.plot(nums,ARIs_k,marker="+",label="k-means++") 112 | ax.plot(nums,ARIs_r,marker="+",label="random") 113 | ax.set_xlabel("n_init") 114 | ax.set_ylabel("ARI") 115 | ax.set_ylim(0,1) 116 | ax.legend(loc='best') 117 | ax=fig.add_subplot(1,2,2) 118 | ax.plot(nums,Distances_k,marker='o',label="k-means++") 119 | ax.plot(nums,Distances_r,marker='o',label="random") 120 | ax.set_xlabel("n_init") 121 | ax.set_ylabel("inertia_") 122 | ax.legend(loc='best') 123 | 124 | fig.suptitle("KMeans") 125 | plt.show() 126 | 127 | if __name__=='__main__': 128 | centers=[[1,1],[2,2],[1,2],[10,20]] 129 | X,labels_true=create_data(centers,1000,0.5) 130 | plot_data(X,labels_true) 131 | test_Kmeans(X,labels_true) 132 | test_Kmeans_nclusters(X,labels_true) 133 | test_Kmeans_n_init(X,labels_true) 134 | -------------------------------------------------------------------------------- /6. 
Clustering/6.2 DBSCAN.py: -------------------------------------------------------------------------------- 1 | from sklearn import cluster 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets.samples_generator import make_blobs 5 | from sklearn.metrics import adjusted_rand_score 6 | 7 | 8 | def create_data(centers,num=100,std=0.7): 9 | ''' 10 | generate data 11 | :param centers: dimension of centre 12 | :param num: number of sample 13 | :param std: std of each cluster 14 | :return: data, target 15 | ''' 16 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std) 17 | return X,labels_true 18 | def plot_data(*data): 19 | ''' 20 | graph the dataset 21 | :param data: data, target 22 | :return: None 23 | ''' 24 | X,labels_true=data 25 | labels=np.unique(labels_true) 26 | fig=plt.figure() 27 | ax=fig.add_subplot(1,1,1) 28 | colors='rgbyckm' 29 | for i,label in enumerate(labels): 30 | position=labels_true==label 31 | ax.scatter(X[position,0],X[position,1],label="cluster {0}".format(label), 32 | color=colors[i%len(colors)]) 33 | 34 | ax.legend(loc="best",framealpha=0.5) 35 | ax.set_xlabel("X[0]") 36 | ax.set_ylabel("Y[1]") 37 | ax.set_title("data") 38 | plt.show() 39 | 40 | def test_DBSCAN(*data): 41 | ''' 42 | test the DBSCAN method 43 | :param data: train, target 44 | :return: None 45 | ''' 46 | X,labels_true=data 47 | clst=cluster.DBSCAN() 48 | predicted_labels=clst.fit_predict(X) 49 | print("ARI:%s"% adjusted_rand_score(labels_true,predicted_labels)) 50 | print("Core sample num:{0}".format(len(clst.core_sample_indices_))) 51 | def test_DBSCAN_epsilon(*data): 52 | ''' 53 | test the score with different eps 54 | :param data: train, target 55 | :return: None 56 | ''' 57 | X,labels_true=data 58 | epsilons=np.logspace(-1,1.5) 59 | ARIs=[] 60 | Core_nums=[] 61 | for epsilon in epsilons: 62 | clst=cluster.DBSCAN(eps=epsilon) 63 | predicted_labels=clst.fit_predict(X) 64 | ARIs.append( adjusted_rand_score(labels_true,predicted_labels)) 65 | Core_nums.append(len(clst.core_sample_indices_)) 66 | 67 | ## graph 68 | fig=plt.figure() 69 | ax=fig.add_subplot(1,2,1) 70 | ax.plot(epsilons,ARIs,marker='+') 71 | ax.set_xscale('log') 72 | ax.set_xlabel(r"$\epsilon$") 73 | ax.set_ylim(0,1) 74 | ax.set_ylabel('ARI') 75 | 76 | ax=fig.add_subplot(1,2,2) 77 | ax.plot(epsilons,Core_nums,marker='o') 78 | ax.set_xscale('log') 79 | ax.set_xlabel(r"$\epsilon$") 80 | ax.set_ylabel('Core_Nums') 81 | 82 | fig.suptitle("DBSCAN") 83 | plt.show() 84 | def test_DBSCAN_min_samples(*data): 85 | ''' 86 | test the score with different min_sample 87 | :param data: train, target 88 | :return: None 89 | ''' 90 | X,labels_true=data 91 | min_samples=range(1,100) 92 | ARIs=[] 93 | Core_nums=[] 94 | for num in min_samples: 95 | clst=cluster.DBSCAN(min_samples=num) 96 | predicted_labels=clst.fit_predict(X) 97 | ARIs.append( adjusted_rand_score(labels_true,predicted_labels)) 98 | Core_nums.append(len(clst.core_sample_indices_)) 99 | 100 | ## graph 101 | fig=plt.figure() 102 | ax=fig.add_subplot(1,2,1) 103 | ax.plot(min_samples,ARIs,marker='+') 104 | ax.set_xlabel( "min_samples") 105 | ax.set_ylim(0,1) 106 | ax.set_ylabel('ARI') 107 | 108 | ax=fig.add_subplot(1,2,2) 109 | ax.plot(min_samples,Core_nums,marker='o') 110 | ax.set_xlabel( "min_samples") 111 | ax.set_ylabel('Core_Nums') 112 | 113 | fig.suptitle("DBSCAN") 114 | plt.show() 115 | 116 | if __name__=='__main__': 117 | centers=[[1,1],[2,2],[1,2],[10,20]] 118 | X,labels_true=create_data(centers,1000,0.5) 119 | plot_data(X,labels_true) 
120 | test_DBSCAN(X,labels_true) 121 | test_DBSCAN_epsilon(X,labels_true) 122 | test_DBSCAN_min_samples(X,labels_true) 123 | -------------------------------------------------------------------------------- /6. Clustering/6.3 Agglomerative Clustering.py: -------------------------------------------------------------------------------- 1 | from sklearn import cluster 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets.samples_generator import make_blobs 5 | from sklearn.metrics import adjusted_rand_score 6 | 7 | 8 | def create_data(centers,num=100,std=0.7): 9 | ''' 10 | generate data 11 | :param centers: dimension of centre 12 | :param num: number of sample 13 | :param std: std of each cluster 14 | :return: data, target 15 | ''' 16 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std) 17 | return X,labels_true 18 | def plot_data(*data): 19 | ''' 20 | graph the dataset 21 | :param data: data, target 22 | :return: None 23 | ''' 24 | X,labels_true=data 25 | labels=np.unique(labels_true) 26 | fig=plt.figure() 27 | ax=fig.add_subplot(1,1,1) 28 | colors='rgbyckm' 29 | for i,label in enumerate(labels): 30 | position=labels_true==label 31 | ax.scatter(X[position,0],X[position,1],label="cluster {0}".format(label), 32 | color=colors[i%len(colors)]) 33 | 34 | ax.legend(loc="best",framealpha=0.5) 35 | ax.set_xlabel("X[0]") 36 | ax.set_ylabel("Y[1]") 37 | ax.set_title("data") 38 | plt.show() 39 | 40 | def test_AgglomerativeClustering(*data): 41 | ''' 42 | test AGG method 43 | :param data: data, target 44 | :return: None 45 | ''' 46 | X,labels_true=data 47 | clst=cluster.AgglomerativeClustering() 48 | predicted_labels=clst.fit_predict(X) 49 | print("ARI:{0}".format(adjusted_rand_score(labels_true,predicted_labels))) 50 | def test_AgglomerativeClustering_nclusters(*data): 51 | ''' 52 | test the performance with different n_clusters 53 | :param data: data, target 54 | :return: None 55 | ''' 56 | X,labels_true=data 57 | nums=range(1,50) 58 | ARIs=[] 59 | for num in nums: 60 | clst=cluster.AgglomerativeClustering(n_clusters=num) 61 | predicted_labels=clst.fit_predict(X) 62 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels)) 63 | 64 | ## graph 65 | fig=plt.figure() 66 | ax=fig.add_subplot(1,1,1) 67 | ax.plot(nums,ARIs,marker="+") 68 | ax.set_xlabel("n_clusters") 69 | ax.set_ylabel("ARI") 70 | fig.suptitle("AgglomerativeClustering") 71 | plt.show() 72 | def test_AgglomerativeClustering_linkage(*data): 73 | ''' 74 | test the performance with different linkages 75 | :param data: data, target 76 | :return: None 77 | ''' 78 | X,labels_true=data 79 | nums=range(1,50) 80 | fig=plt.figure() 81 | ax=fig.add_subplot(1,1,1) 82 | 83 | linkages=['ward','complete','average'] 84 | markers="+o*" 85 | for i, linkage in enumerate(linkages): 86 | ARIs=[] 87 | for num in nums: 88 | clst=cluster.AgglomerativeClustering(n_clusters=num,linkage=linkage) 89 | predicted_labels=clst.fit_predict(X) 90 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels)) 91 | ax.plot(nums,ARIs,marker=markers[i],label="linkage:{0}".format(linkage)) 92 | 93 | ax.set_xlabel("n_clusters") 94 | ax.set_ylabel("ARI") 95 | ax.legend(loc="best") 96 | fig.suptitle("AgglomerativeClustering") 97 | plt.show() 98 | 99 | if __name__=='__main__': 100 | centers=[[1,1],[2,2],[1,2],[10,20]] 101 | X,labels_true=create_data(centers,1000,0.5) 102 | plot_data(X,labels_true) 103 | test_AgglomerativeClustering(X,labels_true) 104 | test_AgglomerativeClustering_nclusters(X,labels_true) 105 | 
test_AgglomerativeClustering_linkage(X,labels_true) 106 | 107 | 108 | -------------------------------------------------------------------------------- /6. Clustering/6.4 GaussianMixture.py: -------------------------------------------------------------------------------- 1 | from sklearn import cluster 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.datasets.samples_generator import make_blobs 5 | from sklearn.metrics import adjusted_rand_score 6 | from sklearn import mixture 7 | 8 | 9 | def create_data(centers,num=100,std=0.7): 10 | ''' 11 | generate data 12 | :param centers: dimension of centre 13 | :param num: number of sample 14 | :param std: std of each cluster 15 | :return: data, target 16 | ''' 17 | X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std) 18 | return X,labels_true 19 | def plot_data(*data): 20 | ''' 21 | graph the dataset 22 | :param data: data, target 23 | :return: None 24 | ''' 25 | X,labels_true=data 26 | labels=np.unique(labels_true) 27 | fig=plt.figure() 28 | ax=fig.add_subplot(1,1,1) 29 | colors='rgbyckm' 30 | for i,label in enumerate(labels): 31 | position=labels_true==label 32 | ax.scatter(X[position,0],X[position,1],label="cluster {0}".format(label), 33 | color=colors[i%len(colors)]) 34 | 35 | ax.legend(loc="best",framealpha=0.5) 36 | ax.set_xlabel("X[0]") 37 | ax.set_ylabel("Y[1]") 38 | ax.set_title("data") 39 | plt.show() 40 | 41 | def test_GMM(*data): 42 | ''' 43 | test the method of GMM 44 | :param data: data , target 45 | :return: None 46 | ''' 47 | X,labels_true=data 48 | clst=mixture.GaussianMixture() 49 | clst.fit(X) 50 | predicted_labels=clst.predict(X) 51 | print("ARI:{0}".format(adjusted_rand_score(labels_true,predicted_labels))) 52 | def test_GMM_n_components(*data): 53 | ''' 54 | test the performance with different N_components 55 | :param data: data, target 56 | :return: None 57 | ''' 58 | X,labels_true=data 59 | nums=range(1,50) 60 | ARIs=[] 61 | for num in nums: 62 | clst=mixture.GaussianMixture(n_components=num) 63 | clst.fit(X) 64 | predicted_labels=clst.predict(X) 65 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels)) 66 | 67 | ## graph 68 | fig=plt.figure() 69 | ax=fig.add_subplot(1,1,1) 70 | ax.plot(nums,ARIs,marker="+") 71 | ax.set_xlabel("n_components") 72 | ax.set_ylabel("ARI") 73 | fig.suptitle("GMM") 74 | plt.show() 75 | def test_GMM_cov_type(*data): 76 | ''' 77 | test the performance with different cov_type 78 | :param data: data, target 79 | :return: None 80 | ''' 81 | X,labels_true=data 82 | nums=range(1,50) 83 | 84 | cov_types=['spherical','tied','diag','full'] 85 | markers="+o*s" 86 | fig=plt.figure() 87 | ax=fig.add_subplot(1,1,1) 88 | 89 | for i ,cov_type in enumerate(cov_types): 90 | ARIs=[] 91 | for num in nums: 92 | clst=mixture.GaussianMixture(n_components=num,covariance_type=cov_type) 93 | clst.fit(X) 94 | predicted_labels=clst.predict(X) 95 | ARIs.append(adjusted_rand_score(labels_true,predicted_labels)) 96 | ax.plot(nums,ARIs,marker=markers[i],label="covariance_type:{0}".format(cov_type)) 97 | 98 | ax.set_xlabel("n_components") 99 | ax.legend(loc="best") 100 | ax.set_ylabel("ARI") 101 | fig.suptitle("GMM") 102 | plt.show() 103 | 104 | if __name__=='__main__': 105 | centers=[[1,1],[2,2],[1,2],[10,20]] 106 | X,labels_true=create_data(centers,1000,0.5) 107 | plot_data(X,labels_true) 108 | test_GMM(X,labels_true) 109 | test_GMM_n_components(X,labels_true) 110 | test_GMM_cov_type(X,labels_true) 111 | 
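# Supplementary sketch: the tests above pick n_components by scoring every fit with
# ARI, which is only possible because make_blobs also returns the true labels. When no
# ground truth is available, the Bayesian Information Criterion (GaussianMixture.bic)
# is one unsupervised way to choose n_components. The helper below is illustrative and
# assumes the same blob-shaped data as create_data() above; it is a sketch, not part of
# the original 6.4 script.
import numpy as np
from sklearn import mixture
from sklearn.datasets.samples_generator import make_blobs

def choose_n_components_by_bic(X, max_components=10):
    '''
    pick the n_components with the lowest BIC
    :param X: data
    :param max_components: largest number of components to try
    :return: best n_components
    '''
    bics = []
    for n in range(1, max_components + 1):
        gmm = mixture.GaussianMixture(n_components=n, covariance_type='full')
        gmm.fit(X)
        bics.append(gmm.bic(X))
    return 1 + int(np.argmin(bics))

if __name__ == '__main__':
    X, _ = make_blobs(n_samples=1000, centers=[[1, 1], [2, 2], [1, 2], [10, 20]], cluster_std=0.5)
    print("n_components chosen by BIC: {0}".format(choose_n_components_by_bic(X)))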
--------------------------------------------------------------------------------
/6. Clustering/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Overview
3 | 
4 | Clustering algorithms are unsupervised learning methods whose purpose is to group data sets that have no target. They are an exploratory analysis tool used to study the intrinsic characteristics of the data and to find regularities in its distribution.
5 | 
6 | # Cluster validity indices
7 | 
8 | There are two main kinds of cluster validity indices: external indices and internal indices.
9 | 
10 | External indices are obtained by comparing the clustering result with a reference model: 1. Jaccard coefficient; 2. FM index; 3. Rand index; 4. ARI.
11 | 
12 | Internal indices are obtained directly from the clustering result itself: 1. DB index; 2. Dunn index.
13 | 
14 | # Distance measures
15 | 
16 | Euclidean distance, Manhattan distance, Hamming distance, VDM distance, and so on.
17 | 
18 | # Prototype-based clustering
19 | 
20 | Common prototype-based methods include K-means clustering and Gaussian mixture clustering. The objective function of K-means is the minimum mean squared error; Gaussian mixture clustering assumes each cluster follows a Gaussian distribution.
21 | 
22 | # Density-based clustering
23 | 
24 | Density-based clustering assumes that the cluster structure can be determined by how densely the samples are distributed. A commonly used algorithm is DBSCAN.
25 | 
26 | # Hierarchical clustering
27 | 
28 | Hierarchical clustering partitions the data set at different levels, forming a tree-like cluster structure.
29 | 
30 | # EM algorithm
31 | 
32 | The expectation-maximisation algorithm is an iterative method used mainly for parameter estimation in probabilistic models with latent variables. It alternates two steps: the E-step computes an expectation and the M-step maximises it. It is applied in methods such as Gaussian mixture clustering.
33 | 
34 | # Requirements for clustering in real-world tasks:
35 | 
36 | 1. Scalability: changes in data volume should not affect the accuracy of the clustering. 2. The ability to handle different data types. 3. The ability to handle clusters of arbitrary shapes. 4. Robustness to the choice of initialisation parameters. 5. Noise tolerance. 6. Support for incremental clustering. 7. Insensitivity to the order of the input. 8. The ability to handle high-dimensional data. 9. Readability, visualisability, interpretability, and applicability of the results.
37 | 
38 | # Code examples: GitHub
39 | 
40 | 1.Kmeans:
41 | 
42 | https://github.com/JasonK93/ML-note/blob/master/6.%20Clustering/6.1%20Kmeans.py
43 | 
44 | 2.DBSCAN:
45 | 
46 | https://github.com/JasonK93/ML-note/blob/master/6.%20Clustering/6.2%20DBSCAN.py
47 | 
48 | 3. Agglomerative Clustering:
49 | 
50 | https://github.com/JasonK93/ML-note/blob/master/6.%20Clustering/6.3%20Agglomerative%20Clustering.py
51 | 
52 | 4.GaussianMixture:
53 | 
54 | https://github.com/JasonK93/ML-note/blob/master/6.%20Clustering/6.4%20GaussianMixture.py
55 | 
--------------------------------------------------------------------------------
/7. Support Vector Machine/7.1 SVM-liner_SVC.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn import datasets, linear_model,cross_validation,svm
4 | 
5 | def load_data_classfication():
6 |     '''
7 |     load iris data set
8 |     :return: train_data,test_data, train_target, test_target
9 |     '''
10 |     iris=datasets.load_iris()
11 |     X_train=iris.data
12 |     y_train=iris.target
13 |     return cross_validation.train_test_split(X_train, y_train,test_size=0.25,
14 |         random_state=0,stratify=y_train)
15 | 
16 | def test_SVC_linear(*data):
17 |     '''
18 |     test method of SVC
19 |     :param data: train_data,test_data, train_target, test_target
20 |     :return: None
21 |     '''
22 |     X_train,X_test,y_train,y_test=data
23 |     cls=svm.SVC(kernel='linear')
24 |     cls.fit(X_train,y_train)
25 |     print('Coefficients:{0}, intercept {1}'.format(cls.coef_,cls.intercept_))
26 |     print('Score: {0}' .format(cls.score(X_test, y_test)))
27 | def test_SVC_poly(*data):
28 |     '''
29 |     test the performance with different degree, gamma, coef0
30 |     :param data: train_data,test_data, train_target, test_target
31 |     :return: None
32 |     '''
33 |     X_train,X_test,y_train,y_test=data
34 |     fig=plt.figure()
35 |     ### test degree ####
36 |     degrees=range(1,20)
37 |     train_scores=[]
38 |     test_scores=[]
39 |     for degree in degrees:
40 |         cls=svm.SVC(kernel='poly',degree=degree)
41 |         cls.fit(X_train,y_train)
42 |         train_scores.append(cls.score(X_train,y_train))
43 |         test_scores.append(cls.score(X_test, y_test))
44 |     ax=fig.add_subplot(1,3,1)
45 |     ax.plot(degrees,train_scores,label="Training score ",marker='+' )
46 |     ax.plot(degrees,test_scores,label= " Testing score ",marker='o' )
47 |     ax.set_title( "SVC_poly_degree ")
48 |     ax.set_xlabel("p")
49 |     ax.set_ylabel("score")
50 |     ax.set_ylim(0,1.05)
51 |     ax.legend(loc="best",framealpha=0.5)
52 | 
53 |     ### 
test gamma , degree fixed with 3#### 54 | gammas=range(1,20) 55 | train_scores=[] 56 | test_scores=[] 57 | for gamma in gammas: 58 | cls=svm.SVC(kernel='poly',gamma=gamma,degree=3) 59 | cls.fit(X_train,y_train) 60 | train_scores.append(cls.score(X_train,y_train)) 61 | test_scores.append(cls.score(X_test, y_test)) 62 | ax=fig.add_subplot(1,3,2) 63 | ax.plot(gammas,train_scores,label="Training score ",marker='+' ) 64 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' ) 65 | ax.set_title( "SVC_poly_gamma ") 66 | ax.set_xlabel(r"$\gamma$") 67 | ax.set_ylabel("score") 68 | ax.set_ylim(0,1.05) 69 | ax.legend(loc="best",framealpha=0.5) 70 | ### test r , gamma fixed with 10 , degree fixed with 3###### 71 | rs=range(0,20) 72 | train_scores=[] 73 | test_scores=[] 74 | for r in rs: 75 | cls=svm.SVC(kernel='poly',gamma=10,degree=3,coef0=r) 76 | cls.fit(X_train,y_train) 77 | train_scores.append(cls.score(X_train,y_train)) 78 | test_scores.append(cls.score(X_test, y_test)) 79 | ax=fig.add_subplot(1,3,3) 80 | ax.plot(rs,train_scores,label="Training score ",marker='+' ) 81 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' ) 82 | ax.set_title( "SVC_poly_r ") 83 | ax.set_xlabel(r"r") 84 | ax.set_ylabel("score") 85 | ax.set_ylim(0,1.05) 86 | ax.legend(loc="best",framealpha=0.5) 87 | plt.show() 88 | def test_SVC_rbf(*data): 89 | ''' 90 | test SVC with Gaussian kernel and different gamma 91 | :param data: train_data,test_data, train_target, test_target 92 | :return: None 93 | ''' 94 | X_train,X_test,y_train,y_test=data 95 | gammas=range(1,20) 96 | train_scores=[] 97 | test_scores=[] 98 | for gamma in gammas: 99 | cls=svm.SVC(kernel='rbf',gamma=gamma) 100 | cls.fit(X_train,y_train) 101 | train_scores.append(cls.score(X_train,y_train)) 102 | test_scores.append(cls.score(X_test, y_test)) 103 | fig=plt.figure() 104 | ax=fig.add_subplot(1,1,1) 105 | ax.plot(gammas,train_scores,label="Training score ",marker='+' ) 106 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' ) 107 | ax.set_title( "SVC_rbf") 108 | ax.set_xlabel(r"$\gamma$") 109 | ax.set_ylabel("score") 110 | ax.set_ylim(0,1.05) 111 | ax.legend(loc="best",framealpha=0.5) 112 | plt.show() 113 | def test_SVC_sigmoid(*data): 114 | ''' 115 | test SVC with sigmoid kernel with different gamma and coef0 116 | :param data: train_data,test_data, train_target, test_target 117 | :return: None 118 | ''' 119 | X_train,X_test,y_train,y_test=data 120 | fig=plt.figure() 121 | 122 | ### test gamma ,fixed coef0 with 0 #### 123 | gammas=np.logspace(-2,1) 124 | train_scores=[] 125 | test_scores=[] 126 | 127 | for gamma in gammas: 128 | cls=svm.SVC(kernel='sigmoid',gamma=gamma,coef0=0) 129 | cls.fit(X_train,y_train) 130 | train_scores.append(cls.score(X_train,y_train)) 131 | test_scores.append(cls.score(X_test, y_test)) 132 | ax=fig.add_subplot(1,2,1) 133 | ax.plot(gammas,train_scores,label="Training score ",marker='+' ) 134 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' ) 135 | ax.set_title( "SVC_sigmoid_gamma ") 136 | ax.set_xscale("log") 137 | ax.set_xlabel(r"$\gamma$") 138 | ax.set_ylabel("score") 139 | ax.set_ylim(0,1.05) 140 | ax.legend(loc="best",framealpha=0.5) 141 | ### test r,fixed gamma with 0.01 ###### 142 | rs=np.linspace(0,5) 143 | train_scores=[] 144 | test_scores=[] 145 | 146 | for r in rs: 147 | cls=svm.SVC(kernel='sigmoid',coef0=r,gamma=0.01) 148 | cls.fit(X_train,y_train) 149 | train_scores.append(cls.score(X_train,y_train)) 150 | test_scores.append(cls.score(X_test, y_test)) 151 | 
ax=fig.add_subplot(1,2,2) 152 | ax.plot(rs,train_scores,label="Training score ",marker='+' ) 153 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' ) 154 | ax.set_title( "SVC_sigmoid_r ") 155 | ax.set_xlabel(r"r") 156 | ax.set_ylabel("score") 157 | ax.set_ylim(0,1.05) 158 | ax.legend(loc="best",framealpha=0.5) 159 | plt.show() 160 | if __name__=="__main__": 161 | X_train,X_test,y_train,y_test=load_data_classfication() 162 | test_SVC_linear(X_train,X_test,y_train,y_test) 163 | test_SVC_poly(X_train,X_test,y_train,y_test) 164 | test_SVC_rbf(X_train,X_test,y_train,y_test) 165 | test_SVC_sigmoid(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /7. Support Vector Machine/7.2 SVM-unliner_SVC.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn import datasets, linear_model,cross_validation,svm 4 | 5 | def load_data_classfication(): 6 | ''' 7 | load iris data set 8 | :return: train_data,test_data, train_target, test_target 9 | ''' 10 | iris=datasets.load_iris() 11 | X_train=iris.data 12 | y_train=iris.target 13 | return cross_validation.train_test_split(X_train, y_train,test_size=0.25, 14 | random_state=0,stratify=y_train) 15 | 16 | def test_SVC_linear(*data): 17 | ''' 18 | test method of SVC 19 | :param data: train_data,test_data, train_target, test_target 20 | :return: None 21 | ''' 22 | X_train,X_test,y_train,y_test=data 23 | cls=svm.SVC(kernel='linear') 24 | cls.fit(X_train,y_train) 25 | print('Coefficients:{0}, intercept {1}'.format(cls.coef_,cls.intercept_)) 26 | print('Score: {0}' .format(cls.score(X_test, y_test))) 27 | def test_SVC_poly(*data): 28 | ''' 29 | test SVC with poly kernel and different degree, gamma, coef0 30 | :param data: train_data,test_data, train_target, test_target 31 | :return: None 32 | ''' 33 | X_train,X_test,y_train,y_test=data 34 | fig=plt.figure() 35 | ### test degree #### 36 | degrees=range(1,20) 37 | train_scores=[] 38 | test_scores=[] 39 | for degree in degrees: 40 | cls=svm.SVC(kernel='poly',degree=degree) 41 | cls.fit(X_train,y_train) 42 | train_scores.append(cls.score(X_train,y_train)) 43 | test_scores.append(cls.score(X_test, y_test)) 44 | ax=fig.add_subplot(1,3,1) 45 | ax.plot(degrees,train_scores,label="Training score ",marker='+' ) 46 | ax.plot(degrees,test_scores,label= " Testing score ",marker='o' ) 47 | ax.set_title( "SVC_poly_degree ") 48 | ax.set_xlabel("p") 49 | ax.set_ylabel("score") 50 | ax.set_ylim(0,1.05) 51 | ax.legend(loc="best",framealpha=0.5) 52 | 53 | ### test gamma ,fix degree with 3#### 54 | gammas=range(1,20) 55 | train_scores=[] 56 | test_scores=[] 57 | for gamma in gammas: 58 | cls=svm.SVC(kernel='poly',gamma=gamma,degree=3) 59 | cls.fit(X_train,y_train) 60 | train_scores.append(cls.score(X_train,y_train)) 61 | test_scores.append(cls.score(X_test, y_test)) 62 | ax=fig.add_subplot(1,3,2) 63 | ax.plot(gammas,train_scores,label="Training score ",marker='+' ) 64 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' ) 65 | ax.set_title( "SVC_poly_gamma ") 66 | ax.set_xlabel(r"$\gamma$") 67 | ax.set_ylabel("score") 68 | ax.set_ylim(0,1.05) 69 | ax.legend(loc="best",framealpha=0.5) 70 | ### test r ,fix gamma with 10 , degree fixed with 3###### 71 | rs=range(0,20) 72 | train_scores=[] 73 | test_scores=[] 74 | for r in rs: 75 | cls=svm.SVC(kernel='poly',gamma=10,degree=3,coef0=r) 76 | cls.fit(X_train,y_train) 77 | 
train_scores.append(cls.score(X_train,y_train)) 78 | test_scores.append(cls.score(X_test, y_test)) 79 | ax=fig.add_subplot(1,3,3) 80 | ax.plot(rs,train_scores,label="Training score ",marker='+' ) 81 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' ) 82 | ax.set_title( "SVC_poly_r ") 83 | ax.set_xlabel(r"r") 84 | ax.set_ylabel("score") 85 | ax.set_ylim(0,1.05) 86 | ax.legend(loc="best",framealpha=0.5) 87 | plt.show() 88 | def test_SVC_rbf(*data): 89 | ''' 90 | test SVC with gaussian kernel(rbf) and different gamma 91 | :param data: train_data,test_data, train_target, test_target 92 | :return: None 93 | ''' 94 | X_train,X_test,y_train,y_test=data 95 | gammas=range(1,20) 96 | train_scores=[] 97 | test_scores=[] 98 | for gamma in gammas: 99 | cls=svm.SVC(kernel='rbf',gamma=gamma) 100 | cls.fit(X_train,y_train) 101 | train_scores.append(cls.score(X_train,y_train)) 102 | test_scores.append(cls.score(X_test, y_test)) 103 | fig=plt.figure() 104 | ax=fig.add_subplot(1,1,1) 105 | ax.plot(gammas,train_scores,label="Training score ",marker='+' ) 106 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' ) 107 | ax.set_title( "SVC_rbf") 108 | ax.set_xlabel(r"$\gamma$") 109 | ax.set_ylabel("score") 110 | ax.set_ylim(0,1.05) 111 | ax.legend(loc="best",framealpha=0.5) 112 | plt.show() 113 | def test_SVC_sigmoid(*data): 114 | ''' 115 | test SVC with sigmoid kernel and different gamma and coef0 116 | :param data: train_data,test_data, train_target, test_target 117 | :return: None 118 | ''' 119 | X_train,X_test,y_train,y_test=data 120 | fig=plt.figure() 121 | 122 | ### test gamma ,fix coef0 with 0 #### 123 | gammas=np.logspace(-2,1) 124 | train_scores=[] 125 | test_scores=[] 126 | 127 | for gamma in gammas: 128 | cls=svm.SVC(kernel='sigmoid',gamma=gamma,coef0=0) 129 | cls.fit(X_train,y_train) 130 | train_scores.append(cls.score(X_train,y_train)) 131 | test_scores.append(cls.score(X_test, y_test)) 132 | ax=fig.add_subplot(1,2,1) 133 | ax.plot(gammas,train_scores,label="Training score ",marker='+' ) 134 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' ) 135 | ax.set_title( "SVC_sigmoid_gamma ") 136 | ax.set_xscale("log") 137 | ax.set_xlabel(r"$\gamma$") 138 | ax.set_ylabel("score") 139 | ax.set_ylim(0,1.05) 140 | ax.legend(loc="best",framealpha=0.5) 141 | ### test r,fix gamma with 0.01 ###### 142 | rs=np.linspace(0,5) 143 | train_scores=[] 144 | test_scores=[] 145 | 146 | for r in rs: 147 | cls=svm.SVC(kernel='sigmoid',coef0=r,gamma=0.01) 148 | cls.fit(X_train,y_train) 149 | train_scores.append(cls.score(X_train,y_train)) 150 | test_scores.append(cls.score(X_test, y_test)) 151 | ax=fig.add_subplot(1,2,2) 152 | ax.plot(rs,train_scores,label="Training score ",marker='+' ) 153 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' ) 154 | ax.set_title( "SVC_sigmoid_r ") 155 | ax.set_xlabel(r"r") 156 | ax.set_ylabel("score") 157 | ax.set_ylim(0,1.05) 158 | ax.legend(loc="best",framealpha=0.5) 159 | plt.show() 160 | if __name__=="__main__": 161 | X_train,X_test,y_train,y_test=load_data_classfication() 162 | test_SVC_linear(X_train,X_test,y_train,y_test) 163 | test_SVC_poly(X_train,X_test,y_train,y_test) 164 | test_SVC_rbf(X_train,X_test,y_train,y_test) 165 | test_SVC_sigmoid(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /7. 
Support Vector Machine/7.3 liner_SVR.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn import datasets, linear_model,cross_validation,svm 4 | def load_data_regression(): 5 | ''' 6 | load dataset for regression 7 | :return: train_data,test_data, train_target, test_target 8 | ''' 9 | diabetes = datasets.load_diabetes() 10 | return cross_validation.train_test_split(diabetes.data,diabetes.target, 11 | test_size=0.25,random_state=0) 12 | 13 | def test_LinearSVR(*data): 14 | ''' 15 | test Liner SVR 16 | :param data: train_data,test_data, train_target, test_target 17 | :return: None 18 | ''' 19 | X_train,X_test,y_train,y_test=data 20 | regr=svm.LinearSVR() 21 | regr.fit(X_train,y_train) 22 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_)) 23 | print('Score: {0}' .format(regr.score(X_test, y_test))) 24 | def test_LinearSVR_loss(*data): 25 | ''' 26 | test SVr with different loss function 27 | :param data: train_data,test_data, train_target, test_target 28 | :return: 29 | ''' 30 | X_train,X_test,y_train,y_test=data 31 | losses=['epsilon_insensitive','squared_epsilon_insensitive'] 32 | for loss in losses: 33 | regr=svm.LinearSVR(loss=loss) 34 | regr.fit(X_train,y_train) 35 | print("loss:{0}".format(loss)) 36 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_)) 37 | print('Score: {0}' .format(regr.score(X_test, y_test))) 38 | def test_LinearSVR_epsilon(*data): 39 | ''' 40 | test the performance with different epsilon 41 | :param data: train_data,test_data, train_target, test_target 42 | :return: None 43 | ''' 44 | X_train,X_test,y_train,y_test=data 45 | epsilons=np.logspace(-2,2) 46 | train_scores=[] 47 | test_scores=[] 48 | for epsilon in epsilons: 49 | regr=svm.LinearSVR(epsilon=epsilon,loss='squared_epsilon_insensitive') 50 | regr.fit(X_train,y_train) 51 | train_scores.append(regr.score(X_train, y_train)) 52 | test_scores.append(regr.score(X_test, y_test)) 53 | fig=plt.figure() 54 | ax=fig.add_subplot(1,1,1) 55 | ax.plot(epsilons,train_scores,label="Training score ",marker='+' ) 56 | ax.plot(epsilons,test_scores,label= " Testing score ",marker='o' ) 57 | ax.set_title( "LinearSVR_epsilon ") 58 | ax.set_xscale("log") 59 | ax.set_xlabel(r"$\epsilon$") 60 | ax.set_ylabel("score") 61 | ax.set_ylim(-1,1.05) 62 | ax.legend(loc="best",framealpha=0.5) 63 | plt.show() 64 | def test_LinearSVR_C(*data): 65 | ''' 66 | test the performance with different C 67 | :param data: train_data,test_data, train_target, test_target 68 | :return: None 69 | ''' 70 | X_train,X_test,y_train,y_test=data 71 | Cs=np.logspace(-1,2) 72 | train_scores=[] 73 | test_scores=[] 74 | for C in Cs: 75 | regr=svm.LinearSVR(epsilon=0.1,loss='squared_epsilon_insensitive',C=C) 76 | regr.fit(X_train,y_train) 77 | train_scores.append(regr.score(X_train, y_train)) 78 | test_scores.append(regr.score(X_test, y_test)) 79 | fig=plt.figure() 80 | ax=fig.add_subplot(1,1,1) 81 | ax.plot(Cs,train_scores,label="Training score ",marker='+' ) 82 | ax.plot(Cs,test_scores,label= " Testing score ",marker='o' ) 83 | ax.set_title( "LinearSVR_C ") 84 | ax.set_xscale("log") 85 | ax.set_xlabel(r"C") 86 | ax.set_ylabel("score") 87 | ax.set_ylim(-1,1.05) 88 | ax.legend(loc="best",framealpha=0.5) 89 | plt.show() 90 | if __name__=="__main__": 91 | X_train,X_test,y_train,y_test=load_data_regression() 92 | test_LinearSVR(X_train,X_test,y_train,y_test) 93 | test_LinearSVR_loss(X_train,X_test,y_train,y_test) 94 
| test_LinearSVR_epsilon(X_train,X_test,y_train,y_test) 95 | test_LinearSVR_C(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /7. Support Vector Machine/7.4 unliner_SVR.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn import datasets, linear_model,cross_validation,svm 4 | def load_data_regression(): 5 | ''' 6 | load dataset for regression 7 | :return: train_data,test_data, train_target, test_target 8 | ''' 9 | diabetes = datasets.load_diabetes() 10 | return cross_validation.train_test_split(diabetes.data,diabetes.target, 11 | test_size=0.25,random_state=0) 12 | 13 | def test_SVR_linear(*data): 14 | ''' 15 | test SVR with liner kernel 16 | :param data: train_data,test_data, train_target, test_target 17 | :return: None 18 | ''' 19 | X_train,X_test,y_train,y_test=data 20 | regr=svm.SVR(kernel='linear') 21 | regr.fit(X_train,y_train) 22 | print('Coefficients:{0}, intercept {1}'.format(regr.coef_,regr.intercept_)) 23 | print('Score: {0}' .format(regr.score(X_test, y_test))) 24 | 25 | def test_SVR_poly(*data): 26 | ''' 27 | test SVR with poly kernel, and different degree, gamma, coef0 28 | :param data: train_data,test_data, train_target, test_target 29 | :return: None 30 | ''' 31 | X_train,X_test,y_train,y_test=data 32 | fig=plt.figure() 33 | ### test degree #### 34 | degrees=range(1,20) 35 | train_scores=[] 36 | test_scores=[] 37 | for degree in degrees: 38 | regr=svm.SVR(kernel='poly',degree=degree,coef0=1) 39 | regr.fit(X_train,y_train) 40 | train_scores.append(regr.score(X_train,y_train)) 41 | test_scores.append(regr.score(X_test, y_test)) 42 | ax=fig.add_subplot(1,3,1) 43 | ax.plot(degrees,train_scores,label="Training score ",marker='+' ) 44 | ax.plot(degrees,test_scores,label= " Testing score ",marker='o' ) 45 | ax.set_title( "SVR_poly_degree r=1") 46 | ax.set_xlabel("p") 47 | ax.set_ylabel("score") 48 | ax.set_ylim(-1,1.) 49 | ax.legend(loc="best",framealpha=0.5) 50 | 51 | ### test gamma,fix degree with 3, fix coef0 with 1 #### 52 | gammas=range(1,40) 53 | train_scores=[] 54 | test_scores=[] 55 | for gamma in gammas: 56 | regr=svm.SVR(kernel='poly',gamma=gamma,degree=3,coef0=1) 57 | regr.fit(X_train,y_train) 58 | train_scores.append(regr.score(X_train,y_train)) 59 | test_scores.append(regr.score(X_test, y_test)) 60 | ax=fig.add_subplot(1,3,2) 61 | ax.plot(gammas,train_scores,label="Training score ",marker='+' ) 62 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' ) 63 | ax.set_title( "SVR_poly_gamma r=1") 64 | ax.set_xlabel(r"$\gamma$") 65 | ax.set_ylabel("score") 66 | ax.set_ylim(-1,1) 67 | ax.legend(loc="best",framealpha=0.5) 68 | ### test r,fix gamma with 20,fix degree with 3 ###### 69 | rs=range(0,20) 70 | train_scores=[] 71 | test_scores=[] 72 | for r in rs: 73 | regr=svm.SVR(kernel='poly',gamma=20,degree=3,coef0=r) 74 | regr.fit(X_train,y_train) 75 | train_scores.append(regr.score(X_train,y_train)) 76 | test_scores.append(regr.score(X_test, y_test)) 77 | ax=fig.add_subplot(1,3,3) 78 | ax.plot(rs,train_scores,label="Training score ",marker='+' ) 79 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' ) 80 | ax.set_title( "SVR_poly_r gamma=20 degree=3") 81 | ax.set_xlabel(r"r") 82 | ax.set_ylabel("score") 83 | ax.set_ylim(-1,1.) 
84 | ax.legend(loc="best",framealpha=0.5) 85 | plt.show() 86 | def test_SVR_rbf(*data): 87 | ''' 88 | test SVR with RBF kernel and different gamma 89 | :param data: train_data,test_data, train_target, test_target 90 | :return: None 91 | ''' 92 | X_train,X_test,y_train,y_test=data 93 | gammas=range(1,20) 94 | train_scores=[] 95 | test_scores=[] 96 | for gamma in gammas: 97 | regr=svm.SVR(kernel='rbf',gamma=gamma) 98 | regr.fit(X_train,y_train) 99 | train_scores.append(regr.score(X_train,y_train)) 100 | test_scores.append(regr.score(X_test, y_test)) 101 | fig=plt.figure() 102 | ax=fig.add_subplot(1,1,1) 103 | ax.plot(gammas,train_scores,label="Training score ",marker='+' ) 104 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' ) 105 | ax.set_title( "SVR_rbf") 106 | ax.set_xlabel(r"$\gamma$") 107 | ax.set_ylabel("score") 108 | ax.set_ylim(-1,1) 109 | ax.legend(loc="best",framealpha=0.5) 110 | plt.show() 111 | def test_SVR_sigmoid(*data): 112 | ''' 113 | test SVR with sigmoid kernel and different gamma, coef0 114 | :param data: train_data,test_data, train_target, test_target 115 | :return: None 116 | ''' 117 | X_train,X_test,y_train,y_test=data 118 | fig=plt.figure() 119 | 120 | ### test gammam,fix coef0 with 0.01 #### 121 | gammas=np.logspace(-1,3) 122 | train_scores=[] 123 | test_scores=[] 124 | 125 | for gamma in gammas: 126 | regr=svm.SVR(kernel='sigmoid',gamma=gamma,coef0=0.01) 127 | regr.fit(X_train,y_train) 128 | train_scores.append(regr.score(X_train,y_train)) 129 | test_scores.append(regr.score(X_test, y_test)) 130 | ax=fig.add_subplot(1,2,1) 131 | ax.plot(gammas,train_scores,label="Training score ",marker='+' ) 132 | ax.plot(gammas,test_scores,label= " Testing score ",marker='o' ) 133 | ax.set_title( "SVR_sigmoid_gamma r=0.01") 134 | ax.set_xscale("log") 135 | ax.set_xlabel(r"$\gamma$") 136 | ax.set_ylabel("score") 137 | ax.set_ylim(-1,1) 138 | ax.legend(loc="best",framealpha=0.5) 139 | ### test r ,fix gamma with 10 ###### 140 | rs=np.linspace(0,5) 141 | train_scores=[] 142 | test_scores=[] 143 | 144 | for r in rs: 145 | regr=svm.SVR(kernel='sigmoid',coef0=r,gamma=10) 146 | regr.fit(X_train,y_train) 147 | train_scores.append(regr.score(X_train,y_train)) 148 | test_scores.append(regr.score(X_test, y_test)) 149 | ax=fig.add_subplot(1,2,2) 150 | ax.plot(rs,train_scores,label="Training score ",marker='+' ) 151 | ax.plot(rs,test_scores,label= " Testing score ",marker='o' ) 152 | ax.set_title( "SVR_sigmoid_r gamma=10") 153 | ax.set_xlabel(r"r") 154 | ax.set_ylabel("score") 155 | ax.set_ylim(-1,1) 156 | ax.legend(loc="best",framealpha=0.5) 157 | plt.show() 158 | if __name__=="__main__": 159 | X_train,X_test,y_train,y_test=load_data_regression() 160 | test_SVR_linear(X_train,X_test,y_train,y_test) 161 | test_SVR_poly(X_train,X_test,y_train,y_test) 162 | test_SVR_rbf(X_train,X_test,y_train,y_test) 163 | test_SVR_sigmoid(X_train,X_test,y_train,y_test) -------------------------------------------------------------------------------- /7. 
Support Vector Machine/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Overview
3 | 
4 | With the kernel trick, the Support Vector Machine can perform nonlinear classification. The model is defined as the separating hyperplane that maximises the classification margin in the feature space; the objective function is the maximum under the KKT conditions (equivalently, the minimum of the dual problem).
5 | 
6 | # Key concepts:
7 | 
8 | Decision function, KKT conditions, dual problem, Lagrangian, penalty parameter.
9 | 
10 | # Common kernel functions:
11 | 
12 | Polynomial kernel, Gaussian (RBF) kernel, sigmoid kernel, and so on.
13 | 
14 | # Support vector regression
15 | 
16 | In Support Vector Regression (SVR) the loss is a 1-norm loss, but a tolerance parameter is usually set so that the loss only counts deviations larger than that parameter (the epsilon-insensitive loss).
17 | 
18 | # Advantages and disadvantages of SVM:
19 | 
20 | Advantages: it can solve nonlinear problems, and it avoids the structure-selection and local-minimum issues of neural networks.
21 | 
22 | Disadvantages: it is sensitive to missing data; for nonlinear problems it depends heavily on the choice of kernel function and there is no general-purpose recipe; mainstream algorithms have O(n2) time complexity, so large-scale data requires a huge amount of computation; and the result depends strongly on the hyper-parameters (for example the RBF kernel's gamma and the penalty term C).
23 | 
24 | # Code examples: GitHub
25 | 
26 | 1. SVM linear classification - SVC
27 | 
28 | https://github.com/JasonK93/ML-note/blob/master/7.%20Support%20Vector%20Machine/7.1%20SVM-liner_SVC.py
29 | 
30 | 2. SVM nonlinear classification - SVC
31 | 
32 | https://github.com/JasonK93/ML-note/blob/master/7.%20Support%20Vector%20Machine/7.2%20SVM-unliner_SVC.py
33 | 
34 | 3. SVM linear regression - SVR
35 | 
36 | https://github.com/JasonK93/ML-note/blob/master/7.%20Support%20Vector%20Machine/7.3%20liner_SVR.py
37 | 
38 | 4. SVM nonlinear regression - SVR
39 | 
40 | https://github.com/JasonK93/ML-note/blob/master/7.%20Support%20Vector%20Machine/7.4%20unliner_SVR.py
41 | 
42 | 
--------------------------------------------------------------------------------
/8. Artificial Neural Network/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 | 
3 | Inspired by biology, an artificial neural network is built from a collection of simple units. ANN is a huge branch of machine learning with hundreds of major algorithms, for example the Perceptron Neural Network, back-propagation (BP) networks, the Hopfield Neural Network, the Self-Organizing Map (SOM), Learning Vector Quantization (LVQ), and so on.
4 | 
5 | # Key concepts
6 | 
7 | Perceptron, loss function, dual form, multilayer perceptron, feed-forward neural network, back-propagation network, activation function, hidden layer, learning rate, convergence speed.
8 | 
9 | # Code examples: GitHub
10 | 
--------------------------------------------------------------------------------
/8. Artificial Neural Network/test.py:
--------------------------------------------------------------------------------
1 | # this is some part for ANN
--------------------------------------------------------------------------------
/9. Semi-Supervised Learning/9.1 labelpropogation.py:
--------------------------------------------------------------------------------
1 | 
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from sklearn import metrics
5 | from sklearn import datasets
6 | from sklearn.semi_supervised import LabelPropagation
7 | 
8 | def load_data():
9 |     '''
10 |     load digit data set
11 |     :return: data( have target), data_target, data( not have target)
12 |     '''
13 |     digits = datasets.load_digits()
14 |     ###### shuffle ########
15 |     rng = np.random.RandomState(0)
16 |     indices = np.arange(len(digits.data))
17 |     rng.shuffle(indices)
18 |     X = digits.data[indices]
19 |     y = digits.target[indices]
20 | 
21 |     n_labeled_points = int(len(y)/10)
22 |     unlabeled_indices = np.arange(len(y))[n_labeled_points:]
23 | 
24 |     return X,y,unlabeled_indices
25 | 
26 | def test_LabelPropagation(*data):
27 |     '''
28 |     test the usage of LabelPropagation
29 |     :param data: a tuple of: data, data_target, indices of the unlabeled samples
30 |     :return: None
31 |     '''
32 |     X,y,unlabeled_indices=data
33 |     y_train=np.copy(y) # must copy, y is still needed later
34 |     y_train[unlabeled_indices]=-1 # the label of every unlabeled sample is set to -1
35 |     clf=LabelPropagation(max_iter=100,kernel='rbf',gamma=0.1)
36 |     clf.fit(X,y_train)
37 |     ### get the prediction accuracy
38 |     predicted_labels = clf.transduction_[unlabeled_indices] # predicted labels
39 |     true_labels = y[unlabeled_indices] # true labels
40 |     print("Accuracy:%f"%metrics.accuracy_score(true_labels,predicted_labels))
41 |     # or: print("Accuracy:%f"%clf.score(X[unlabeled_indices],true_labels))
42 | def test_LabelPropagation_rbf(*data):
43 |     '''
44 |     test LabelPropagation with rbf kernel, and different alpha and gamma
45 |     :param data: data( have target), data_target, data( not have target)
46 |     :return: None
47 |     '''
48 |     X,y,unlabeled_indices=data
49 |     y_train=np.copy(y)
50 |     y_train[unlabeled_indices]=-1
51 | 
52 |     fig=plt.figure()
53 |     ax=fig.add_subplot(1,1,1)
54 |     alphas=np.linspace(0.01,1,num=10,endpoint=True)
55 |     gammas=np.logspace(-2,2,num=50)
56 |     colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
57 |         (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
58 |     ## train and graph
59 |     for alpha,color in zip(alphas,colors):
60 |         scores=[]
61 |         for gamma in gammas:
62 |             clf=LabelPropagation(max_iter=100,gamma=gamma,alpha=alpha,kernel='rbf')
63 |             clf.fit(X,y_train)
64 |             scores.append(clf.score(X[unlabeled_indices],y[unlabeled_indices]))
65 |         ax.plot(gammas,scores,label=r"$\alpha=%s$"%alpha,color=color)
66 | 
67 | 
68 |     ax.set_xlabel(r"$\gamma$")
69 |     ax.set_ylabel("score")
70 |     ax.set_xscale("log")
71 |     ax.legend(loc="best")
72 |     ax.set_title("LabelPropagation rbf kernel")
73 |     plt.show()
74 | def test_LabelPropagation_knn(*data):
75 |     '''
76 |     test LabelPropagation with knn kernel, and different alpha , n_neighbors
77 |     :param data: data( have target), data_target, data( not have target)
78 |     :return: None
79 |     '''
80 |     X,y,unlabeled_indices=data
81 |     y_train=np.copy(y)
82 |     y_train[unlabeled_indices]=-1
83 | 
84 |     fig=plt.figure()
85 |     ax=fig.add_subplot(1,1,1)
86 |     alphas=np.linspace(0.01,1,num=10,endpoint=True)
87 |     Ks=[1,2,3,4,5,8,10,15,20,25,30,35,40,50]
88 |     colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5),
89 |         (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),)
90 | 
91 |     for alpha,color in zip(alphas,colors):
92 |         scores=[]
93 |         for K in Ks:
94 |             clf=LabelPropagation(max_iter=100,n_neighbors=K,alpha=alpha,kernel='knn')
95 |             clf.fit(X,y_train)
96 |             scores.append(clf.score(X[unlabeled_indices],y[unlabeled_indices]))
97 | 
ax.plot(Ks,scores,label=r"$\alpha=%s$"%alpha,color=color) 98 | 99 | 100 | ax.set_xlabel(r"$k$") 101 | ax.set_ylabel("score") 102 | ax.legend(loc="best") 103 | ax.set_title("LabelPropagation knn kernel") 104 | plt.show() 105 | if __name__=='__main__': 106 | data=load_data() 107 | test_LabelPropagation(*data) 108 | test_LabelPropagation_rbf(*data) 109 | test_LabelPropagation_knn(*data) -------------------------------------------------------------------------------- /9. Semi-Supervised Learning/9.2LabelSpreading.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn import metrics 4 | from sklearn import datasets 5 | from sklearn.semi_supervised.label_propagation import LabelSpreading 6 | 7 | def load_data(): 8 | ''' 9 | load data 10 | :return: data( have target), data_target, data( not have target) 11 | ''' 12 | digits = datasets.load_digits() 13 | 14 | rng = np.random.RandomState(0) 15 | indices = np.arange(len(digits.data)) 16 | rng.shuffle(indices) 17 | X = digits.data[indices] 18 | y = digits.target[indices] 19 | 20 | n_labeled_points = int(len(y)/10) 21 | unlabeled_indices = np.arange(len(y))[n_labeled_points:] 22 | 23 | return X,y,unlabeled_indices 24 | 25 | def test_LabelSpreading(*data): 26 | ''' 27 | test LabelSpreading 28 | :param data: data( have target), data_target, data( not have target) 29 | :return: None 30 | ''' 31 | X,y,unlabeled_indices=data 32 | y_train=np.copy(y) 33 | y_train[unlabeled_indices]=-1 34 | clf=LabelSpreading(max_iter=100,kernel='rbf',gamma=0.1) 35 | clf.fit(X,y_train) 36 | 37 | predicted_labels = clf.transduction_[unlabeled_indices] 38 | true_labels = y[unlabeled_indices] 39 | print("Accuracy:%f"%metrics.accuracy_score(true_labels,predicted_labels)) 40 | 41 | def test_LabelSpreading_rbf(*data): 42 | ''' 43 | test LabelSpreading with rbf kernel and different alpha, gamma 44 | :param data: data( have target), data_target, data( not have target) 45 | :return: None 46 | ''' 47 | X,y,unlabeled_indices=data 48 | y_train=np.copy(y) 49 | y_train[unlabeled_indices]=-1 50 | 51 | fig=plt.figure() 52 | ax=fig.add_subplot(1,1,1) 53 | alphas=np.linspace(0.01,1,num=10,endpoint=True) 54 | gammas=np.logspace(-2,2,num=50) 55 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 56 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 57 | 58 | for alpha,color in zip(alphas,colors): 59 | scores=[] 60 | for gamma in gammas: 61 | clf=LabelSpreading(max_iter=100,gamma=gamma,alpha=alpha,kernel='rbf') 62 | clf.fit(X,y_train) 63 | scores.append(clf.score(X[unlabeled_indices],y[unlabeled_indices])) 64 | ax.plot(gammas,scores,label=r"$\alpha=%s$"%alpha,color=color) 65 | 66 | 67 | ax.set_xlabel(r"$\gamma$") 68 | ax.set_ylabel("score") 69 | ax.set_xscale("log") 70 | ax.legend(loc="best") 71 | ax.set_title("LabelSpreading rbf kernel") 72 | plt.show() 73 | def test_LabelSpreading_knn(*data): 74 | ''' 75 | test LabelSpreading with knn kernel, and different alpha , n_neighbors 76 | :param data: data( have target), data_target, data( not have target) 77 | :return: None 78 | ''' 79 | X,y,unlabeled_indices=data 80 | y_train=np.copy(y) 81 | y_train[unlabeled_indices]=-1 82 | 83 | fig=plt.figure() 84 | ax=fig.add_subplot(1,1,1) 85 | alphas=np.linspace(0.01,1,num=10,endpoint=True) 86 | Ks=[1,2,3,4,5,8,10,15,20,25,30,35,40,50] 87 | colors=((1,0,0),(0,1,0),(0,0,1),(0.5,0.5,0),(0,0.5,0.5),(0.5,0,0.5), 88 | (0.4,0.6,0),(0.6,0.4,0),(0,0.6,0.4),(0.5,0.3,0.2),) 89 | 90 | for 
alpha,color in zip(alphas,colors):
91 |         scores=[]
92 |         for K in Ks:
93 |             clf=LabelSpreading(kernel='knn',max_iter=100,n_neighbors=K,alpha=alpha)
94 |             clf.fit(X,y_train)
95 |             scores.append(clf.score(X[unlabeled_indices],y[unlabeled_indices]))
96 |         ax.plot(Ks,scores,label=r"$\alpha=%s$"%alpha,color=color)
97 | 
98 | 
99 |     ax.set_xlabel(r"$k$")
100 |     ax.set_ylabel("score")
101 |     ax.legend(loc="best")
102 |     ax.set_title("LabelSpreading knn kernel")
103 |     plt.show()
104 | if __name__=='__main__':
105 |     data=load_data()
106 |     test_LabelSpreading(*data)
107 |     test_LabelSpreading_rbf(*data)
108 |     test_LabelSpreading_knn(*data)
--------------------------------------------------------------------------------
/9. Semi-Supervised Learning/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 | 
3 | Semi-supervised learning uses labeled data and unlabeled data together to build a suitable classification model.
4 | 
5 | # Methods
6 | 
7 | Generative semi-supervised methods, for example those built on generative Gaussian mixture models; graph-based semi-supervised learning; and so on.
8 | 
9 | # Remarks
10 | 
11 | To justify itself, semi-supervised learning must first outperform a model trained only on the labeled data. Its goal is to improve generalisation, but an improvement is not guaranteed: it depends on making full use of domain knowledge when designing the model.
12 | 
13 | # Code examples: GitHub
14 | 
15 | 1. LabelPropagation:
16 | 
17 | https://github.com/JasonK93/ML-note/blob/master/9.%20Semi-Supervised%20Learning/9.1%20labelpropogation.py
18 | 
19 | 2. LabelSpreading:
20 | 
21 | https://github.com/JasonK93/ML-note/blob/master/9.%20Semi-Supervised%20Learning/9.2LabelSpreading.py
22 | 
23 | 
24 | 
25 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ML-note
2 | ML Learning
3 | 
4 | This repo includes some Machine Learning methods. The scripts can be run directly once the dependencies are installed. A short description is provided in each directory.
5 | 
6 | 
--------------------------------------------------------------------------------
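A hedged note on the dependencies mentioned above: the scripts import numpy, matplotlib and scikit-learn throughout, and many of them use sklearn.cross_validation, a module that was removed in scikit-learn 0.20, so an older scikit-learn release is assumed here. A minimal sketch that checks this assumption before running the scripts:

# Minimal environment check (sketch; assumes scikit-learn < 0.20 because the
# scripts import sklearn.cross_validation).
import numpy
import matplotlib
import sklearn

print("numpy {0} / matplotlib {1} / scikit-learn {2}".format(
    numpy.__version__, matplotlib.__version__, sklearn.__version__))
try:
    from sklearn import cross_validation  # removed in scikit-learn 0.20
    print("sklearn.cross_validation is available: the scripts should run as written.")
except ImportError:
    print("sklearn.cross_validation is missing: replace it with sklearn.model_selection, "
          "where train_test_split keeps the same signature.")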