├── Code ├── 1-1多项式拟合sin函数.py ├── 1-2欠拟合,拟合,过拟合.py ├── 1-3糖尿病的线性拟合.py ├── 1-4线性回归.py ├── 1-5弟弟妹妹身高的线性拟合.py ├── 10-1EM.py ├── 10-2Kmean和GaussianMixture.py ├── 11-1马尔可夫.py ├── 11-2马尔可夫训练模型.py ├── 11-3马尔可夫解码.py ├── 11-4马尔可夫(维特比最短路径).py ├── 11-5马尔可夫算法(对应课本P213习题).py ├── 12-1SVD数据压缩.py ├── 12-2PCA数据降维.py ├── 13-1文本特征处理(泰坦尼克).py ├── 13-2文本特征处理-词带.py ├── 13-3文本特征处理(泰坦尼克)-TFIDF.py ├── 13-4文本特征处理-单词二维化.py ├── 13-5文本特征处理-机器语言学习.py ├── 14-1交叉验证.py ├── 14-2Pipeline.py ├── 2-1(重要)二分类模型(感知器学习的原始算法).py ├── 2-2二分类模型(对偶算法).py ├── 2-3二分类模型(sklearn包里的分类算法).py ├── 2-4二分类的课后练习.py ├── 3-1K近邻的距离图.py ├── 3-2K近邻法距离加权与统一的对比.py ├── 3-2(1)K近邻法距离加权与统一的对比.py ├── 3-3KNN算法(原始和包).py ├── 3-4KNN(糖尿病).py ├── 3-5KNN(cifar-10).py ├── 4-1原始贝叶斯.py ├── 4-2导包的高斯贝叶斯.py ├── 4-3高斯伯努利多项式贝叶斯.py ├── 4-4高斯做的分类(数字样本).py ├── 4-5高斯(鱼样本).py ├── 4-6高斯(cifar-10).py ├── 5-1原始决策树.py ├── 5-2决策树(鸢尾样本).py ├── 5-3决策树(数字样本).py ├── 5-4多层决策树回归.py ├── 5-5决策树(鱼样本).py ├── 5-6决策树(cifar-10).py ├── 5-7决策树剪枝(乳腺癌样本).py ├── 5-8计算熵(entropy)的函数.py ├── 6-1逻辑斯蒂的概率分布.py ├── 6-2原始逻辑斯蒂(鸢尾样本).py ├── 6-2逻辑斯蒂(鸢尾样本).py ├── 6-4逻辑斯蒂(数字样本).py ├── 6-5逻辑斯蒂(乳腺癌样本)评估(二分类).py ├── 6-6逻辑斯蒂(广告样本).py ├── 7-1查找best参数.py ├── 7-2决策树(数字样本)评估(多分类).py ├── 7-2官网svm(花样本)评估(多分类).py ├── 8-1原始svm.py ├── 8-2svc参数讲解.py ├── 8-3核是可以选的.py ├── 8-4SVC(数字样本).py ├── 8-5SVC(cifar-10).py ├── 9-1bagging三种集成学习方式.py ├── 9-2原始Adaboost.py ├── 9-3Adaboost与RandomForest.py ├── 9-4集成学习(酒样本).py └── advertising.csv ├── Code_2022 ├── class10-test1.py ├── class10-test2.py ├── class10-test3.py ├── class11-test1.py ├── class11-test2.py ├── class12-test.py ├── class12-test2.py ├── class13-test1.py ├── class13-test2.py ├── class13-test3.py ├── class13-test4.py ├── class13-test5.py ├── class13-test6.py ├── class13-test7.py ├── class14-test1.py ├── class14-test2.py ├── class14-test3.py ├── class15-test1.py ├── class15-test2.py ├── class2-test1.py ├── class2-test2.py ├── class2-test3.py ├── class2-test4.py ├── class2-test5.py ├── class3-test1.py ├── class4-test1.py ├── class4-test2.py ├── class4-test3.py ├── class5-test1.py ├── class5-test2.py ├── class6-test1.py ├── class6-test2.py ├── class7-test1.py ├── class7-test2.py ├── class7-test3.py ├── class7-test4.py ├── class8-test1.py ├── class8-test2.py ├── class8-test3.py ├── class8-test4.py ├── class9-test1.py ├── class9-test2.py ├── class9-test3.py └── readme ├── LICENSE ├── README.md └── Slides ├── A Step by Step Backpropagation Example for Regression using an One-hot Encoded Categorical Variable .pdf ├── CH00 OverView.pdf ├── CH01 Statistical Learning.pdf ├── CH02 Perceptron.pdf ├── CH03 KNN.pdf ├── CH04 NaiveBayes.pdf ├── CH05 DecisionTree.pdf ├── CH06 LogicRegression and Maximum Entropy Model.pdf ├── CH07 SVM.pdf ├── CH08 Boosting.pdf ├── CH09 EM.pdf ├── CH10 Hidden Markov Model.pdf ├── CH16 PCA.pdf ├── CH21 PageRank.pdf ├── CH22 Transformer.pdf ├── CHX0 Summary.pdf └── CHX5 NN-CNN.pdf /Code/1-1多项式拟合sin函数.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.optimize import leastsq 3 | import matplotlib.pyplot as plt 4 | # %matplotlib inline 5 | 6 | # 目标函数 7 | def real_func(x): 8 | return np.sin(2*np.pi*x) 9 | 10 | # 多项式 11 | def fit_func(p, x): 12 | f = np.poly1d(p) 13 | return f(x) 14 | 15 | # 残差 16 | def residuals_func(p, x, y): 17 | ret = fit_func(p, x) - y 18 | return ret 19 | 20 | f = np.poly1d([1,1,1]) 21 | print(f(6)) 22 | 23 | # 十个点 24 | x = np.linspace(0, 1, 10) 25 | x_points = np.linspace(0, 1, 1000) 26 | # 加上正态分布噪音的目标函数的值 27 | y_ = real_func(x) 28 | y = [np.random.normal(0, 0.1) + y1 for y1 
in y_] 29 | 30 | def fitting(M=0): 31 | """ 32 | M 为 多项式的次数 33 | """ 34 | # 随机初始化多项式参数 35 | p_init = np.random.rand(M + 1) 36 | # 最小二乘法 37 | p_lsq = leastsq(residuals_func, p_init, args=(x, y)) 38 | print('Fitting Parameters:', p_lsq[0]) 39 | # 可视化 40 | plt.plot(x_points, real_func(x_points), label='real') 41 | plt.plot(x_points, fit_func(p_lsq[0], x_points), label='fitted curve') 42 | plt.plot(x, y, 'bo', label='noise') 43 | plt.legend() 44 | plt.show() 45 | return p_lsq 46 | 47 | p_lsq_0 = fitting(M=3) 48 | 49 | 50 | -------------------------------------------------------------------------------- /Code/1-2欠拟合,拟合,过拟合.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.model_selection import cross_val_score 7 | 8 | 9 | def true_fun(X): 10 | return np.cos(1.5 * np.pi * X) 11 | 12 | np.random.seed(0) 13 | 14 | n_samples = 30 15 | degrees = [1, 4, 15] 16 | 17 | X = np.sort(np.random.rand(n_samples)) 18 | y = true_fun(X) + np.random.randn(n_samples) * 0.1 19 | 20 | plt.figure(figsize=(14, 5)) 21 | for i in range(len(degrees)): 22 | ax = plt.subplot(1, len(degrees), i + 1) 23 | plt.setp(ax, xticks=(), yticks=()) 24 | 25 | polynomial_features = PolynomialFeatures(degree=degrees[i], 26 | include_bias=False) 27 | linear_regression = LinearRegression() 28 | pipeline = Pipeline([("polynomial_features", polynomial_features), 29 | ("linear_regression", linear_regression)]) 30 | pipeline.fit(X[:, np.newaxis], y) 31 | 32 | # Evaluate the models using crossvalidation 33 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 34 | scoring="neg_mean_squared_error", cv=10) 35 | 36 | X_test = np.linspace(0, 1, 100) 37 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 38 | plt.plot(X_test, true_fun(X_test), label="True function") 39 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 40 | plt.xlabel("x") 41 | plt.ylabel("y") 42 | plt.xlim((0, 1)) 43 | plt.ylim((-2, 2)) 44 | plt.legend(loc="best") 45 | plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format( 46 | degrees[i], -scores.mean(), scores.std())) 47 | plt.show() 48 | -------------------------------------------------------------------------------- /Code/1-3糖尿病的线性拟合.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from sklearn import datasets, linear_model 5 | from sklearn.metrics import mean_squared_error, r2_score 6 | 7 | # Load the diabetes dataset 8 | diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True) 9 | 10 | # Use only one feature 11 | diabetes_X_1 = diabetes_X[:, np.newaxis, 2] 12 | 13 | print(diabetes_X.shape) 14 | print(diabetes_X) 15 | 16 | import matplotlib.pyplot as plt 17 | import numpy as np 18 | from sklearn import datasets, linear_model 19 | from sklearn.metrics import mean_squared_error, r2_score 20 | 21 | # Load the diabetes dataset 22 | diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True) 23 | 24 | # Use only one feature 25 | diabetes_X = diabetes_X[:, np.newaxis, 2] 26 | 27 | # Split the data into training/testing sets 28 | diabetes_X_train = diabetes_X[:-20] 29 | diabetes_X_test = diabetes_X[-20:] 30 | 31 | # Split the targets into training/testing sets 32 | diabetes_y_train = diabetes_y[:-20] 33 | diabetes_y_test = 
diabetes_y[-20:] 34 | 35 | # Create linear regression object 36 | regr = linear_model.LinearRegression() 37 | 38 | # Train the model using the training sets 39 | regr.fit(diabetes_X_train, diabetes_y_train) 40 | 41 | # Make predictions using the testing set 42 | diabetes_y_pred = regr.predict(diabetes_X_test) 43 | 44 | # The coefficients 45 | print('Coefficients: \n', regr.coef_) 46 | # The mean squared error 47 | print('Mean squared error: %.2f' 48 | % mean_squared_error(diabetes_y_test, diabetes_y_pred)) 49 | # The coefficient of determination: 1 is perfect prediction 50 | print('Coefficient of determination: %.2f' 51 | % r2_score(diabetes_y_test, diabetes_y_pred)) 52 | 53 | # Plot outputs 54 | plt.scatter(diabetes_X_test, diabetes_y_test, color='black') 55 | plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3) 56 | 57 | plt.xticks(()) 58 | plt.yticks(()) 59 | plt.show() 60 | 61 | -------------------------------------------------------------------------------- /Code/1-4线性回归.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) 4 | # y = 1 * x_0 + 2 * x_1 + 3 5 | y = np.dot(X, np.array([1, 2])) + 3 6 | reg = LinearRegression().fit(X, y) 7 | print(reg.score(X, y)) 8 | print(reg.coef_) 9 | print(reg.intercept_) 10 | print(reg.predict(np.array([[3, 5]]))) 11 | 12 | -------------------------------------------------------------------------------- /Code/1-5弟弟妹妹身高的线性拟合.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | import matplotlib.pyplot as plt 4 | a = [[71],[68],[66],[67],[70],[71],[70],[73],[72],[65],[66]] 5 | b = [69,64,65,63,65,62,65,64,66,59,62] 6 | def fit_func(p, x): 7 | f = np.poly1d(p) 8 | return f(x) 9 | def residuals_func(p, x, y): 10 | ret = fit_func(p, x) - y 11 | return ret 12 | 13 | plt.scatter(a, b, label = 'real data') 14 | plt.xlabel('bother height') 15 | plt.ylabel('sister height') 16 | plt.title('this is a demo') 17 | 18 | reg = LinearRegression().fit(a, b) 19 | y_pred = reg.predict(a) 20 | plt.plot(a, y_pred, color='red', label = 'prediect') 21 | plt.legend() # 将标注显示出来 22 | plt.show() 23 | -------------------------------------------------------------------------------- /Code/10-1EM.py: -------------------------------------------------------------------------------- 1 | 2 | import itertools 3 | 4 | import numpy as np 5 | from scipy import linalg 6 | import matplotlib.pyplot as plt 7 | import matplotlib as mpl 8 | 9 | from sklearn import mixture 10 | 11 | color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 12 | 'darkorange']) 13 | 14 | 15 | def plot_results(X, Y_, means, covariances, index, title): 16 | splot = plt.subplot(2, 1, 1 + index) 17 | for i, (mean, covar, color) in enumerate(zip( 18 | means, covariances, color_iter)): 19 | v, w = linalg.eigh(covar) 20 | v = 2. * np.sqrt(2.) * np.sqrt(v) 21 | u = w[0] / linalg.norm(w[0]) 22 | # as the DP will not use every component it has access to 23 | # unless it needs it, we shouldn't plot the redundant 24 | # components. 25 | if not np.any(Y_ == i): 26 | continue 27 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 28 | 29 | # Plot an ellipse to show the Gaussian component 30 | angle = np.arctan(u[1] / u[0]) 31 | angle = 180. 
* angle / np.pi # convert to degrees 32 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) 33 | ell.set_clip_box(splot.bbox) 34 | ell.set_alpha(0.5) 35 | splot.add_artist(ell) 36 | 37 | plt.xlim(-9., 5.) 38 | plt.ylim(-3., 6.) 39 | plt.xticks(()) 40 | plt.yticks(()) 41 | plt.title(title) 42 | 43 | 44 | # Number of samples per component 45 | n_samples = 500 46 | 47 | # Generate random sample, two components 48 | np.random.seed(0) 49 | C = np.array([[0., -0.1], [1.7, .4]]) 50 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), 51 | .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] 52 | 53 | # Fit a Gaussian mixture with EM using five components 54 | gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X) 55 | plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, 56 | 'Gaussian Mixture') 57 | 58 | 59 | 60 | plt.show() 61 | 62 | 63 | 64 | print(gmm.weights_) 65 | print(gmm.means_) 66 | print(gmm.covariances_) 67 | 68 | from sklearn import mixture 69 | 70 | 71 | 72 | color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 73 | 'darkorange']) 74 | 75 | 76 | def plot_results(X, Y_, means, covariances, index, title): 77 | splot = plt.subplot(2, 1, 1 + index) 78 | for i, (mean, covar, color) in enumerate(zip( 79 | means, covariances, color_iter)): 80 | v, w = linalg.eigh(covar) 81 | v = 2. * np.sqrt(2.) * np.sqrt(v) 82 | u = w[0] / linalg.norm(w[0]) 83 | # as the DP will not use every component it has access to 84 | # unless it needs it, we shouldn't plot the redundant 85 | # components. 86 | if not np.any(Y_ == i): 87 | continue 88 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 89 | 90 | # Plot an ellipse to show the Gaussian component 91 | angle = np.arctan(u[1] / u[0]) 92 | angle = 180. * angle / np.pi # convert to degrees 93 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) 94 | ell.set_clip_box(splot.bbox) 95 | ell.set_alpha(0.5) 96 | splot.add_artist(ell) 97 | 98 | plt.xlim(-9., 5.) 99 | plt.ylim(-3., 6.) 100 | plt.xticks(()) 101 | plt.yticks(()) 102 | plt.title(title) 103 | 104 | 105 | -------------------------------------------------------------------------------- /Code/10-2Kmean和GaussianMixture.py: -------------------------------------------------------------------------------- 1 | 2 | import itertools 3 | 4 | import numpy as np 5 | from scipy import linalg 6 | import matplotlib.pyplot as plt 7 | import matplotlib as mpl 8 | 9 | from sklearn import mixture 10 | 11 | color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 12 | 'darkorange']) 13 | 14 | 15 | def plot_results(X, Y_, means, covariances, index, title): 16 | splot = plt.subplot(2, 1, 1 + index) 17 | for i, (mean, covar, color) in enumerate(zip( 18 | means, covariances, color_iter)): 19 | v, w = linalg.eigh(covar) 20 | v = 2. * np.sqrt(2.) * np.sqrt(v) 21 | u = w[0] / linalg.norm(w[0]) 22 | # as the DP will not use every component it has access to 23 | # unless it needs it, we shouldn't plot the redundant 24 | # components. 25 | if not np.any(Y_ == i): 26 | continue 27 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 28 | 29 | # Plot an ellipse to show the Gaussian component 30 | angle = np.arctan(u[1] / u[0]) 31 | angle = 180. * angle / np.pi # convert to degrees 32 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) 33 | ell.set_clip_box(splot.bbox) 34 | ell.set_alpha(0.5) 35 | splot.add_artist(ell) 36 | 37 | plt.xlim(-9., 5.) 38 | plt.ylim(-3., 6.) 
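# Note on the ellipse drawn above: eigh(covar) returns the eigenvalues/eigenvectors of the
# 2x2 covariance matrix, v = 2*sqrt(2)*sqrt(eigenvalue) is used as the ellipse width/height
# (proportional to the standard deviation along each principal axis), and `angle` rotates the
# patch onto the leading eigenvector, so each ellipse sketches the shape of one Gaussian component.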
39 | plt.xticks(()) 40 | plt.yticks(()) 41 | plt.title(title) 42 | 43 | 44 | # Number of samples per component 45 | n_samples = 500 46 | 47 | # Generate random sample, two components 48 | np.random.seed(0) 49 | C = np.array([[0., -0.1], [1.7, .4]]) 50 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), 51 | .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] 52 | 53 | # Fit a Gaussian mixture with EM using five components 54 | gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X) 55 | plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, 56 | 'Gaussian Mixture') 57 | 58 | 59 | 60 | plt.show() 61 | 62 | print(gmm.weights_) 63 | print(gmm.means_) 64 | print(gmm.covariances_) 65 | 66 | from sklearn.cluster import KMeans 67 | import numpy as np 68 | X = np.array([[1, 2], [1, 4], [1, 0],[4, 2], [4, 4], [4, 0]]) 69 | kmeans = KMeans(n_clusters=2, random_state=0).fit(X) 70 | print(kmeans.labels_) 71 | print(kmeans.predict([[0, 0], [4, 4]])) 72 | print(kmeans.cluster_centers_) 73 | 74 | 75 | #euclidian distance between 2 data points. For as many data points as necessary. 76 | def euclidean_distance(a, b): 77 | return np.linalg.norm(a-b) 78 | 79 | 80 | 81 | 82 | 83 | def kmeans(data,k=3): 84 | m = data.shape[0] 85 | index = random.sample(range(m),k) 86 | mu = data[index] #随机选择初始均值向量 87 | 88 | 89 | while True: 90 | 91 | C = defaultdict(list) 92 | 93 | for j in range(0,m): 94 | dij = [euclidean_distance(data[j],mu[i]) for i in range(k)] 95 | lambda_j = np.argmin(dij) #选择最小的值得下标 96 | 97 | C[lambda_j].append(data[j].tolist()) 98 | 99 | new_mu = [np.mean(C[i],axis=0).tolist() for i in range(k)] 100 | 101 | if (euclidean_distance(np.array(new_mu),np.array(mu))>1e-9): 102 | mu = new_mu 103 | else: 104 | break 105 | 106 | return C,mu 107 | 108 | 109 | watermelon = np.array([[ 0.697 ,0.46 ], 110 | [ 0.774 ,0.376], 111 | [ 0.634 ,0.264], 112 | [ 0.608 ,0.318], 113 | [ 0.556 ,0.215], 114 | [ 0.403 ,0.237], 115 | [ 0.481 ,0.149], 116 | [ 0.437 ,0.211], 117 | [ 0.666 ,0.091], 118 | [ 0.243 ,0.267], 119 | [ 0.245 ,0.057], 120 | [ 0.343 ,0.099], 121 | [ 0.639 ,0.161], 122 | [ 0.657 ,0.198], 123 | [ 0.36 ,0.37 ], 124 | [ 0.593 ,0.042], 125 | [ 0.719 ,0.103], 126 | [ 0.359 ,0.188], 127 | [ 0.339 ,0.241], 128 | [ 0.282 ,0.257], 129 | [ 0.748 ,0.232], 130 | [ 0.714 ,0.346], 131 | [ 0.483 ,0.312], 132 | [ 0.478 ,0.437], 133 | [ 0.525 ,0.369], 134 | [ 0.751 ,0.489], 135 | [ 0.532 ,0.472], 136 | [ 0.473 ,0.376], 137 | [ 0.725 ,0.445], 138 | [ 0.446 ,0.459]]) 139 | 140 | 141 | k = 2 142 | res,mu = kmeans(watermelon,k) 143 | print(res) 144 | print('新的中心:',mu) 145 | 146 | 147 | class GaussianMixture: 148 | "Model mixture of two univariate Gaussians and their EM estimation" 149 | 150 | def __init__(self, data, mu_min=min(data), mu_max=max(data), sigma_min=.1, sigma_max=1, mix=.5): 151 | self.data = data 152 | # init with multiple gaussians 153 | self.one = Gaussian(uniform(mu_min, mu_max), 154 | uniform(sigma_min, sigma_max)) 155 | self.two = Gaussian(uniform(mu_min, mu_max), 156 | uniform(sigma_min, sigma_max)) 157 | 158 | # as well as how much to mix them 159 | self.mix = mix 160 | self.loglike = 0. # = log(p = 1) 161 | 162 | def Estep(self): 163 | "Perform an E(stimation)-step, freshening up self.loglike in the process" 164 | # compute weights 165 | self.loglike = 0. # = log(p = 1) 166 | for datum in self.data: 167 | # unnormalized weights 168 | wp1 = self.one.pdf(datum) * self.mix 169 | wp2 = self.two.pdf(datum) * (1. 
- self.mix) 170 | # compute denominator 171 | den = wp1 + wp2 172 | # normalize 173 | wp1 /= den 174 | wp2 /= den 175 | # add into loglike 176 | self.loglike += log(wp1 + wp2) 177 | # yield weight tuple 178 | yield (wp1, wp2) 179 | 180 | def Mstep(self, weights): 181 | "Perform an M(aximization)-step" 182 | # compute denominators 183 | (left, rigt) = zip(*weights) 184 | one_den = sum(left) 185 | two_den = sum(rigt) 186 | # compute new means 187 | self.one.mu = sum(w * d / one_den for (w, d) in zip(left, data)) 188 | self.two.mu = sum(w * d / two_den for (w, d) in zip(rigt, data)) 189 | # compute new sigmas 190 | self.one.sigma = sqrt(sum(w * ((d - self.one.mu) ** 2) 191 | for (w, d) in zip(left, data)) / one_den) 192 | self.two.sigma = sqrt(sum(w * ((d - self.two.mu) ** 2) 193 | for (w, d) in zip(rigt, data)) / two_den) 194 | # compute new mix 195 | self.mix = one_den / len(data) 196 | 197 | def iterate(self, N=1, verbose=False): 198 | "Perform N iterations, then compute log-likelihood" 199 | 200 | def pdf(self, x): 201 | return (self.mix) * self.one.pdf(x) + (1 - self.mix) * self.two.pdf(x) 202 | 203 | def __repr__(self): 204 | return 'GaussianMixture({0}, {1}, mix={2.03})'.format(self.one, 205 | self.two, 206 | self.mix) 207 | 208 | def __str__(self): 209 | return 'Mixture: {0}, {1}, mix={2:.03})'.format(self.one, 210 | self.two, 211 | self.mix) -------------------------------------------------------------------------------- /Code/11-1马尔可夫.py: -------------------------------------------------------------------------------- 1 | from math import exp 2 | 3 | import numpy as np 4 | from hmmlearn import hmm 5 | 6 | status = ['盒子1', '盒子2', '盒子3'] 7 | obs = ['白球', '黑球'] 8 | n_status = len(status) 9 | m_obs = len(obs) 10 | start_probability = np.array([0.2, 0.5, 0.3]) 11 | transition_probability = np.array([ 12 | [0.5, 0.4, 0.1], #盒子1到1,1到2,1到3的概率 13 | [0.2, 0.2, 0.6], 14 | [0.2, 0.5, 0.3] 15 | ]) 16 | emission_probalitity = np.array([ 17 | [0.4, 0.6], 18 | [0.8, 0.2], 19 | [0.5, 0.5] 20 | ]) 21 | 22 | model = hmm.MultinomialHMM(n_components=n_status) 23 | model.startprob_ = start_probability 24 | model.transmat_ = transition_probability 25 | model.emissionprob_ = emission_probalitity 26 | 27 | # 预测问题 28 | seen=np.array([0,1,0]) #白球,黑球,白球 29 | 30 | # 观测序列的概率计算问题 31 | # score函数返回的是以自然对数为底的对数概率值 32 | # ln0.13022≈−2.0385 33 | print(exp(model.score(seen.reshape(-1,1)))) -------------------------------------------------------------------------------- /Code/11-2马尔可夫训练模型.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import hmmlearn.hmm as hmm 3 | 4 | states = ['盒子1', '盒子2', '盒子3'] 5 | obs = ['白球', '黑球'] 6 | n_states = len(states) 7 | m_obs = len(obs) 8 | 9 | model2 = hmm.MultinomialHMM(n_components=n_states, n_iter=20, tol=0.001) 10 | X2 = np.array([ 11 | [0, 1, 0, 0, 1], 12 | [0, 0, 0, 1, 1], 13 | [1, 1, 0, 1, 0], 14 | [0, 1, 0, 1, 1], 15 | [0, 0, 0, 1, 0] 16 | ]) 17 | model2.fit(X2) 18 | print("输出根据数据训练出来的π") 19 | print(model2.startprob_) 20 | print("输出根据数据训练出来的A") 21 | print(model2.transmat_) 22 | print("输出根据数据训练出来的B") 23 | print(model2.emissionprob_) 24 | #从观测的结果反过去求盒子和球 -------------------------------------------------------------------------------- /Code/11-3马尔可夫解码.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hmmlearn import hmm 3 | status = ['盒子1', '盒子2', '盒子3'] 4 | obs = ['白球', '黑球'] 5 | n_status = len(status) 6 | m_obs = len(obs) 7 | start_probability = 
np.array([0.2, 0.5, 0.3]) 8 | transition_probability = np.array([ 9 | [0.5, 0.4, 0.1], 10 | [0.2, 0.2, 0.6], 11 | [0.2, 0.5, 0.3] 12 | ]) 13 | emission_probalitity = np.array([ 14 | [0.4, 0.6], 15 | [0.8, 0.2], 16 | [0.5, 0.5] 17 | ]) 18 | 19 | model = hmm.MultinomialHMM(n_components=n_status) 20 | model.startprob_ = start_probability 21 | model.transmat_ = transition_probability 22 | model.emissionprob_ = emission_probalitity 23 | 24 | se = np.array([[0, 1, 0, 0, 1]]).T 25 | logprob, box_index = model.decode(se, algorithm='viterbi') 26 | print("颜色:", end="") 27 | print(" ".join(map(lambda t: obs[t], [0, 1, 0, 0, 1]))) 28 | print("盒子:", end="") 29 | print(" ".join(map(lambda t: status[t], box_index))) 30 | print("概率值:", end="") 31 | print(np.exp(logprob)) # 这个是因为在hmmlearn底层将概率进行了对数化,防止出现乘积为0的情况 -------------------------------------------------------------------------------- /Code/11-4马尔可夫(维特比最短路径).py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hmmlearn import hmm 3 | startprob = np.array([0.6, 0.3, 0.1, 0.0]) 4 | # The transition matrix, note that there are no transitions possible 5 | # between component 1 and 3 6 | transmat = np.array([[0.7, 0.2, 0.0, 0.1], 7 | [0.3, 0.5, 0.2, 0.0], 8 | [0.0, 0.3, 0.5, 0.2], 9 | [0.2, 0.0, 0.2, 0.6]]) 10 | # The means of each component 11 | means = np.array([[0.0, 0.0], 12 | [0.0, 11.0], 13 | [9.0, 10.0], 14 | [11.0, -1.0]]) 15 | # The covariance of each component 16 | covars = .5 * np.tile(np.identity(2), (4, 1, 1)) 17 | 18 | # Build an HMM instance and set parameters 19 | model3 = hmm.GaussianHMM(n_components=4, covariance_type="full") 20 | 21 | # Instead of fitting it from the data, we directly set the estimated 22 | # parameters, the means and covariance of the components 23 | model3.startprob_ = startprob 24 | model3.transmat_ = transmat 25 | model3.means_ = means 26 | model3.covars_ = covars 27 | 28 | 29 | seen = np.array([[1.1,2.0],[-1,2.0],[3,7]]) 30 | logprob, state = model3.decode(seen, algorithm="viterbi") 31 | print(logprob,state) 32 | 33 | 34 | -------------------------------------------------------------------------------- /Code/11-5马尔可夫算法(对应课本P213习题).py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class HiddenMarkov: 4 | def forward(self, Q, V, A, B, O, PI): # 使用前向算法 5 | N = len(Q) # 状态序列的大小 6 | M = len(O) # 观测序列的大小 7 | alphas = np.zeros((N, M)) # alpha值 8 | T = M # 有几个时刻,有几个观测序列,就有几个时刻 9 | for t in range(T): # 遍历每一时刻,算出alpha值 10 | indexOfO = V.index(O[t]) # 找出序列对应的索引 11 | for i in range(N): 12 | if t == 0: # 计算初值 13 | alphas[i][t] = PI[t][i] * B[i][indexOfO] # P176(10.15) 14 | print('alpha1(%d)=p%db%db(o1)=%f' % (i, i, i, alphas[i][t])) 15 | else: 16 | alphas[i][t] = np.dot([alpha[t - 1] for alpha in alphas], [a[i] for a in A]) * B[i][ 17 | indexOfO] # 对应P176(10.16) 18 | print('alpha%d(%d)=[sigma alpha%d(i)ai%d]b%d(o%d)=%f' % (t, i, t - 1, i, i, t, alphas[i][t])) 19 | # print(alphas) 20 | P = np.sum([alpha[M - 1] for alpha in alphas]) # P176(10.17) 21 | # alpha11 = pi[0][0] * B[0][0] #代表a1(1) 22 | # alpha12 = pi[0][1] * B[1][0] #代表a1(2) 23 | # alpha13 = pi[0][2] * B[2][0] #代表a1(3) 24 | 25 | def backward(self, Q, V, A, B, O, PI): # 后向算法 26 | N = len(Q) # 状态序列的大小 27 | M = len(O) # 观测序列的大小 28 | betas = np.ones((N, M)) # beta 29 | for i in range(N): 30 | print('beta%d(%d)=1' % (M, i)) 31 | for t in range(M - 2, -1, -1): 32 | indexOfO = V.index(O[t + 1]) # 找出序列对应的索引 33 | for i in range(N): 34 | betas[i][t] = 
np.dot(np.multiply(A[i], [b[indexOfO] for b in B]), [beta[t + 1] for beta in betas]) 35 | realT = t + 1 36 | realI = i + 1 37 | print('beta%d(%d)=[sigma a%djbj(o%d)]beta%d(j)=(' % (realT, realI, realI, realT + 1, realT + 1), 38 | end='') 39 | for j in range(N): 40 | print("%.2f*%.2f*%.2f+" % (A[i][j], B[j][indexOfO], betas[j][t + 1]), end='') 41 | print("0)=%.3f" % betas[i][t]) 42 | # print(betas) 43 | indexOfO = V.index(O[0]) 44 | P = np.dot(np.multiply(PI, [b[indexOfO] for b in B]), [beta[0] for beta in betas]) 45 | print("P(O|lambda)=", end="") 46 | for i in range(N): 47 | print("%.1f*%.1f*%.5f+" % (PI[0][i], B[i][indexOfO], betas[i][0]), end="") 48 | print("0=%f" % P) 49 | 50 | def viterbi(self, Q, V, A, B, O, PI): 51 | N = len(Q) # 状态序列的大小 52 | M = len(O) # 观测序列的大小 53 | deltas = np.zeros((N, M)) 54 | psis = np.zeros((N, M)) 55 | I = np.zeros((1, M)) 56 | for t in range(M): 57 | realT = t+1 58 | indexOfO = V.index(O[t]) # 找出序列对应的索引 59 | for i in range(N): 60 | realI = i+1 61 | if t == 0: 62 | deltas[i][t] = PI[0][i] * B[i][indexOfO] 63 | psis[i][t] = 0 64 | print('delta1(%d)=pi%d * b%d(o1)=%.2f * %.2f=%.2f'%(realI, realI, realI, PI[0][i], B[i][indexOfO], deltas[i][t])) 65 | print('psis1(%d)=0' % (realI)) 66 | else: 67 | deltas[i][t] = np.max(np.multiply([delta[t-1] for delta in deltas], [a[i] for a in A])) * B[i][indexOfO] 68 | print('delta%d(%d)=max[delta%d(j)aj%d]b%d(o%d)=%.2f*%.2f=%.5f'%(realT, realI, realT-1, realI, realI, realT, np.max(np.multiply([delta[t-1] for delta in deltas], [a[i] for a in A])), B[i][indexOfO], deltas[i][t])) 69 | psis[i][t] = np.argmax(np.multiply([delta[t-1] for delta in deltas], [a[i] for a in A])) 70 | print('psis%d(%d)=argmax[delta%d(j)aj%d]=%d' % (realT, realI, realT-1, realI, psis[i][t])) 71 | print(deltas) 72 | print(psis) 73 | I[0][M-1] = np.argmax([delta[M-1] for delta in deltas]) 74 | print('i%d=argmax[deltaT(i)]=%d' % (M, I[0][M-1]+1)) 75 | for t in range(M-2, -1, -1): 76 | I[0][t] = psis[int(I[0][t+1])][t+1] 77 | print('i%d=psis%d(i%d)=%d' % (t+1, t+2, t+2, I[0][t]+1)) 78 | print(I) 79 | 80 | 81 | Q = [1, 2, 3] 82 | V = ['红', '白'] 83 | A = [[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]] 84 | B = [[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]] 85 | # O = ['红', '白', '红', '红', '白', '红', '白', '白'] 86 | O = ['红', '白', '红', '白'] 87 | PI = [[0.2, 0.4, 0.4]] 88 | 89 | 90 | HMM = HiddenMarkov() 91 | HMM.forward(Q, V, A, B, O, PI) 92 | HMM.backward(Q, V, A, B, O, PI) 93 | HMM.viterbi(Q, V, A, B, O, PI) 94 | 95 | -------------------------------------------------------------------------------- /Code/12-1SVD数据压缩.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.decomposition import TruncatedSVD 3 | from scipy.sparse import random as sparse_random 4 | from sklearn.random_projection import sparse_random_matrix 5 | X = sparse_random(100, 100, density=0.01, format='csr', 6 | random_state=42) 7 | svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42) 8 | svd.fit(X) 9 | 10 | print(svd.explained_variance_ratio_) 11 | 12 | print(svd.explained_variance_ratio_.sum()) 13 | print(svd.singular_values_) 14 | import numpy as np 15 | 16 | a = np.random.randn(9, 6) + 1j*np.random.randn(9, 6) 17 | b = np.random.randn(2, 7, 8, 3) + 1j*np.random.randn(2, 7, 8, 3) 18 | 19 | u, s, vh = np.linalg.svd(a, full_matrices=True) 20 | print(a.shape) 21 | print(u.shape, s.shape, vh.shape) 22 | 23 | from PIL import Image 24 | import matplotlib.image as mpimg 25 | 26 | 27 | I = 
mpimg.imread('data/F_test.jpeg') 28 | #Now, let's look at the size of this numpy array object img as well as plot it using imshow. 29 | print(I.shape) 30 | plt.axis('off') 31 | plt.imshow(I) 32 | 33 | def show_img(img): 34 | plt.figure(figsize = (10, 7.5)) 35 | plt.imshow(img, cmap = 'gray', vmin=0, vmax=255, aspect = 'auto') 36 | plt.axis('off') 37 | plt.show() 38 | 39 | U, S, V_T = np.linalg.svd(I) 40 | #U.shape, S.shape, V_T.shape 41 | 42 | 43 | 44 | I = I[:,:,1] 45 | print(I.shape) 46 | 47 | 48 | plt.figure(figsize = (9, 5)) 49 | plt.plot(np.arange(S.shape[0]), S) 50 | plt.yscale('log') 51 | plt.xlabel('Index of $\sigma$') 52 | plt.ylabel('log(value of $\sigma$)') 53 | plt.title('Singular values $\sigma_i$ vs its index') 54 | plt.show() 55 | plt.figure(figsize = (9, 5)) 56 | plt.plot(np.cumsum(S) / sum(S)) 57 | plt.xlabel('Index of $\sigma$') 58 | plt.ylabel('Value of $\sigma$') 59 | plt.title('Cumulative sum of $\sigma_i$ vs its index\n(Percent of explained variance)') 60 | plt.show() 61 | S_full = np.zeros((U.shape[0], V_T.shape[0])) 62 | 63 | #S_full.shape 64 | 65 | S_diag = np.diag(S) 66 | S_full[:S_diag.shape[0], :S_diag.shape[1]] = S_diag 67 | 68 | for i in [5, 10, 25, 50, 100, 200, U.shape[0]]: 69 | print(str(i) + '\n') 70 | show_img(U[:, :i].dot(S_full[:i, :i].dot(V_T[:i, :]))) 71 | print('-' * 100 + '\n') 72 | 73 | 74 | -------------------------------------------------------------------------------- /Code/12-2PCA数据降维.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.datasets import load_iris 3 | import numpy as np 4 | iris = load_iris() 5 | #checking to see what datasets are available in iris 6 | print(iris.keys()) 7 | print(iris.data.shape) 8 | print(iris.feature_names) 9 | 10 | from sklearn.decomposition import PCA 11 | pca = PCA(2) 12 | print(pca) 13 | 14 | X, y = iris.data, iris.target 15 | X_proj = pca.fit_transform(X) 16 | print(X_proj.shape) 17 | 18 | plt.scatter(X_proj[:,0], X_proj[:,1],c=y) 19 | plt.show() 20 | 21 | from sklearn.datasets import load_digits 22 | digits = load_digits() 23 | print(digits.keys()) 24 | 25 | print(digits.data.shape) 26 | 27 | print(digits.images.shape) 28 | 29 | X,y = digits.data, digits.target 30 | pca_digits=PCA(0.95) 31 | X_proj = pca_digits.fit_transform(X) 32 | print(X.shape, X_proj.shape) 33 | 34 | 35 | pca_digits=PCA(2) 36 | X_proj = pca_digits.fit_transform(X) 37 | print(np.sum(pca_digits.explained_variance_ratio_)) 38 | 39 | 40 | print(X_proj.shape) 41 | 42 | 43 | plt.scatter(X_proj[:,0], X_proj[:,1], c=y) 44 | plt.colorbar() 45 | plt.show() 46 | 47 | pca_digits = PCA(64).fit(X) 48 | plt.semilogx(np.cumsum(pca_digits.explained_variance_ratio_)) 49 | plt.xlabel('Number of Components') 50 | plt.ylabel('Variance retained') 51 | plt.ylim(0,1) 52 | plt.show() 53 | 54 | from PIL import Image 55 | 56 | im1 = Image.open('data/F_test.jpeg') 57 | im1.save('data/F_test.png') 58 | 59 | import matplotlib.image as mpimg 60 | img = mpimg.imread('data/F_test.png') 61 | #Now, let's look at the size of this numpy array object img as well as plot it using imshow. 
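# Note: mpimg.imread on a PNG returns float pixel values in [0, 1]; the later reshape to
# (800, 3600) assumes an 800x1200 image with 3 colour channels, flattening each row's RGB
# values into a single 3600-dimensional sample before PCA.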
62 | print(img.shape) 63 | plt.axis('off') 64 | plt.imshow(img) 65 | 66 | 67 | img_r = np.reshape(img, (800, 3600)) 68 | print(img_r.shape) 69 | 70 | 71 | ipca = PCA(64).fit(img_r) 72 | img_c = ipca.transform(img_r) 73 | print(img_c.shape) 74 | print(np.sum(ipca.explained_variance_ratio_)) 75 | 76 | 77 | temp = ipca.inverse_transform(img_c) 78 | print(temp.shape) 79 | 80 | 81 | temp = np.reshape(temp, (800,1200,3)) 82 | print(temp.shape) 83 | 84 | 85 | plt.axis('off') 86 | plt.imshow(temp) 87 | -------------------------------------------------------------------------------- /Code/13-1文本特征处理(泰坦尼克).py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | from sklearn.preprocessing import OneHotEncoder 4 | 5 | train = pd.read_csv('onehot/train.csv') 6 | train.head() 7 | 8 | 9 | train.info() 10 | 11 | 12 | data = train 13 | data['Died']= 1 - data['Survived'] 14 | plt.show(data.groupby('Sex').agg('sum')[['Survived','Died']].plot(kind='bar',stacked=True)) 15 | 16 | 17 | encoder = OneHotEncoder(sparse=False) 18 | En_ec = encoder.fit_transform(train[['Sex']]) 19 | En_ec = pd.DataFrame(En_ec) 20 | train_new = pd.concat([train,En_ec],axis=1) 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /Code/13-2文本特征处理-词带.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | corpus = [ 3 | 'This is the first document.', 4 | 'This document is the second document.', 5 | 'And this is the third one.', 6 | 'Is this the first document?', 7 | ] 8 | 9 | vectorizer = CountVectorizer() 10 | X = vectorizer.fit_transform(corpus) 11 | print(vectorizer.get_feature_names()) 12 | 13 | print(X.toarray()) 14 | 15 | 16 | vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2)) 17 | X2 = vectorizer2.fit_transform(corpus) 18 | print(vectorizer2.get_feature_names()) 19 | 20 | print(X2.toarray()) 21 | -------------------------------------------------------------------------------- /Code/13-3文本特征处理(泰坦尼克)-TFIDF.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | corpus = ['this is the first document', 3 | 'this document is the second document', 4 | 'and this is the third one', 5 | 'is this the first document'] 6 | 7 | def display_features(features,feature_names): 8 | df = pd.DataFrame(data=features, columns = feature_names) 9 | print(df) 10 | 11 | 12 | 13 | 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | 17 | def bow_extractor(corpus, ngram_range=(1, 1)): 18 | vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range) 19 | features = vectorizer.fit_transform(corpus) 20 | return vectorizer, features 21 | 22 | 23 | 24 | bow_vectorizer, bow_features = bow_extractor(corpus) 25 | print(bow_features.todense()) 26 | 27 | feature_names = bow_vectorizer.get_feature_names() 28 | 29 | print(feature_names) 30 | 31 | features = bow_features.todense() 32 | display_features(features, feature_names) 33 | 34 | 35 | from sklearn.feature_extraction.text import TfidfTransformer 36 | 37 | 38 | def tfidf_transformer(bow_matrix): 39 | transformer = TfidfTransformer(norm='l2', 40 | smooth_idf=True, 41 | use_idf=True) 42 | tfidf_matrix = transformer.fit_transform(bow_matrix) 43 | return transformer, tfidf_matrix 44 | 45 | 46 | 47 | import numpy as np 48 | 49 | feature_names = bow_vectorizer.get_feature_names() 
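# Note: with the settings used in tfidf_transformer below (smooth_idf=True, norm='l2'),
# each raw count is reweighted as tf * (ln((1 + n_docs) / (1 + df(term))) + 1) and every
# document vector is then L2-normalised, so terms occurring in all four documents
# ("is", "the", "this") receive the minimum idf of 1.0.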
50 | tfidf_trans, tdidf_features = tfidf_transformer(bow_features) 51 | 52 | features = np.round(tdidf_features.todense(), 2) 53 | display_features(features, feature_names) 54 | 55 | -------------------------------------------------------------------------------- /Code/13-4文本特征处理-单词二维化.py: -------------------------------------------------------------------------------- 1 | from gensim.models import Word2Vec 2 | # define training data 3 | sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'], 4 | ['this', 'is', 'the', 'second', 'sentence'], 5 | ['yet', 'another', 'sentence'], 6 | ['one', 'more', 'sentence'], 7 | ['and', 'the', 'final', 'sentence']] 8 | # train model 9 | model = Word2Vec(sentences, min_count=1) 10 | # summarize the loaded model 11 | print(model) 12 | # summarize vocabulary 13 | words = list(model.wv.vocab) 14 | print(words) 15 | # access vector for one word 16 | print(model['sentence']) 17 | 18 | 19 | from sklearn.decomposition import PCA 20 | 21 | 22 | 23 | from matplotlib import pyplot 24 | 25 | 26 | 27 | X = model[model.wv.vocab] 28 | pca = PCA(n_components=2) 29 | result = pca.fit_transform(X) 30 | # create a scatter plot of the projection 31 | pyplot.scatter(result[:, 0], result[:, 1]) 32 | words = list(model.wv.vocab) 33 | for i, word in enumerate(words): 34 | pyplot.annotate(word, xy=(result[i, 0], result[i, 1])) 35 | pyplot.show() 36 | 37 | -------------------------------------------------------------------------------- /Code/13-5文本特征处理-机器语言学习.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | from tqdm import tqdm 5 | tqdm.pandas(desc="progress-bar") 6 | from gensim.models import Doc2Vec 7 | from sklearn import utils 8 | from sklearn.model_selection import train_test_split 9 | import gensim 10 | from sklearn.linear_model import LogisticRegression 11 | from gensim.models.doc2vec import TaggedDocument 12 | import re 13 | import seaborn as sns 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | df = pd.read_csv('onehot/Consumer_Complaints_sim.csv') 18 | df = df[['Sub-issue','Product']] 19 | df = df[pd.notnull(df['Sub-issue'])] 20 | df.head(10) 21 | 22 | 23 | 24 | df.shape 25 | 26 | 27 | cnt_pro = df['Product'].value_counts() 28 | 29 | plt.figure(figsize=(12,4)) 30 | sns.barplot(cnt_pro.index, cnt_pro.values, alpha=0.8) 31 | plt.ylabel('Number of Occurrences', fontsize=12) 32 | plt.xlabel('Product', fontsize=12) 33 | plt.xticks(rotation=90) 34 | plt.show() 35 | 36 | 37 | df.rename(columns = {'Consumer complaint narrative':'narrative'}, inplace = True) 38 | df.rename(columns = {'Sub-issue':'narrative'}, inplace = True) 39 | 40 | 41 | from gensim.models import doc2vec 42 | 43 | def label_sentences(corpus, label_type): 44 | """ 45 | Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it. 46 | We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is 47 | a dummy index of the complaint narrative. 
48 | """ 49 | labeled = [] 50 | for i, v in enumerate(corpus): 51 | label = label_type + '_' + str(i) 52 | labeled.append(doc2vec.TaggedDocument(v.split(), [label])) 53 | return labeled 54 | 55 | 56 | X_train, X_test, y_train, y_test = train_test_split(df.narrative, df.Product, random_state=0, test_size=0.3) 57 | X_train = label_sentences(X_train, 'Train') 58 | X_test = label_sentences(X_test, 'Test') 59 | all_data = X_train + X_test 60 | 61 | 62 | model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065) 63 | model_dbow.build_vocab([x for x in tqdm(all_data)]) 64 | 65 | 66 | 67 | for epoch in range(30): 68 | model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1) 69 | model_dbow.alpha -= 0.002 70 | model_dbow.min_alpha = model_dbow.alpha 71 | 72 | 73 | def get_vectors(model, corpus_size, vectors_size, vectors_type): 74 | """ 75 | Get vectors from trained doc2vec model 76 | :param doc2vec_model: Trained Doc2Vec model 77 | :param corpus_size: Size of the data 78 | :param vectors_size: Size of the embedding vectors 79 | :param vectors_type: Training or Testing vectors 80 | :return: list of vectors 81 | """ 82 | vectors = np.zeros((corpus_size, vectors_size)) 83 | for i in range(0, corpus_size): 84 | prefix = vectors_type + '_' + str(i) 85 | vectors[i] = model.docvecs[prefix] 86 | return vectors 87 | 88 | 89 | train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train') 90 | test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test') 91 | print(test_vectors_dbow) 92 | print(train_vectors_dbow.shape) 93 | print(test_vectors_dbow.shape) 94 | 95 | 96 | from sklearn.linear_model import LogisticRegression 97 | 98 | logreg = LogisticRegression(multi_class='multinomial', solver = 'lbfgs') 99 | logreg.fit(train_vectors_dbow, y_train) 100 | 101 | 102 | print("逻辑斯蒂测准确率="+logreg.score(test_vectors_dbow, y_test)) 103 | 104 | -------------------------------------------------------------------------------- /Code/14-1交叉验证.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import train_test_split 3 | from sklearn import datasets 4 | from sklearn import svm 5 | 6 | iris = datasets.load_iris() 7 | print(iris.data.shape, iris.target.shape) 8 | 9 | X_train, X_test, y_train, y_test = train_test_split( 10 | iris.data, iris.target, test_size=0.4, random_state=0) 11 | 12 | 13 | 14 | clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train) 15 | clf.score(X_test, y_test) 16 | 17 | from sklearn.model_selection import cross_val_score 18 | clf = svm.SVC(kernel='linear', C=1) 19 | scores = cross_val_score(clf, iris.data, iris.target, cv=5 ) #交叉测试了5次 20 | print(scores) 21 | print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) 22 | 23 | 24 | from sklearn.model_selection import ShuffleSplit 25 | n_samples = iris.data.shape[0] 26 | cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0) 27 | scores = cross_val_score(clf, iris.data, iris.target, cv=cv) 28 | print(scores) 29 | 30 | from sklearn.model_selection import cross_validate 31 | from sklearn.metrics import recall_score 32 | from sklearn.metrics.scorer import make_scorer 33 | scoring = {'prec_macro': 'precision_macro', 34 | 'rec_micro': make_scorer(recall_score, average='macro')} 35 | scores = cross_validate(clf, iris.data, iris.target, scoring=scoring, 36 | cv=5, return_train_score=True) 37 | print(scores) 
-------------------------------------------------------------------------------- /Code/14-2Pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | from sklearn import datasets 6 | from sklearn.decomposition import PCA 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.pipeline import Pipeline 9 | from sklearn.model_selection import GridSearchCV 10 | 11 | 12 | # Define a pipeline to search for the best combination of PCA truncation 13 | # and classifier regularization. 14 | pca = PCA() 15 | # set the tolerance to a large value to make the example faster 16 | logistic = LogisticRegression(max_iter=10000, tol=0.1) 17 | pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)]) 18 | 19 | X_digits, y_digits = datasets.load_digits(return_X_y=True) 20 | 21 | # Parameters of pipelines can be set using ‘__’ separated parameter names: 22 | param_grid = { 23 | 'pca__n_components': [5, 15, 30, 45, 64], 24 | 'logistic__C': np.logspace(-4, 4, 4), 25 | } 26 | search = GridSearchCV(pipe, param_grid, n_jobs=-1) 27 | search.fit(X_digits, y_digits) 28 | print("Best parameter (CV score=%0.3f):" % search.best_score_) 29 | print(search.best_params_) 30 | 31 | # Plot the PCA spectrum 32 | pca.fit(X_digits) 33 | 34 | fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(6, 6)) 35 | ax0.plot(np.arange(1, pca.n_components_ + 1), 36 | pca.explained_variance_ratio_, '+', linewidth=2) 37 | ax0.set_ylabel('PCA explained variance ratio') 38 | 39 | ax0.axvline(search.best_estimator_.named_steps['pca'].n_components, 40 | linestyle=':', label='n_components chosen') 41 | ax0.legend(prop=dict(size=12)) 42 | 43 | # For each number of components, find the best classifier results 44 | results = pd.DataFrame(search.cv_results_) 45 | components_col = 'param_pca__n_components' 46 | best_clfs = results.groupby(components_col).apply( 47 | lambda g: g.nlargest(1, 'mean_test_score')) 48 | 49 | best_clfs.plot(x=components_col, y='mean_test_score', yerr='std_test_score', 50 | legend=False, ax=ax1) 51 | ax1.set_ylabel('Classification accuracy (val)') 52 | ax1.set_xlabel('n_components') 53 | 54 | plt.xlim(-1, 70) 55 | 56 | plt.tight_layout() 57 | plt.show() 58 | -------------------------------------------------------------------------------- /Code/2-1(重要)二分类模型(感知器学习的原始算法).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | #%matplotlib inline 6 | iris = load_iris() 7 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 8 | df['label'] = iris.target 9 | print(df.head(10)) 10 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 11 | print(df.label.value_counts()) 12 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 13 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 14 | plt.xlabel('sepal length') 15 | plt.ylabel('sepal width') 16 | plt.legend() 17 | plt.show() 18 | data = np.array(df.iloc[:100, [0, 1, -1]]) 19 | print(data[:10,:]) 20 | 21 | 22 | X, y = data[:,:-1], data[:,-1] 23 | X[:10,:] 24 | 25 | 26 | y = np.array([1 if i == 1 else -1 for i in y ]) 27 | 28 | 29 | class Perceptron_Model: 30 | def __init__(self): 31 | self.w = np.ones(len(data[0]) - 1, dtype=np.float32) 32 | print(self.w) 33 | self.b = 0 34 | self.l_rate = 0.1 
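# w and b are the separating-hyperplane parameters and l_rate is the step size eta;
# fit() below runs the primal perceptron SGD update w <- w + eta*y*x, b <- b + eta*y
# whenever a training sample is misclassified, i.e. y * (w.x + b) <= 0.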
35 | # self.data = data 36 | 37 | def sign(self, x, w, b): 38 | y = np.dot(x, w) + b 39 | return y 40 | 41 | # 随机梯度下降法 42 | def fit(self, X_train, y_train): 43 | is_wrong = False 44 | while not is_wrong: 45 | wrong_count = 0 46 | for d in range(len(X_train)): 47 | X = X_train[d] 48 | y = y_train[d] 49 | if y * self.sign(X, self.w, self.b) <= 0: 50 | self.w = self.w + self.l_rate * np.dot(y, X) 51 | self.b = self.b + self.l_rate * y 52 | wrong_count += 1 53 | if wrong_count == 0: 54 | is_wrong = True 55 | return 'Perceptron Model!' 56 | 57 | def score(self): 58 | pass 59 | 60 | perceptron = Perceptron_Model() 61 | perceptron.fit(X, y) 62 | x_points = np.linspace(4, 7, 10) 63 | y_ = -(perceptron.w[0] * x_points + perceptron.b) / perceptron.w[1] 64 | plt.plot(x_points, y_) 65 | 66 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='0') 67 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 68 | plt.xlabel('sepal length') 69 | plt.ylabel('sepal width') 70 | plt.legend() 71 | plt.show() -------------------------------------------------------------------------------- /Code/2-2二分类模型(对偶算法).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | #%matplotlib inline 6 | iris = load_iris() 7 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 8 | df['label'] = iris.target 9 | print(df.head(10)) 10 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 11 | print(df.label.value_counts()) 12 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 13 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 14 | plt.xlabel('sepal length') 15 | plt.ylabel('sepal width') 16 | plt.legend() 17 | data = np.array(df.iloc[:100, [0, 1, -1]]) 18 | print(data[:10,:]) 19 | 20 | X, y = data[:,:-1], data[:,-1] 21 | X[:10,:] 22 | 23 | clf = Perceptron(tol=1e-3, random_state=0, max_iter=1000) 24 | clf.fit(X, y) 25 | print(clf.coef_) 26 | print(clf.intercept_) 27 | x_ponits = np.arange(4, 8) 28 | y_ = -(clf.coef_[0][0]*x_ponits + clf.intercept_)/clf.coef_[0][1] 29 | plt.plot(x_ponits, y_) 30 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='0') 31 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 32 | plt.xlabel('sepal length') 33 | plt.ylabel('sepal width') 34 | plt.legend() 35 | plt.show() 36 | # class PLA_dual: 37 | # def __init__(self, max_iter=1000): 38 | # self.b = 0 39 | # self.lr = 0.1 40 | # self.max_iter = max_iter 41 | # self.iter = 0 42 | # 43 | # def cal_w(self, X): 44 | # w = 0 45 | # for i in range(len(self.alpha)): 46 | # w += self.alpha[i] * y[i] * X[i] 47 | # return w 48 | # 49 | # def gram_matrix(self, X): 50 | # return np.dot(X, X.T) 51 | # 52 | # def fit(self, X, y): 53 | # N, M = X.shape 54 | # self.alpha = np.zeros(N) 55 | # gram = self.gram_matrix(X) 56 | # for n in range(self.max_iter): 57 | # self.iter = n 58 | # wrong_items = 0 59 | # for i in range(N): 60 | # tmp = 0 61 | # for j in range(N): 62 | # tmp += self.alpha[j] * y[j] * gram[i, j] 63 | # tmp += self.b 64 | # if y[i] * tmp <= 0: 65 | # self.alpha[i] += self.lr 66 | # self.b += self.lr * y[i] 67 | # wrong_items += 1 68 | # if wrong_items == 0: 69 | # self.w = self.cal_w(X) 70 | # print("finished at iters: {}, w: {}, b: {}".format(self.iter, self.w, self.b)) 71 | # return 72 | # self.w = self.cal_w(X) 73 | # 
print("finished for reaching the max_iter: {}, w: {}, b: {}".format(self.max_iter, self.w, self.b)) 74 | # return 75 | # 76 | # perceptron3 = PLA_dual() 77 | # perceptron3.fit(X, y) 78 | # def plot(model, tilte): 79 | # x_points = np.linspace(4, 7, 10) 80 | # y_ = -(model.w[0]*x_points + model.b)/model.w[1] 81 | # plt.plot(x_points, y_) 82 | # print(y_) 83 | # 84 | # plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='-1') 85 | # plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 86 | # plt.xlabel('sepal length') 87 | # plt.ylabel('sepal width') 88 | # plt.title(tilte) 89 | # plt.legend() 90 | # plt.show() 91 | # plot(perceptron3, 'PLA_dual') -------------------------------------------------------------------------------- /Code/2-3二分类模型(sklearn包里的分类算法).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | from sklearn.linear_model import Perceptron 6 | #%matplotlib inline 7 | iris = load_iris() 8 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 9 | df['label'] = iris.target 10 | print(df.head(10)) 11 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 12 | print(df.label.value_counts()) 13 | data = np.array(df.iloc[:100, [0, 1, -1]]) 14 | print(data[:10,:]) 15 | print() 16 | 17 | X, y = data[:,:-1], data[:,-1] 18 | X[:10,:] 19 | y = np.array([1 if i == 1 else -1 for i in y ]) 20 | #clf = Perceptron(fit_intercept=False, shuffle=False) 21 | clf = Perceptron(tol=1e-3, random_state=0, max_iter=1000) 22 | clf.fit(X, y) 23 | print(clf.coef_) 24 | print(clf.intercept_) 25 | x_ponits = np.arange(4, 8) 26 | y_ = -(clf.coef_[0][0]*x_ponits + clf.intercept_)/clf.coef_[0][1] 27 | plt.plot(x_ponits, y_) 28 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='0') 29 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 30 | plt.xlabel('sepal length') 31 | plt.ylabel('sepal width') 32 | plt.legend() 33 | plt.show() 34 | 35 | -------------------------------------------------------------------------------- /Code/2-4二分类的课后练习.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | Iteration=[] 6 | w_0=[-1.5] 7 | w_1=[0] 8 | w_2=[2] 9 | Training_Example=[] 10 | x1=[] 11 | x2=[] 12 | Class=[] 13 | s=[] 14 | Action=[] 15 | 16 | count=0 17 | 18 | 19 | 20 | data = np.array([[0,1,-1],[2, 0,-1],[1,1,1]]) 21 | X, y = data[:,:-1], data[:,-1] 22 | 23 | class Perceptron_Model: 24 | def __init__(self): 25 | self.w = np.array([0,2]) 26 | self.b = -1.5 27 | self.l_rate = 1.0 28 | 29 | # self.data = data 30 | 31 | 32 | def sign(self, x, w, b): 33 | 34 | y = np.dot(x, w) + b 35 | s.append(y) 36 | return y 37 | 38 | # 随机梯度下降法 39 | def fit(self, X_train, y_train): 40 | count=1 41 | is_wrong = False 42 | while not is_wrong: 43 | 44 | 45 | wrong_count = 0 46 | for d in range(len(X_train)): 47 | if d==0: 48 | Training_Example.append("a") 49 | elif d==1: 50 | Training_Example.append("b") 51 | else: 52 | Training_Example.append("c") 53 | X = X_train[d] 54 | x1.append(X[0]) 55 | x2.append(X[1]) 56 | y = y_train[d] 57 | if y<0: 58 | Class.append("-") 59 | else: 60 | Class.append("+") 61 | if y * self.sign(X, self.w, self.b) <= 0: 62 | Iteration.append(count) 63 | count = count + 1 64 | self.w = self.w + self.l_rate * np.dot(y, X) 65 | w_1.append(self.w[0]) 66 | 
w_2.append(self.w[1]) 67 | self.b = self.b + self.l_rate * y 68 | w_0.append(self.b) 69 | if(y>0):Action.append("Add") 70 | else:Action.append("Subtract") 71 | wrong_count += 1 72 | else: 73 | Iteration.append(count) 74 | count = count + 1 75 | w_1.append(self.w[0]) 76 | w_2.append(self.w[1]) 77 | w_0.append(self.b) 78 | Action.append("None") 79 | 80 | if wrong_count == 0: 81 | is_wrong = True 82 | return 'Perceptron Model!' 83 | 84 | def score(self): 85 | pass 86 | print() 87 | perceptron = Perceptron_Model() 88 | perceptron.fit(X, y) 89 | print(count) 90 | 91 | record = { 92 | 'Iteration':Iteration, 93 | 'w_0':w_0[0:12], 94 | 'w_1':w_1[0:12], 95 | 'w_2':w_2[0:12], 96 | 'Training_Example':Training_Example, 97 | 'x1':x1, 98 | 'x2':x2, 99 | 'Class':Class, 100 | 's=w_0+w_1x_1+w_2x_2':s, 101 | 'Action':Action 102 | } 103 | 104 | print(record) 105 | frame = pd.DataFrame(record) 106 | frame.to_csv(path_or_buf="tmp.csv",index=False) 107 | -------------------------------------------------------------------------------- /Code/3-1K近邻的距离图.py: -------------------------------------------------------------------------------- 1 | import math 2 | from itertools import combinations 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | r = 1 8 | 9 | linestyle = ['b-','k-','m-','r-','y-'] 10 | p_values = (0.25, 0.5, 1, 2, 4,100) 11 | 12 | for i,p in enumerate(p_values): 13 | x = np.arange(-r,r+1e-5,1/128.0) 14 | y = (r**p - (abs(x)**p))**(1.0/p) 15 | plt.plot(x,y,x,-y) 16 | 17 | ax = plt.gca() 18 | ax.set_aspect(1) 19 | plt.show() 20 | 21 | def L(x, y, p=2): 22 | # x1 = [1, 1], x2 = [5,1] 23 | if len(x) == len(y) and len(x) > 1: 24 | sum = 0 25 | for i in range(len(x)): 26 | sum += math.pow(abs(x[i] - y[i]), p) 27 | return math.pow(sum, 1/p) 28 | else: 29 | return 0 30 | 31 | x1 = [1, 1] 32 | x2 = [5, 1] 33 | x3 = [4, 4] 34 | 35 | def L(x, y, p=2): 36 | # x1 = [1, 1], x2 = [5,1] 37 | if len(x) == len(y) and len(x) > 1: 38 | sum = 0 39 | for i in range(len(x)): 40 | sum += math.pow(abs(x[i] - y[i]), p) 41 | return math.pow(sum, 1/p) 42 | else: 43 | return 0 44 | 45 | -------------------------------------------------------------------------------- /Code/3-2K近邻法距离加权与统一的对比.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from matplotlib.colors import ListedColormap 6 | from sklearn import neighbors, datasets 7 | 8 | 9 | # import some data to play with 10 | irisData = datasets.load_iris() 11 | irisData.data[0:5 ,:] 12 | 13 | 14 | X = irisData.data[:, :2] 15 | y = irisData.target 16 | X[:10 ,:] 17 | 18 | 19 | n_neighbors = 15 20 | 21 | 22 | step = .01 # step size in the mesh 23 | 24 | for weights in ['uniform', 'distance']: 25 | # we create an instance of Neighbours Classifier and fit the data. 26 | classifier = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) 27 | classifier.fit(X, y) 28 | 29 | print('KNN classifier accuracy - "%s" - %.3f' % (weights ,classifier.score(X ,y))) 30 | 31 | # Plot the decision boundary. For that, we will assign a color to each 32 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 
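# Note: with weights='uniform' each of the 15 neighbours gets an equal vote, while
# weights='distance' weights each neighbour's vote by the inverse of its distance,
# so nearby points dominate the prediction close to class boundaries.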
33 | x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 34 | y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 35 | x_grid, y_grid = np.meshgrid(np.arange(x_min, x_max, step = step), 36 | np.arange(y_min, y_max, step = step)) 37 | Z = classifier.predict(np.c_[x_grid.ravel(), y_grid.ravel()]) 38 | 39 | # Put the result into a color plot 40 | Z = Z.reshape(x_grid.shape) 41 | plt.figure() 42 | plt.pcolormesh(x_grid, y_grid, Z, cmap=ListedColormap(['lightblue', 'lightgreen', 'lightyellow']) ) 43 | 44 | # Plot also the training points 45 | plt.scatter(X[:, 0], X[:, 1], c=y, 46 | edgecolor='k', s=20) 47 | plt.xlim(x_grid.min(), x_grid.max()) 48 | plt.ylim(y_grid.min(), y_grid.max()) 49 | plt.title("KNN 3-Class Classification (k = %d, weights = '%s')" 50 | % (n_neighbors, weights)) 51 | 52 | 53 | plt.show() 54 | 55 | -------------------------------------------------------------------------------- /Code/3-2(1)K近邻法距离加权与统一的对比.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn import neighbors 5 | 6 | np.random.seed(0) 7 | X = np.sort(5 * np.random.rand(40, 1), axis=0) 8 | T = np.linspace(0, 5, 500)[:, np.newaxis] 9 | y = np.sin(X).ravel() 10 | 11 | # Add noise to targets 12 | y[::5] += 1 * (0.5 - np.random.rand(8)) 13 | 14 | plt.plot(X,y) 15 | 16 | 17 | n_neighbors = 5 18 | 19 | for i, weights in enumerate(['uniform', 'distance']): 20 | knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights) 21 | y_ = knn.fit(X, y).predict(T) 22 | 23 | plt.subplot(2, 1, i + 1) 24 | plt.scatter(X, y, c='k', label='data') 25 | plt.plot(T, y_, c='g', label='prediction') 26 | plt.axis('tight') 27 | plt.legend() 28 | plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors, 29 | weights)) 30 | 31 | plt.tight_layout() 32 | plt.show() 33 | 34 | -------------------------------------------------------------------------------- /Code/3-3KNN算法(原始和包).py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | #% matplotlib 6 | #inline 7 | 8 | from sklearn.datasets import load_iris 9 | from sklearn.model_selection import train_test_split 10 | 11 | from collections import Counter 12 | 13 | iris = load_iris() 14 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 15 | df['label'] = iris.target 16 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 17 | 18 | 19 | df.iloc[0:5] 20 | 21 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 22 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 23 | plt.xlabel('sepal length') 24 | plt.ylabel('sepal width') 25 | plt.legend() 26 | 27 | 28 | data = np.array(df.iloc[:100, [0, 1, -1]]) 29 | X, y = data[:, :-1], data[:, -1] 30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 31 | 32 | 33 | 34 | class KNN: 35 | def __init__(self, X_train, y_train, n_neighbors=3, p=2): 36 | """ 37 | parameter: n_neighbors 临近点个数 38 | parameter: p 距离度量 39 | """ 40 | self.n = n_neighbors 41 | self.p = p 42 | self.X_train = X_train 43 | self.y_train = y_train 44 | 45 | def predict(self, X): 46 | # 取出n个点 47 | knn_list = [] 48 | for i in range(self.n): 49 | dist = np.linalg.norm(X - self.X_train[i], ord=self.p) 50 | knn_list.append((dist, self.y_train[i])) 51 | 52 | for i in range(self.n, len(self.X_train)): 53 | max_index = 
knn_list.index(max(knn_list, key=lambda x: x[0])) 54 | dist = np.linalg.norm(X - self.X_train[i], ord=self.p) 55 | if knn_list[max_index][0] > dist: 56 | knn_list[max_index] = (dist, self.y_train[i]) 57 | 58 | # 统计 59 | knn = [k[-1] for k in knn_list] 60 | count_pairs = Counter(knn) 61 | max_count = sorted(count_pairs, key=lambda x: x)[-1] 62 | return max_count 63 | 64 | def score(self, X_test, y_test): 65 | right_count = 0 66 | n = 10 67 | for X, y in zip(X_test, y_test): 68 | label = self.predict(X) 69 | if label == y: 70 | right_count += 1 71 | return right_count / len(X_test) 72 | 73 | 74 | 75 | clf = KNN(X_train, y_train) 76 | 77 | clf.score(X_test, y_test) 78 | 79 | 80 | test_point = [6.0, 3.0] 81 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 82 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 83 | plt.plot(test_point[0], test_point[1], 'bo', label='test_point') 84 | plt.xlabel('sepal length') 85 | plt.ylabel('sepal width') 86 | plt.legend() 87 | plt.show() 88 | 89 | 90 | from sklearn.neighbors import KNeighborsClassifier 91 | 92 | clf_sk = KNeighborsClassifier() 93 | clf_sk.fit(X_train, y_train) 94 | 95 | 96 | print(clf_sk.score(X_test, y_test)) 97 | 98 | 99 | clf_sk.predict([[6.0,3.0]]) -------------------------------------------------------------------------------- /Code/3-4KNN(糖尿病).py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | #% matplotlib 6 | #inline 7 | 8 | from sklearn.datasets import load_iris 9 | from sklearn.model_selection import train_test_split 10 | 11 | from collections import Counter 12 | 13 | dia = pd.read_csv("diabetes.csv") 14 | df = pd.DataFrame(dia) 15 | 16 | print(df) 17 | 18 | 19 | data = np.array(df.iloc[:767, [0,1,2,3,4,6,-1]]) 20 | print(data) 21 | X, y = data[:, :-1], data[:, -1] 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 23 | 24 | 25 | 26 | from sklearn.neighbors import KNeighborsClassifier 27 | # 28 | clf_sk = KNeighborsClassifier() 29 | clf_sk.fit(X_train, y_train) 30 | # 31 | # 32 | print(clf_sk.score(X_test, y_test)) 33 | # 34 | # 35 | -------------------------------------------------------------------------------- /Code/3-5KNN(cifar-10).py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsClassifier 2 | import pickle 3 | import cv2 4 | 5 | 6 | def load(filename): 7 | 8 | with open(filename, 'rb') as fo: 9 | 10 | data = pickle.load(fo, encoding='latin1') 11 | 12 | return data 13 | #读取第一个训练集——data_batch_1: 14 | train = 'cifar-10-batches-py\data_batch_' 15 | test=r'cifar-10-batches-py\test_batch' #字符串前加r防止转义字符/t 16 | print(test) 17 | clf = KNeighborsClassifier("nn") 18 | 19 | for i in range(1,6): #从文件cifar-10-batches-py中读取data集1-5 20 | d=load(train+str(i)) 21 | X, y = d["data"], d["labels"] 22 | X_train, y_train = X, y 23 | clf.fit(X_train, y_train) 24 | print("数据集" + str(i) + "训练完毕") 25 | d=load(test)#从文件cifar-10-batches-py中读取test集 26 | X, y = d["data"], d["labels"] 27 | X_test, y_test = X, y 28 | 29 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/4-1原始贝叶斯.py: -------------------------------------------------------------------------------- 1 | class NaiveBayes: 2 | def __init__(self): 3 | self.model = None 4 | 5 | # 数学期望 6 | @staticmethod 7 | def mean(X): 8 | return sum(X) / 
float(len(X)) 9 | 10 | # 标准差(方差) 11 | def stdev(self, X): 12 | avg = self.mean(X) 13 | return math.sqrt(sum([pow(x-avg, 2) for x in X]) / float(len(X))) 14 | 15 | # 概率密度函数 16 | def gaussian_probability(self, x, mean, stdev): 17 | exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2)))) 18 | return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent 19 | 20 | # 处理X_train 21 | def summarize(self, train_data): 22 | summaries = [(self.mean(i), self.stdev(i)) for i in zip(*train_data)] 23 | return summaries 24 | 25 | # 分类别求出数学期望和标准差 26 | def fit(self, X, y): 27 | labels = list(set(y)) 28 | data = {label:[] for label in labels} 29 | for f, label in zip(X, y): 30 | data[label].append(f) 31 | self.model = {label: self.summarize(value) for label, value in data.items()} 32 | return 'GaussianNB train done!' 33 | 34 | # 计算概率 35 | def calculate_probabilities(self, input_data): 36 | # summaries:{0.0: [(5.0, 0.37),(3.42, 0.40)], 1.0: [(5.8, 0.449),(2.7, 0.27)]} 37 | # input_data:[1.1, 2.2] 38 | probabilities = {} 39 | for label, value in self.model.items(): 40 | probabilities[label] = 1 41 | for i in range(len(value)): 42 | mean, stdev = value[i] 43 | probabilities[label] *= self.gaussian_probability(input_data[i], mean, stdev) 44 | return probabilities 45 | 46 | # 类别 47 | def predict(self, X_test): 48 | # {0.0: 2.9680340789325763e-27, 1.0: 3.5749783019849535e-26} 49 | label = sorted(self.calculate_probabilities(X_test).items(), key=lambda x: x[-1])[-1][0] 50 | return label 51 | 52 | def score(self, X_test, y_test): 53 | right = 0 54 | for X, y in zip(X_test, y_test): 55 | label = self.predict(X) 56 | if label == y: 57 | right += 1 58 | 59 | return right / float(len(X_test)) 60 | 61 | import math 62 | import numpy as np 63 | import pandas as pd 64 | 65 | import matplotlib.pyplot as plt 66 | 67 | 68 | from sklearn.datasets import load_iris 69 | from sklearn.model_selection import train_test_split 70 | 71 | 72 | iris = load_iris() 73 | X = iris.data 74 | Y = iris.target 75 | 76 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 77 | df['label'] = iris.target 78 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 79 | data = np.array(df.iloc[:100, :]) 80 | # print(data) 81 | 82 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3) 83 | 84 | X_train[:10,:] 85 | X_test[0], y_test[0] 86 | 87 | model = NaiveBayes() 88 | 89 | 90 | model.fit(X_train, y_train) 91 | 92 | 93 | x_train=[4.4, 3.2, 1.3, 0.2] 94 | 95 | print(model.predict(x_train)) 96 | -------------------------------------------------------------------------------- /Code/4-2导包的高斯贝叶斯.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 4 | Y = np.array([1, 1, 1, 2, 2, 2]) 5 | 6 | from sklearn.naive_bayes import GaussianNB 7 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 8 | clf.fit(X, Y) 9 | print(clf.predict([[-0.8, -1]])) -------------------------------------------------------------------------------- /Code/4-3高斯伯努利多项式贝叶斯.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.naive_bayes import GaussianNB 3 | from sklearn.naive_bayes import MultinomialNB 4 | from sklearn.naive_bayes import BernoulliNB 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | from sklearn.datasets import load_iris 13 | from sklearn.model_selection import 
train_test_split 14 | iris = load_iris() 15 | X = iris.data 16 | Y = iris.target 17 | 18 | X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0) 19 | 20 | nb = GaussianNB() 21 | nb.fit(X_train, y_train) 22 | 23 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 24 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 25 | 26 | print("Naive Gausian bayes score (sklearn): " +str(nb.score(X_test, y_test))) 27 | 28 | 29 | nb = MultinomialNB() 30 | nb.fit(X_train, y_train) 31 | 32 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 33 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 34 | 35 | print("Naive Gausian bayes score (sklearn): " +str(nb.score(X_test, y_test))) 36 | 37 | 38 | nb = BernoulliNB() 39 | nb.fit(X_train, y_train) 40 | 41 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 42 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 43 | print("Naive Gausian bayes score (sklearn): " +str(nb.score(X_test, y_test))) 44 | 45 | min_x=min(np.min(X_train.ravel()),np.min(X_test.ravel()))-0.1 46 | max_x=max(np.max(X_train.ravel()),np.max(X_test.ravel()))+0.1 47 | binarizes=np.linspace(min_x,max_x,endpoint=True,num=100) 48 | 49 | train_scores=[] 50 | test_scores=[] 51 | 52 | for binarize in binarizes: 53 | cls=BernoulliNB(binarize=binarize) 54 | cls.fit(X_train,y_train) 55 | train_scores.append(cls.score(X_train,y_train)) 56 | test_scores.append(cls.score(X_test, y_test)) 57 | 58 | fig=plt.figure() 59 | ax=fig.add_subplot(1,1,1) 60 | ax.plot(binarizes,train_scores,label="Training Score") 61 | ax.plot(binarizes,test_scores,label="Testing Score") 62 | ax.set_xlabel("binarize") 63 | ax.set_ylabel("score") 64 | ax.set_ylim(0,1.0) 65 | ax.set_xlim(min_x-1,max_x+1) 66 | ax.set_title("BernoulliNB") 67 | ax.legend(loc="best") 68 | plt.show() 69 | 70 | # 这几个都是naive bayes的模型,区别主要在于特征的分布。 71 | # 72 | # 73 | # 74 | # 如果特征是数值的,最好是正态分布的数值的,那么用 75 | # sklearn.naive_bayes.GaussianNB 76 | 77 | # 如果特征是binary的,那么用 78 | # sklearn.naive_bayes.BernoulliNB 79 | 80 | # 如果特征是categorical的,那么用 81 | # sklearn.naive_bayes.MultinomialNB -------------------------------------------------------------------------------- /Code/4-4高斯做的分类(数字样本).py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.datasets import load_digits 3 | from sklearn.naive_bayes import GaussianNB 4 | 5 | from sklearn.model_selection import train_test_split 6 | digits = load_digits() 7 | X, y = digits.data, digits.target 8 | 9 | 10 | print(digits.data.shape) 11 | 12 | import matplotlib.pyplot as plt 13 | # plt.gray() 14 | # plt.matshow(digits.images[0]) 15 | # plt.show() 16 | 17 | fig=plt.figure(figsize=(6,6)) 18 | fig.subplots_adjust(left=0,right=1,bottom=0,top=1,hspace=0.05,wspace=0.05) 19 | 20 | for i in range(64): 21 | ax=fig.add_subplot(8,8,i+1,xticks=[],yticks=[]) 22 | ax.imshow(digits.images[i],cmap=plt.cm.binary,interpolation='nearest') 23 | #用目标值标记图像 24 | ax.text(0,7,str(digits.target[i])) 25 | plt.show() 26 | 27 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0) 28 | from sklearn.naive_bayes import GaussianNB 29 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 30 | clf.fit(X_train, y_train) 31 | print(clf.score(X_test,y_test)) 
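A minimal sketch (assuming 5-fold cross-validation and default hyperparameters, which are not part of the original files) that compares the three naive Bayes variants discussed in the closing comments of 4-3 on the same digits data that 4-4 feeds to GaussianNB:

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

X, y = load_digits(return_X_y=True)
# digits features are pixel intensities 0-16: non-negative, so MultinomialNB applies,
# and BernoulliNB binarizes them at 0 by default
for name, nb in [('GaussianNB', GaussianNB()),
                 ('MultinomialNB', MultinomialNB()),
                 ('BernoulliNB', BernoulliNB())]:
    scores = cross_val_score(nb, X, y, cv=5)
    print('%s mean accuracy: %.3f' % (name, scores.mean()))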
-------------------------------------------------------------------------------- /Code/4-5高斯(鱼样本).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.naive_bayes import GaussianNB 3 | from sklearn.model_selection import train_test_split 4 | fish= pd.read_csv("fish-01.csv") 5 | X_train, X_test, y_train, y_test = train_test_split(fish.iloc[:,1:], fish.iloc[:,0], test_size=0.4, random_state=0) 6 | 7 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 8 | clf.fit(X_train, y_train) 9 | print(clf.predict([[120.0, 19.4, 21.0, 23.7, 25.8, 13.9]])) 10 | print(clf.score(X_test, y_test)) 11 | 12 | 13 | from sklearn.neural_network import MLPClassifier 14 | 15 | clf1=MLPClassifier(activation='logistic',max_iter=1000)# 构造分类器实例 16 | clf1.fit(X_train, y_train) 17 | print(clf.predict([[120.0, 19.4, 21.0, 23.7, 25.8, 13.9]])) 18 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/4-6高斯(cifar-10).py: -------------------------------------------------------------------------------- 1 | from sklearn.naive_bayes import GaussianNB 2 | import pickle 3 | 4 | 5 | 6 | def load(filename): 7 | 8 | with open(filename, 'rb') as fo: 9 | 10 | data = pickle.load(fo, encoding='latin1') 11 | 12 | return data 13 | #读取第一个训练集——data_batch_1: 14 | train = 'cifar-10-batches-py\data_batch_' 15 | test=r'cifar-10-batches-py\test_batch' #字符串前加r防止转义字符/t 16 | print(test) 17 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 18 | 19 | for i in range(1,6): #从文件cifar-10-batches-py中读取data集1-5 20 | d=load(train+str(i)) 21 | X, y = d["data"], d["labels"] 22 | X_train, y_train = X, y 23 | clf.fit(X_train, y_train,) 24 | print("数据集" + str(i) + "训练完毕") 25 | 26 | d=load(test)#从文件cifar-10-batches-py中读取test集 27 | X, y = d["data"], d["labels"] 28 | X_test, y_test = X, y 29 | 30 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/5-1原始决策树.py: -------------------------------------------------------------------------------- 1 | def create_data(): 2 | datasets = [[1, 'Sunny', 'Hot', 'High', 'Weak', 'No'], 3 | [2, 'Sunny', 'Hot', 'High', 'Strong', 'No'], 4 | [3, 'Overcast', 'Hot', 'High', 'Weak', 'Yes'], 5 | [4, 'Rainy', 'Mild', 'High', 'Weak', 'Yes'], 6 | [5, 'Rainy', 'Cool', 'Normal', 'Weak', 'Yes'], 7 | [6, 'Rainy', 'Cool', 'Normal', 'Strong', 'No'], 8 | [7, 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'], 9 | [8, 'Sunny', 'Mild', 'High', 'Weak', 'No'], 10 | [9, 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'], 11 | [10, 'Rainy', 'Mild', 'Normal', 'Weak', 'Yes'], 12 | [11, 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'], 13 | [12, 'Overcast', 'Mild', 'High', 'Strong', 'Yes'], 14 | [13, 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'], 15 | [14, 'Rainy', 'Mild', 'High', 'Strong', 'No'], 16 | ] 17 | 18 | labels = ['Day', 'OutLook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis'] 19 | 20 | # 返回数据集和每个维度的名称 21 | return datasets, labels 22 | from math import log 23 | 24 | # 以 Outlook 为分界的熵 25 | 26 | En_Sunny = -(2/5)*log(2/5,2) - (3/5)*log(3/5,2) 27 | En_Overcast = -(4/4)*log(4/4,2) 28 | En_Rainy = -(3/5)*log(3/5,2) - (2/5)*log(2/5,2) 29 | 30 | # Outlook 联合熵 31 | En_Outlook = 5/14*En_Sunny + 4/14*En_Overcast + 5/14*En_Rainy 32 | 33 | print(En_Sunny,En_Overcast,En_Rainy) 34 | print('联合熵:',En_Outlook) 35 | # Outlook 的分裂信息度量 熵 36 | 37 | IG=-(5/14)*log(5/14,2) - (9/14)*log(9/14,2)-En_Outlook 38 | print("信息增益",IG) 39 | OutLook = 
-5/14*log(5/14,2)-4/14*log(4/14,2)-5/14*log(5/14,2) 40 | # Outlook 增益率 41 | OutLook_Gain_Ratio = IG/OutLook 42 | 43 | print(OutLook,OutLook_Gain_Ratio) 44 | 45 | import numpy as np 46 | 47 | 48 | # 定义节点类 二叉树 49 | class Node: 50 | def __init__(self, root=True, label=None, feature_name=None, feature=None): 51 | self.root = root 52 | self.label = label 53 | self.feature_name = feature_name 54 | self.feature = feature 55 | self.tree = {} 56 | self.result = {'label:': self.label, 'feature': self.feature, 'tree': self.tree} 57 | 58 | def __repr__(self): 59 | return '{}'.format(self.result) 60 | 61 | def add_node(self, val, node): 62 | self.tree[val] = node 63 | 64 | def predict(self, features): 65 | if self.root is True: 66 | return self.label 67 | return self.tree[features[self.feature]].predict(features) 68 | 69 | 70 | class DTree: 71 | def __init__(self, epsilon=0.1): 72 | self.epsilon = epsilon 73 | self._tree = {} 74 | 75 | # 熵 76 | @staticmethod 77 | def calc_ent(datasets): 78 | data_length = len(datasets) 79 | label_count = {} 80 | for i in range(data_length): 81 | label = datasets[i][-1] 82 | if label not in label_count: 83 | label_count[label] = 0 84 | label_count[label] += 1 85 | ent = -sum([(p / data_length) * log(p / data_length, 2) for p in label_count.values()]) 86 | return ent 87 | 88 | # 经验条件熵 89 | def cond_ent(self, datasets, axis=0): 90 | data_length = len(datasets) 91 | feature_sets = {} 92 | for i in range(data_length): 93 | feature = datasets[i][axis] 94 | if feature not in feature_sets: 95 | feature_sets[feature] = [] 96 | feature_sets[feature].append(datasets[i]) 97 | cond_ent = sum([(len(p) / data_length) * self.calc_ent(p) for p in feature_sets.values()]) 98 | return cond_ent 99 | 100 | # 信息增益 101 | @staticmethod 102 | def info_gain(ent, cond_ent): 103 | return ent - cond_ent 104 | 105 | def info_gain_train(self, datasets): 106 | count = len(datasets[0]) - 1 107 | ent = self.calc_ent(datasets) 108 | best_feature = [] 109 | for c in range(count): 110 | c_info_gain = self.info_gain(ent, self.cond_ent(datasets, axis=c)) 111 | best_feature.append((c, c_info_gain)) 112 | # 比较大小 113 | best_ = max(best_feature, key=lambda x: x[-1]) 114 | return best_ 115 | 116 | def train(self, train_data): 117 | """ 118 | input:数据集D(DataFrame格式),特征集A,阈值eta 119 | output:决策树T 120 | """ 121 | _, y_train, features = train_data.iloc[:, :-1], train_data.iloc[:, -1], train_data.columns[:-1] 122 | # 1,若D中实例属于同一类Ck,则T为单节点树,并将类Ck作为结点的类标记,返回T 123 | if len(y_train.value_counts()) == 1: 124 | return Node(root=True, 125 | label=y_train.iloc[0]) 126 | 127 | # 2, 若A为空,则T为单节点树,将D中实例树最大的类Ck作为该节点的类标记,返回T 128 | if len(features) == 0: 129 | return Node(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0]) 130 | 131 | # 3,计算最大信息增益 同5.1,Ag为信息增益最大的特征 132 | max_feature, max_info_gain = self.info_gain_train(np.array(train_data)) 133 | max_feature_name = features[max_feature] 134 | 135 | # 4,Ag的信息增益小于阈值eta,则置T为单节点树,并将D中是实例数最大的类Ck作为该节点的类标记,返回T 136 | if max_info_gain < self.epsilon: 137 | return Node(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0]) 138 | 139 | # 5,构建Ag子集 140 | node_tree = Node(root=False, feature_name=max_feature_name, feature=max_feature) 141 | 142 | feature_list = train_data[max_feature_name].value_counts().index 143 | for f in feature_list: 144 | sub_train_df = train_data.loc[train_data[max_feature_name] == f].drop([max_feature_name], axis=1) 145 | 146 | # 6, 递归生成树 147 | sub_tree = self.train(sub_train_df) 148 | node_tree.add_node(f, sub_tree) 149 
| 150 | # pprint.pprint(node_tree.tree) 151 | return node_tree 152 | 153 | def fit(self, train_data): 154 | self._tree = self.train(train_data) 155 | return self._tree 156 | 157 | def predict(self, X_test): 158 | return self._tree.predict(X_test) 159 | 160 | 161 | import pandas as pd 162 | 163 | def create_data(): 164 | datasets = [['青年', '否', '否', '一般', '否'], 165 | ['青年', '否', '否', '好', '否'], 166 | ['青年', '是', '否', '好', '是'], 167 | ['青年', '是', '是', '一般', '是'], 168 | ['青年', '否', '否', '一般', '否'], 169 | ['中年', '否', '否', '一般', '否'], 170 | ['中年', '否', '否', '好', '否'], 171 | ['中年', '是', '是', '好', '是'], 172 | ['中年', '否', '是', '非常好', '是'], 173 | ['中年', '否', '是', '非常好', '是'], 174 | ['老年', '否', '是', '非常好', '是'], 175 | ['老年', '否', '是', '好', '是'], 176 | ['老年', '是', '否', '好', '是'], 177 | ['老年', '是', '否', '非常好', '是'], 178 | ['老年', '否', '否', '一般', '否'], 179 | ] 180 | labels = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别'] 181 | # 返回数据集和每个维度的名称 182 | return datasets, labels 183 | 184 | 185 | datasets, labels = create_data() 186 | data_df = pd.DataFrame(datasets, columns=labels) 187 | dt = DTree() 188 | tree = dt.fit(data_df) 189 | 190 | print(dt.predict(['老年', '否', '否', '一般'])) 191 | 192 | -------------------------------------------------------------------------------- /Code/5-2决策树(鸢尾样本).py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.tree import DecisionTreeClassifier 3 | 4 | from sklearn.tree import export_graphviz 5 | 6 | from sklearn.datasets import load_iris 7 | from sklearn.model_selection import train_test_split 8 | 9 | import pandas as pd 10 | import numpy as np 11 | iris = load_iris() 12 | # data 13 | def iris_data(): 14 | iris = load_iris() 15 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 16 | df['label'] = iris.target 17 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 18 | data = np.array(df.iloc[:100, [0, 1, -1]]) 19 | # print(data) 20 | return data[:,:2], data[:,-1] 21 | 22 | X, y = iris_data() 23 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 24 | 25 | clf = DecisionTreeClassifier() 26 | 27 | clf.fit(X_train, y_train,) 28 | 29 | print(clf.score(X_test, y_test)) 30 | 31 | print(clf.predict([[4.4, 3. 
]])) 32 | 33 | 34 | import graphviz 35 | dot_data = export_graphviz(clf, out_file=None) 36 | graph = graphviz.Source(dot_data) 37 | graph.render("iris") 38 | 39 | dot_data = export_graphviz(clf, out_file=None, 40 | feature_names=iris.feature_names, 41 | class_names=iris.target_names, 42 | filled=True, rounded=True, 43 | special_characters=True) 44 | graph = graphviz.Source(dot_data) -------------------------------------------------------------------------------- /Code/5-3决策树(数字样本).py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_digits 2 | from sklearn.tree import DecisionTreeClassifier 3 | from sklearn.model_selection import train_test_split 4 | import matplotlib.pyplot as plt 5 | digits = load_digits() 6 | X, y = digits.data, digits.target 7 | 8 | 9 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 10 | 11 | 12 | clf = DecisionTreeClassifier() 13 | 14 | clf.fit(X_train, y_train,) 15 | 16 | print(clf.score(X_test, y_test)) 17 | 18 | -------------------------------------------------------------------------------- /Code/5-4多层决策树回归.py: -------------------------------------------------------------------------------- 1 | # Import the necessary modules and libraries 2 | import numpy as np 3 | from sklearn.tree import DecisionTreeRegressor 4 | import matplotlib.pyplot as plt # Create a random dataset 5 | rng = np.random.RandomState(1) 6 | X = np.sort(5 * rng.rand(80, 1), axis=0) 7 | y = np.sin(X).ravel() 8 | y[::5] += 3 * (0.5 - rng.rand(16)) 9 | 10 | # Fit regression model 11 | regr_1 = DecisionTreeRegressor(max_depth=2) 12 | regr_2 = DecisionTreeRegressor(max_depth=5) 13 | regr_1.fit(X, y) 14 | regr_2.fit(X, y) 15 | 16 | # Predict 17 | X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis] 18 | y_1 = regr_1.predict(X_test) 19 | y_2 = regr_2.predict(X_test) 20 | 21 | # Plot the results 22 | plt.figure() 23 | plt.scatter(X, y, s=20, edgecolor="black", c="darkorange", label="data") 24 | plt.plot(X_test, y_1, color="cornflowerblue", label="max_depth=2", linewidth=2) 25 | plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=5", linewidth=2) 26 | 27 | plt.xlabel("data") 28 | plt.ylabel("target") 29 | plt.title("Decision Tree Regression") 30 | plt.legend() 31 | plt.show() -------------------------------------------------------------------------------- /Code/5-5决策树(鱼样本).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.tree import DecisionTreeClassifier 3 | from sklearn.model_selection import train_test_split 4 | fish= pd.read_csv("fish-01.csv") 5 | X_train, X_test, y_train, y_test = train_test_split(fish.iloc[:,1:], fish.iloc[:,0], test_size=0.4, random_state=0) 6 | 7 | 8 | 9 | clf = DecisionTreeClassifier() 10 | clf.fit(X_train, y_train) 11 | 12 | print(clf.predict([[120.0, 19.4, 21.0, 23.7, 25.8, 13.9]])) 13 | 14 | print(clf.score(X_test, y_test)) 15 | 16 | 17 | -------------------------------------------------------------------------------- /Code/5-6决策树(cifar-10).py: -------------------------------------------------------------------------------- 1 | from sklearn.tree import DecisionTreeClassifier 2 | import pickle 3 | import cv2 4 | 5 | 6 | def load(filename): 7 | 8 | with open(filename, 'rb') as fo: 9 | 10 | data = pickle.load(fo, encoding='latin1') 11 | 12 | return data 13 | #读取第一个训练集——data_batch_1: 14 | train = 'cifar-10-batches-py\data_batch_' 15 | test=r'cifar-10-batches-py\test_batch' #字符串前加r防止转义字符/t 16 | print(test) 
17 | clf = DecisionTreeClassifier() 18 | 19 | for i in range(1,6): #从文件cifar-10-batches-py中读取data集1-5 20 | d=load(train+str(i)) 21 | X, y = d["data"], d["labels"] 22 | X_train, y_train = X, y 23 | clf.fit(X_train, y_train,) 24 | print("数据集" + str(i) + "训练完毕") 25 | 26 | d=load(test)#从文件cifar-10-batches-py中读取test集 27 | X, y = d["data"], d["labels"] 28 | X_test, y_test = X, y 29 | 30 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/5-7决策树剪枝(乳腺癌样本).py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.datasets import load_breast_cancer 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | 8 | X, y = load_breast_cancer(return_X_y=True) 9 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 10 | 11 | clf = DecisionTreeClassifier(random_state=0) 12 | path = clf.cost_complexity_pruning_path(X_train, y_train) 13 | ccp_alphas, impurities = path.ccp_alphas, path.impurities 14 | fig, ax = plt.subplots() 15 | ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post") 16 | ax.set_xlabel("effective alpha") 17 | ax.set_ylabel("total impurity of leaves") 18 | ax.set_title("Total Impurity vs effective alpha for training set") 19 | 20 | 21 | 22 | clfs = [] 23 | for ccp_alpha in ccp_alphas: 24 | clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha) 25 | clf.fit(X_train, y_train) 26 | clfs.append(clf) 27 | print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format( 28 | clfs[-1].tree_.node_count, ccp_alphas[-1])) 29 | 30 | 31 | clfs = clfs[:-1] 32 | ccp_alphas = ccp_alphas[:-1] 33 | 34 | node_counts = [clf.tree_.node_count for clf in clfs] 35 | depth = [clf.tree_.max_depth for clf in clfs] 36 | fig, ax = plt.subplots(2, 1) 37 | ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post") 38 | ax[0].set_xlabel("alpha") 39 | ax[0].set_ylabel("number of nodes") 40 | ax[0].set_title("Number of nodes vs alpha") 41 | ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post") 42 | ax[1].set_xlabel("alpha") 43 | ax[1].set_ylabel("depth of tree") 44 | ax[1].set_title("Depth vs alpha") 45 | fig.tight_layout() 46 | 47 | train_scores = [clf.score(X_train, y_train) for clf in clfs] 48 | test_scores = [clf.score(X_test, y_test) for clf in clfs] 49 | 50 | fig, ax = plt.subplots() 51 | ax.set_xlabel("alpha") 52 | ax.set_ylabel("accuracy") 53 | ax.set_title("Accuracy vs alpha for training and testing sets") 54 | ax.plot(ccp_alphas, train_scores, marker='o', label="train", 55 | drawstyle="steps-post") 56 | ax.plot(ccp_alphas, test_scores, marker='o', label="test", 57 | drawstyle="steps-post") 58 | ax.legend() 59 | plt.show() 60 | 61 | clfs = clfs[:-1] 62 | ccp_alphas = ccp_alphas[:-1] 63 | 64 | node_counts = [clf.tree_.node_count for clf in clfs] 65 | depth = [clf.tree_.max_depth for clf in clfs] 66 | fig, ax = plt.subplots(2, 1) 67 | ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post") 68 | ax[0].set_xlabel("alpha") 69 | ax[0].set_ylabel("number of nodes") 70 | ax[0].set_title("Number of nodes vs alpha") 71 | ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post") 72 | ax[1].set_xlabel("alpha") 73 | ax[1].set_ylabel("depth of tree") 74 | ax[1].set_title("Depth vs alpha") 75 | fig.tight_layout() 76 | 77 | 78 | 79 | train_scores = [clf.score(X_train, y_train) for clf in clfs] 80 | test_scores = 
[clf.score(X_test, y_test) for clf in clfs] 81 | 82 | fig, ax = plt.subplots() 83 | ax.set_xlabel("alpha") 84 | ax.set_ylabel("accuracy") 85 | ax.set_title("Accuracy vs alpha for training and testing sets") 86 | ax.plot(ccp_alphas, train_scores, marker='o', label="train", 87 | drawstyle="steps-post") 88 | ax.plot(ccp_alphas, test_scores, marker='o', label="test", 89 | drawstyle="steps-post") 90 | ax.legend() 91 | plt.show() 92 | 93 | -------------------------------------------------------------------------------- /Code/5-8计算熵(entropy)的函数.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | 6 | import math 7 | 8 | p=np.linspace(0.01,1,num=50,endpoint=False) 9 | 10 | entropy = -p*np.log2(p)-(1-p)*np.log2(1-p) 11 | 12 | 13 | #plt.plot(b) 14 | plt.plot(p,entropy) 15 | plt.grid(True) 16 | plt.xlabel('p') 17 | plt.ylabel('Entropy(bit)') 18 | #plt.plot(p,gini) 19 | 20 | max_en = 2*(-(1/2)*np.log2(1/2)) 21 | print(max_en) 22 | 23 | d=np.linspace(0.01,100,num=50,endpoint=False) 24 | ld=np.log2(d) 25 | 26 | plt.show() -------------------------------------------------------------------------------- /Code/6-1逻辑斯蒂的概率分布.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from math import exp 4 | 5 | def sigmod(x): 6 | return 1/(1+np.exp(-x)) 7 | 8 | x = np.arange(-10,10.,0.1) 9 | y = sigmod(x) 10 | 11 | plt.plot(x,y) 12 | plt.grid(True) 13 | plt.show() 14 | 15 | 16 | class LogisticReressionClassifier: 17 | def __init__(self, max_iter=200, learning_rate=0.01): 18 | self.max_iter = max_iter 19 | self.learning_rate = learning_rate 20 | 21 | def sigmoid(self, x): 22 | return 1 / (1 + exp(-x)) 23 | 24 | def data_matrix(self, X): 25 | data_mat = [] 26 | for d in X: 27 | data_mat.append([1.0, *d]) 28 | return data_mat 29 | 30 | def fit(self, X, y): 31 | # label = np.mat(y) 32 | data_mat = self.data_matrix(X) # m*n 33 | self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32) 34 | 35 | for iter_ in range(self.max_iter): 36 | for i in range(len(X)): 37 | result = self.sigmoid(np.dot(data_mat[i], self.weights)) 38 | error = y[i] - result 39 | self.weights += self.learning_rate * error * np.transpose([data_mat[i]]) 40 | print('LogisticRegression Model(learning_rate={},max_iter={})'.format(self.learning_rate, self.max_iter)) 41 | 42 | # def f(self, x): 43 | # return -(self.weights[0] + self.weights[1] * x) / self.weights[2] 44 | 45 | def score(self, X_test, y_test): 46 | right = 0 47 | X_test = self.data_matrix(X_test) 48 | for x, y in zip(X_test, y_test): 49 | result = np.dot(x, self.weights) 50 | if (result > 0 and y == 1) or (result < 0 and y == 0): 51 | right += 1 52 | return right / len(X_test) 53 | 54 | -------------------------------------------------------------------------------- /Code/6-2原始逻辑斯蒂(鸢尾样本).py: -------------------------------------------------------------------------------- 1 | 2 | from math import exp 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | from sklearn.datasets import load_iris 9 | from sklearn.model_selection import train_test_split 10 | 11 | from sklearn.linear_model import LogisticRegression 12 | 13 | 14 | 15 | 16 | 17 | # data 18 | def create_data(): 19 | iris = load_iris() 20 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 21 | df['label'] = iris.target 22 | df.columns = ['sepal length', 'sepal width', 'petal length', 
'petal width', 'label'] 23 | data = np.array(df.iloc[:100, [0,1,-1]]) 24 | # print(data) 25 | return data[:,:2], data[:,-1] 26 | 27 | 28 | X, y = create_data() 29 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 30 | 31 | 32 | class LogisticReressionClassifier: 33 | def __init__(self, max_iter=200, learning_rate=0.01): 34 | self.max_iter = max_iter 35 | self.learning_rate = learning_rate 36 | 37 | def sigmoid(self, x): 38 | return 1 / (1 + exp(-x)) 39 | 40 | def data_matrix(self, X): 41 | data_mat = [] 42 | for d in X: 43 | data_mat.append([1.0, *d]) 44 | return data_mat 45 | 46 | def fit(self, X, y): 47 | # label = np.mat(y) 48 | data_mat = self.data_matrix(X) # m*n 49 | self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32) 50 | 51 | for iter_ in range(self.max_iter): 52 | for i in range(len(X)): 53 | result = self.sigmoid(np.dot(data_mat[i], self.weights)) 54 | error = y[i] - result 55 | self.weights += self.learning_rate * error * np.transpose([data_mat[i]]) 56 | print('LogisticRegression Model(learning_rate={},max_iter={})'.format(self.learning_rate, self.max_iter)) 57 | 58 | # def f(self, x): 59 | # return -(self.weights[0] + self.weights[1] * x) / self.weights[2] 60 | 61 | def score(self, X_test, y_test): 62 | right = 0 63 | X_test = self.data_matrix(X_test) 64 | for x, y in zip(X_test, y_test): 65 | result = np.dot(x, self.weights) 66 | if (result > 0 and y == 1) or (result < 0 and y == 0): 67 | right += 1 68 | return right / len(X_test) 69 | 70 | lr_clf = LogisticReressionClassifier() 71 | lr_clf.fit(X_train, y_train) 72 | 73 | x_ponits = np.arange(4, 8) 74 | y_ = -(lr_clf.weights[1]*x_ponits + lr_clf.weights[0])/lr_clf.weights[2] 75 | plt.plot(x_ponits, y_) 76 | 77 | 78 | plt.scatter(X[:50,0],X[:50,1], label='0') 79 | plt.scatter(X[50:,0],X[50:,1], label='1') 80 | plt.legend() 81 | plt.show() 82 | 83 | -------------------------------------------------------------------------------- /Code/6-2逻辑斯蒂(鸢尾样本).py: -------------------------------------------------------------------------------- 1 | 2 | from math import exp 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | from sklearn.datasets import load_iris 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.linear_model import LogisticRegression 11 | 12 | 13 | clf = LogisticRegression(max_iter=200,solver='liblinear') 14 | def create_data(): 15 | iris = load_iris() 16 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 17 | df['label'] = iris.target 18 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 19 | data = np.array(df.iloc[:100, [0,1,-1]]) 20 | # print(data) 21 | return data[:,:2], data[:,-1] 22 | 23 | 24 | X, y = create_data() 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 26 | 27 | clf.fit(X_train, y_train) 28 | 29 | 30 | print(clf.coef_, clf.intercept_) 31 | 32 | 33 | x_ponits = np.arange(4, 8) 34 | y_ = -(clf.coef_[0][0]*x_ponits + clf.intercept_)/clf.coef_[0][1] 35 | plt.plot(x_ponits, y_) 36 | 37 | plt.plot(X[:50, 0], X[:50, 1], 'bo', color='blue', label='0') 38 | plt.plot(X[50:, 0], X[50:, 1], 'bo', color='orange', label='1') 39 | plt.xlabel('sepal length') 40 | plt.ylabel('sepal width') 41 | plt.legend() 42 | plt.show() 43 | -------------------------------------------------------------------------------- /Code/6-4逻辑斯蒂(数字样本).py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, 
neighbors, linear_model 2 | 3 | digits = datasets.load_digits() 4 | X_digits = digits.data / digits.data.max() 5 | y_digits = digits.target 6 | 7 | n_samples = len(X_digits) 8 | 9 | X_train = X_digits[:int(.9 * n_samples)] 10 | y_train = y_digits[:int(.9 * n_samples)] 11 | X_test = X_digits[int(.9 * n_samples):] 12 | y_test = y_digits[int(.9 * n_samples):] 13 | 14 | knn = neighbors.KNeighborsClassifier() 15 | logistic = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000, 16 | multi_class='multinomial') 17 | 18 | print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test)) 19 | print('LogisticRegression score: %f' 20 | % logistic.fit(X_train, y_train).score(X_test, y_test)) -------------------------------------------------------------------------------- /Code/6-5逻辑斯蒂(乳腺癌样本)评估(二分类).py: -------------------------------------------------------------------------------- 1 | 2 | # 载入数据 3 | from sklearn.datasets import load_breast_cancer 4 | import matplotlib.pyplot as plt 5 | cancer = load_breast_cancer() 6 | X = cancer.data 7 | y = cancer.target 8 | print('data shape: {0}; no. positive: {1}; no. negative: {2}'.format( 9 | X.shape, y[y==1].shape[0], y[y==0].shape[0])) 10 | print(cancer.data[0]) 11 | 12 | 13 | cancer.feature_names 14 | 15 | 16 | from sklearn.model_selection import train_test_split 17 | X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2) 18 | 19 | from sklearn.linear_model import LogisticRegression 20 | logmodel = LogisticRegression(max_iter=3000) 21 | logmodel.fit(X_train, y_train) 22 | 23 | train_score = logmodel.score(X_train,y_train) 24 | test_score = logmodel.score(X_test,y_test) 25 | 26 | #print(train_score,test_score) 27 | 28 | X_output=logmodel.predict(X_test) 29 | from sklearn.metrics import precision_score 30 | print(precision_score(y_test,X_output,average=None)) 31 | 32 | # (查全率)召回率 The best value is 1 and the worst value is 0. 
33 | # 就是所有准确的条目有多少被检索出来了。 34 | from sklearn.metrics import recall_score 35 | print(recall_score(y_test,X_output,average=None)) 36 | 37 | 38 | 39 | 40 | from sklearn.metrics import plot_precision_recall_curve 41 | 42 | pr = plot_precision_recall_curve(logmodel,X_test,y_test) 43 | 44 | #print(pr) 45 | 46 | from sklearn.metrics import plot_roc_curve 47 | 48 | roc = plot_roc_curve(logmodel, X_test, y_test) 49 | 50 | 51 | # from sklearn.metrics import roc_curve,auc 52 | # fpr, tpr, thresholds = roc_curve(y_test, X_output) 53 | # print(auc(fpr, tpr)) 54 | 55 | 56 | from sklearn.metrics import plot_confusion_matrix 57 | from sklearn.metrics import classification_report 58 | disp = plot_confusion_matrix(logmodel, X_test, y_test) #混淆矩阵 59 | disp.figure_.suptitle("Confusion Matrix") 60 | #print("Confusion matrix:\n%s" % disp.confusion_matrix) 61 | 62 | print(classification_report(y_test, X_output)) 63 | 64 | plt.show() -------------------------------------------------------------------------------- /Code/6-6逻辑斯蒂(广告样本).py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, neighbors, linear_model 2 | import pandas as pd 3 | from sklearn.model_selection import train_test_split 4 | 5 | advertise= pd.read_csv("advertising.csv") 6 | X_train, X_test, y_train, y_test = train_test_split(advertise.loc[:,["Daily Time Spent on Site","Age","Area Income","Male"]], advertise.iloc[:,[-1]], test_size=0.2, random_state=0) 7 | from sklearn.linear_model import LogisticRegression 8 | logmodel = LogisticRegression(max_iter=200) 9 | logmodel.fit(X_train, y_train) 10 | 11 | train_score = logmodel.score(X_train,y_train) 12 | test_score = logmodel.score(X_test,y_test) 13 | 14 | print(train_score,test_score) 15 | 16 | y_pred_proba = logmodel.predict_proba(X_test) 17 | print('sample of predict probability: {0}'.format(y_pred_proba[0]))#是0?还是1(概率)0在前1在后 18 | print(y_pred_proba) 19 | -------------------------------------------------------------------------------- /Code/7-1查找best参数.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm, datasets 2 | from sklearn.model_selection import GridSearchCV #找离散变量中最好的参数 3 | from sklearn.linear_model import LogisticRegression 4 | iris = datasets.load_iris() 5 | #parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} 6 | 7 | parameters = {'solver':('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'), 'C':[0.1,1,2]} 8 | print(parameters) 9 | classifier = LogisticRegression(max_iter=3000) 10 | 11 | 12 | clf = GridSearchCV(classifier, parameters) 13 | clf.fit(iris.data, iris.target) 14 | 15 | sorted(clf.cv_results_.keys()) 16 | 17 | print(clf.best_params_) 18 | print(clf.best_score_) 19 | 20 | from sklearn.model_selection import RandomizedSearchCV #找连续变量中 21 | 22 | clf = RandomizedSearchCV(classifier, parameters) 23 | 24 | clf.fit(iris.data, iris.target) 25 | 26 | sorted(clf.cv_results_.keys()) 27 | 28 | print(clf.best_params_) 29 | print(clf.best_score_) -------------------------------------------------------------------------------- /Code/7-2决策树(数字样本)评估(多分类).py: -------------------------------------------------------------------------------- 1 | # 对决策树模型在数字样本上的测试结果做评估 2 | from sklearn.datasets import load_digits 3 | from sklearn.tree import DecisionTreeClassifier 4 | from sklearn.model_selection import train_test_split 5 | import matplotlib.pyplot as plt 6 | digits = load_digits() 7 | X, y = digits.data, digits.target 8 | 9 | 10 | X_train, X_test, y_train, Y_test = 
train_test_split(X, y, test_size=0.3) 11 | clf = DecisionTreeClassifier() 12 | clf.fit(X_train, y_train,) 13 | print(clf.score(X_test, Y_test)) 14 | 15 | 16 | y_pred=clf.predict(X_test) 17 | from sklearn.metrics import precision_score 18 | print(precision_score(Y_test,y_pred,average=None)) 19 | 20 | 21 | from sklearn.metrics import recall_score 22 | print(recall_score(Y_test,y_pred,average=None)) 23 | 24 | 25 | from sklearn.preprocessing import label_binarize #给数字数据集做归类化 26 | 27 | # Use label_binarize to be multi-label like settings 28 | Y_test_b = label_binarize(Y_test, classes=[0, 1, 2,3,4,5,6,7,8,9]) 29 | Y_pred_b= label_binarize(y_pred, classes=[0, 1, 2,3,4,5,6,7,8,9]) 30 | n_classes = Y_test_b.shape[1] 31 | 32 | 33 | from sklearn.metrics import precision_recall_curve 34 | from sklearn.metrics import average_precision_score 35 | 36 | 37 | precision = dict() 38 | recall = dict() 39 | average_precision = dict() 40 | for i in range(n_classes): #计算每个标签的召回率 41 | precision[i], recall[i], _ = precision_recall_curve(Y_test_b[:, i], 42 | Y_pred_b[:, i]) 43 | average_precision[i] = average_precision_score(Y_test_b[:, i],Y_pred_b[:, i]) 44 | 45 | lines = [] 46 | labels = [] 47 | 48 | colors = ['red','blue','green','pink','gold','navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'] 49 | 50 | # precision["micro"], recall["micro"], _ = precision_recall_curve(Y_test_b.ravel(), 51 | # Y_pred_b.ravel()) 52 | # average_precision["micro"] = average_precision_score(Y_test_b,Y_pred_b, #平均召回率 53 | # average="micro") 54 | # print('Average precision score, micro-averaged over all classes: {0:0.2f}' 55 | # .format(average_precision["micro"])) 56 | 57 | plt.figure(figsize=(7, 8)) 58 | for i, color in zip(range(n_classes), colors): #在图里画出每个标签的召回率 59 | l, = plt.plot(recall[i], precision[i], color=color, lw=2) 60 | lines.append(l) 61 | labels.append('Precision-recall for class {0} (area = {1:0.2f})' 62 | ''.format(i, average_precision[i])) 63 | 64 | fig = plt.gcf() 65 | fig.subplots_adjust(bottom=0.25) 66 | plt.xlim([0.0, 1.0]) 67 | plt.ylim([0.0, 1.05]) 68 | plt.xlabel('Recall') 69 | plt.ylabel('Precision') 70 | plt.title('Extension of Precision-Recall curve to multi-class') 71 | plt.legend(lines, labels, loc=(0, -.0), prop=dict(size=9)) 72 | 73 | from sklearn.metrics import plot_confusion_matrix 74 | from sklearn.metrics import classification_report 75 | disp = plot_confusion_matrix(clf, X_test, Y_test) 76 | disp.figure_.suptitle("Confusion Matrix") 77 | 78 | 79 | print(classification_report(Y_test, y_pred)) 80 | 81 | plt.show() -------------------------------------------------------------------------------- /Code/7-2官网svm(花样本)评估(多分类).py: -------------------------------------------------------------------------------- 1 | from sklearn import svm, datasets 2 | from sklearn.model_selection import train_test_split 3 | import numpy as np 4 | 5 | iris = datasets.load_iris() 6 | X = iris.data 7 | y = iris.target 8 | 9 | # Add noisy features 10 | random_state = np.random.RandomState(0) 11 | n_samples, n_features = X.shape 12 | X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] 13 | 14 | # Limit to the two first classes, and split into training and test 15 | X_train, X_test, y_train, y_test = train_test_split(X[y < 2], y[y < 2], 16 | test_size=.5, 17 | random_state=random_state) 18 | 19 | # Create a simple classifier 20 | classifier = svm.LinearSVC(random_state=random_state) 21 | classifier.fit(X_train, y_train) 22 | y_score = classifier.decision_function(X_test) 23 | 24 | from sklearn.metrics import 
precision_recall_curve 25 | from sklearn.metrics import plot_precision_recall_curve 26 | import matplotlib.pyplot as plt 27 | from sklearn.metrics import average_precision_score 28 | average_precision = average_precision_score(y_test, y_score) 29 | 30 | print('Average precision-recall score: {0:0.2f}'.format( 31 | average_precision)) 32 | disp = plot_precision_recall_curve(classifier, X_test, y_test) 33 | disp.ax_.set_title('2-class Precision-Recall curve: ' 34 | 'AP={0:0.2f}'.format(average_precision)) 35 | 36 | 37 | from sklearn.preprocessing import label_binarize 38 | 39 | # Use label_binarize to be multi-label like settings 40 | Y = label_binarize(y, classes=[0, 1, 2]) 41 | n_classes = Y.shape[1] 42 | 43 | # Split into training and test 44 | X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5, 45 | random_state=random_state) 46 | 47 | # We use OneVsRestClassifier for multi-label prediction 48 | from sklearn.multiclass import OneVsRestClassifier 49 | 50 | # Run classifier 51 | classifier = OneVsRestClassifier(svm.LinearSVC(random_state=random_state)) 52 | classifier.fit(X_train, Y_train) 53 | y_score = classifier.decision_function(X_test) 54 | 55 | from sklearn.metrics import precision_recall_curve 56 | from sklearn.metrics import average_precision_score 57 | 58 | # For each class 59 | precision = dict() 60 | recall = dict() 61 | average_precision = dict() 62 | 63 | 64 | print(n_classes) 65 | for i in range(n_classes): 66 | print(Y_test[:, i]) 67 | precision[i], recall[i], _ = precision_recall_curve(Y_test[:, i], 68 | y_score[:, i]) 69 | average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i]) 70 | 71 | # A "micro-average": quantifying score on all classes jointly 72 | precision["micro"], recall["micro"], _ = precision_recall_curve(Y_test.ravel(), 73 | y_score.ravel()) 74 | average_precision["micro"] = average_precision_score(Y_test, y_score, 75 | average="micro") 76 | print('Average precision score, micro-averaged over all classes: {0:0.2f}' 77 | .format(average_precision["micro"])) 78 | 79 | plt.figure() 80 | plt.step(recall['micro'], precision['micro'], where='post') 81 | 82 | plt.xlabel('Recall') 83 | plt.ylabel('Precision') 84 | plt.ylim([0.0, 1.05]) 85 | plt.xlim([0.0, 1.0]) 86 | plt.title( 87 | 'Average precision score, micro-averaged over all classes: AP={0:0.2f}' 88 | .format(average_precision["micro"])) 89 | 90 | 91 | 92 | from itertools import cycle 93 | # setup plot details 94 | colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal']) 95 | 96 | plt.figure(figsize=(7, 8)) 97 | f_scores = np.linspace(0.2, 0.8, num=4) 98 | lines = [] 99 | labels = [] 100 | for f_score in f_scores: 101 | print(f_score) 102 | x = np.linspace(0.01, 1) 103 | y = f_score * x / (2 * x - f_score) 104 | l, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2) 105 | plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02)) 106 | 107 | lines.append(l) 108 | labels.append('iso-f1 curves') 109 | l, = plt.plot(recall["micro"], precision["micro"], color='gold', lw=2) 110 | lines.append(l) 111 | labels.append('micro-average Precision-recall (area = {0:0.2f})' 112 | ''.format(average_precision["micro"])) 113 | 114 | for i, color in zip(range(n_classes), colors): 115 | l, = plt.plot(recall[i], precision[i], color=color, lw=2) 116 | lines.append(l) 117 | labels.append('Precision-recall for class {0} (area = {1:0.2f})' 118 | ''.format(i, average_precision[i])) 119 | 120 | fig = plt.gcf() 121 | fig.subplots_adjust(bottom=0.25) 122 | 
plt.xlim([0.0, 1.0]) 123 | plt.ylim([0.0, 1.05]) 124 | plt.xlabel('Recall') 125 | plt.ylabel('Precision') 126 | plt.title('Extension of Precision-Recall curve to multi-class') 127 | plt.legend(lines, labels, loc=(0, -.38), prop=dict(size=14)) 128 | 129 | 130 | plt.show() -------------------------------------------------------------------------------- /Code/8-1原始svm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SVM: 4 | def __init__(self, max_iter=100, kernel='linear'): 5 | self.max_iter = max_iter 6 | self._kernel = kernel 7 | 8 | def init_args(self, features, labels): 9 | self.m, self.n = features.shape 10 | self.X = features 11 | self.Y = labels 12 | self.b = 0.0 13 | # 将Ei保存在⼀个列表⾥ 14 | self.alpha = np.ones(self.m) 15 | self.E = [self._E(i) for i in range(self.m)] 16 | # 松弛变量 17 | self.C = 1.0 18 | 19 | def _KKT(self, i): 20 | y_g = self._g(i) * self.Y[i] 21 | if self.alpha[i] == 0: 22 | return y_g >= 1 23 | elif 0 < self.alpha[i] < self.C: 24 | return y_g == 1 25 | else: 26 | return y_g <= 1 27 | # g(x)预测值,输⼊xi(X[i]) 28 | 29 | def _g(self, i): 30 | r = self.b 31 | for j in range(self.m): 32 | r += self.alpha[j] * self.Y[j] * self.kernel(self.X[i], self.X[j]) 33 | return r 34 | # 核函数 35 | 36 | def kernel(self, x1, x2): 37 | if self._kernel == 'linear': 38 | return sum([x1[k] * x2[k] for k in range(self.n)]) 39 | elif self._kernel == 'poly': 40 | return (sum([x1[k] * x2[k] for k in range(self.n)]) + 1) ** 2 41 | 42 | return 0 43 | # E(x)为g(x)对输⼊x的预测值和y的差 44 | 45 | def _E(self, i): 46 | return self._g(i) - self.Y[i] 47 | 48 | def _init_alpha(self): 49 | # 外层循环⾸先遍历所有满⾜0= 0: 60 | j = min(range(self.m), key=lambda x: self.E[x]) 61 | else: 62 | j = max(range(self.m), key=lambda x: self.E[x]) 63 | return i, j 64 | 65 | def _compare(self, _alpha, L, H): 66 | if _alpha > H: 67 | return H 68 | elif _alpha < L: 69 | return L 70 | else: 71 | return _alpha 72 | 73 | def fit(self, features, labels): 74 | self.init_args(features, labels) 75 | 76 | for t in range(self.max_iter): 77 | # train 78 | i1, i2 = self._init_alpha() 79 | # 边界 80 | if self.Y[i1] == self.Y[i2]: 81 | L = max(0, self.alpha[i1] + self.alpha[i2] - self.C) 82 | H = min(self.C, self.alpha[i1] + self.alpha[i2]) 83 | else: 84 | L = max(0, self.alpha[i2] - self.alpha[i1]) 85 | H = min(self.C, self.C + self.alpha[i2] - self.alpha[i1]) 86 | E1 = self.E[i1] 87 | E2 = self.E[i2] 88 | # eta=K11+K22-2K12 89 | eta = self.kernel(self.X[i1], self.X[i1]) + self.kernel(self.X[i2], self.X[i2]) - 2 * self.kernel( 90 | self.X[i1], self.X[i2]) 91 | if eta <= 0: 92 | # print('eta <= 0') 93 | continue 94 | alpha2_new_unc = self.alpha[i2] + self.Y[i2] * (E1 - E2) / eta 95 | alpha2_new = self._compare(alpha2_new_unc, L, H) 96 | alpha1_new = self.alpha[i1] + self.Y[i1] * self.Y[i2] * (self.alpha[i2] - alpha2_new) 97 | b1_new = -E1 - self.Y[i1] * self.kernel(self.X[i1], self.X[i1]) * (alpha1_new - self.alpha[i1]) - self.Y[ 98 | i2] * self.kernel(self.X[i2], self.X[i1]) * (alpha2_new - self.alpha[i2]) + self.b 99 | b2_new = -E2 - self.Y[i1] * self.kernel(self.X[i1], self.X[i2]) * (alpha1_new - self.alpha[i1]) - self.Y[ 100 | i2] * self.kernel(self.X[i2], self.X[i2]) * (alpha2_new - self.alpha[i2]) + self.b 101 | if 0 < alpha1_new < self.C: 102 | b_new = b1_new 103 | 104 | elif 0 < alpha2_new < self.C: 105 | b_new = b2_new 106 | else: 107 | # 选择中点 108 | b_new = (b1_new + b2_new) / 2 109 | # 更新参数 110 | self.alpha[i1] = alpha1_new 111 | self.alpha[i2] = alpha2_new 112 | self.b = b_new 113 
| self.E[i1] = self._E(i1) 114 | self.E[i2] = self._E(i2) 115 | return 'train done!' 116 | 117 | def predict(self, data): 118 | r = self.b 119 | for i in range(self.m): 120 | r += self.alpha[i] * self.Y[i] * self.kernel(data, self.X[i]) 121 | return 1 if r > 0 else -1 122 | 123 | def score(self, X_test, y_test): 124 | right_count = 0 125 | for i in range(len(X_test)): 126 | result = self.predict(X_test[i]) 127 | if result == y_test[i]: 128 | right_count += 1 129 | return right_count / len(X_test) 130 | 131 | def _weight(self): 132 | # linear model 133 | yx = self.Y.reshape(-1, 1) * self.X 134 | self.w = np.dot(yx.T, self.alpha) 135 | return self.w -------------------------------------------------------------------------------- /Code/8-2svc参数讲解.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.datasets import load_iris 5 | from sklearn.model_selection import train_test_split 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | 10 | # data 11 | def create_data(col = 2): 12 | iris = load_iris() 13 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 14 | df['label'] = iris.target 15 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 16 | data = np.array(df.iloc[:100, [0, 1, -1]]) 17 | for i in range(len(data)): 18 | if data[i,-1] == 0: 19 | data[i,-1] = -1 20 | # print(data) 21 | return data[:,:col], data[:,-1] 22 | 23 | 24 | X, y = create_data() 25 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) 26 | 27 | 28 | 29 | #svm = SVM(max_iter=400) 30 | 31 | 32 | #svm.fit(X_train, y_train) 33 | 34 | 35 | from sklearn.svm import SVC 36 | clf = SVC(gamma='auto') 37 | clf.fit(X_train, y_train) 38 | 39 | # sklearn.svm.SVC 40 | ''' 41 | *(C 42 | =1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None, random_state=None) * 43 | 44 | 参数: 45 | 46 | - C:C - SVC的惩罚参数C?默认值是1 47 | .0 48 | 49 | C越大,相当于惩罚松弛变量,希望松弛变量接近0,即对误分类的惩罚增大,趋向于对训练集全分对的情况,这样对训练集测试时准确率很高,但泛化能力弱。C值小,对误分类的惩罚减小,允许容错,将他们当成噪声点,泛化能力较强。 50 | 51 | - kernel :核函数,默认是rbf,可以是‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ 52 | 53 | – 线性:u'v 54 | 55 | – 多项式:(gamma * u'*v + coef0)^degree 56 | 57 | – RBF函数:exp(-gamma | u-v | ^ 2) 58 | 59 | – sigmoid:tanh(gamma * u'*v + coef0) 60 | 61 | - decision_function_shape: ‘ovo’, ‘ovr’, default =’ovr’ 62 | 63 | Whether 64 | to 65 | return a 66 | one - vs - rest(‘ovr’) decision 67 | function 68 | of 69 | shape(n_samples, n_classes) as all 70 | other 71 | classifiers, or the 72 | original 73 | one - vs - one(‘ovo’) decision 74 | function 75 | of 76 | libsvm 77 | which 78 | has 79 | shape(n_samples, n_classes * (n_classes - 1) / 2).However, one - vs - one(‘ovo’) is always 80 | used as multi - 81 | 82 | 83 | class strategy. 
84 | 85 | 86 | 87 | a.一对多法(one - versus - rest, 简称1 - v - r 88 | SVMs)。训练时依次把某个类别的样本归为一类, 其他剩余的样本归为另一类,这样k个类别的样本就构造出了k个SVM。分类时将未知样本分类为具有最大分类函数值的那类。  89 | 90 | b.一对一法(one - versus - one, 简称1 - v - 1 91 | SVMs)。其做法是在任意两类样本之间设计一个SVM,因此k个类别的样本就需要设计k(k - 1) / 2 92 | 个SVM。当对一个未知样本进行分类时,最后得票最多的类别即为该未知样本的类别。Libsvm中的多类分类就是根据这个方法实现的。 93 | 94 | - degree :多项式poly函数的维度,默认是3,选择其他核函数时会被忽略。 95 | 96 | 97 | - gamma : ‘rbf’, ‘poly’ 和‘sigmoid’的核函数参数。默认是’auto’,则会选择1 / n_features 98 | 99 | - coef0 :核函数的常数项。对于‘poly’和 ‘sigmoid’有用。 100 | 101 | 102 | - probability :是否采用概率估计?.默认为False 103 | 104 | - shrinking :是否采用shrinking 105 | heuristic方法,默认为true 106 | 107 | - tol :停止训练的误差值大小,默认为1e - 3 108 | 109 | - cache_size :核函数cache缓存大小,默认为200 110 | 111 | - class_weight :类别的权重,字典形式传递。设置第几类的参数C为weight * C(C - SVC中的C) 112 | 113 | - verbose :允许冗余输出? 114 | 115 | 116 | - max_iter :最大迭代次数。-1 117 | 为无限制。 118 | 119 | 120 | - decision_function_shape :‘ovo’, ‘ovr’ or None, default = None3 121 | 122 | - random_state :数据洗牌时的种子值,int值 123 | 124 | 主要调节的参数有:C、kernel、degree、gamma、coef0。 125 | ''' -------------------------------------------------------------------------------- /Code/8-3核是可以选的.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm 2 | from sklearn import datasets 3 | from sklearn.model_selection import train_test_split as ts 4 | 5 | #import our data 6 | iris = datasets.load_iris() 7 | X = iris.data 8 | y = iris.target 9 | 10 | #split the data to 7:3 11 | X_train,X_test,y_train,y_test = ts(X,y,test_size=0.3) 12 | 13 | # select different type of kernel function and compare the score 14 | 15 | # kernel = 'rbf' 16 | clf_rbf = svm.SVC(kernel='rbf',gamma='auto') 17 | clf_rbf.fit(X_train,y_train) 18 | score_rbf = clf_rbf.score(X_test,y_test) 19 | print("The score of rbf is : %f"%score_rbf) 20 | 21 | # kernel = 'linear' 22 | clf_linear = svm.SVC(kernel='linear',gamma='auto') 23 | print("xxxxxxxxxxxxxxxx") 24 | print(X_train.shape) 25 | clf_linear.fit(X_train,y_train) 26 | score_linear = clf_linear.score(X_test,y_test) 27 | print("The score of linear is : %f"%score_linear) 28 | 29 | # kernel = 'poly' 30 | clf_poly = svm.SVC(kernel='poly',gamma='auto') 31 | clf_poly.fit(X_train,y_train) 32 | score_poly = clf_poly.score(X_test,y_test) 33 | print("The score of poly is : %f"%score_poly) 34 | 35 | print(clf_linear.coef_,clf_linear.intercept_) 36 | 37 | print(clf_linear.predict([[4.9,3.,1.4,0.2]])) 38 | # LinearSVC 39 | from sklearn.svm import LinearSVC 40 | from sklearn import datasets 41 | from sklearn.model_selection import train_test_split as ts 42 | 43 | 44 | iris = datasets.load_iris() 45 | X = iris.data 46 | y = iris.target 47 | 48 | #split the data to 7:3 49 | X_train,X_test,y_train,y_test = ts(X,y,test_size=0.3) 50 | 51 | 52 | clf = LinearSVC(random_state=0, tol=1e-5,max_iter = 10000) 53 | clf.fit(X, y) 54 | 55 | print(clf.coef_) 56 | print(clf.intercept_) 57 | print(clf.predict([[4.9,3.,1.4,0.2]])) 58 | 59 | 60 | from sklearn.svm import SVR 61 | 62 | X = [[0, 0], [2, 2]] 63 | y = [0.5, 2.5] 64 | clf = svm.SVR() 65 | clf.fit(X, y) 66 | SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, 67 | gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True, 68 | tol=0.001, verbose=False) 69 | clf.predict([[1, 1]]) 70 | 71 | 72 | 73 | from sklearn.svm import LinearSVR 74 | 75 | regr = LinearSVR(random_state=0, tol=1e-5) 76 | regr.fit(X, y) 77 | print(regr.coef_) 78 | 79 | print(regr.intercept_) 80 | print(regr.predict([[1,1]])) 81 | 82 | 
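8-3 above compares kernels by fitting one SVC per kernel by hand; the same comparison can be handed to GridSearchCV as in 7-1. A minimal sketch (the parameter grid and cv=5 are illustrative assumptions, not values from the original code):

from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
# search kernel and C jointly with 5-fold cross-validation
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [0.1, 1, 10]}
search = GridSearchCV(svm.SVC(gamma='auto'), param_grid, cv=5)
search.fit(iris.data, iris.target)
print(search.best_params_)  # best kernel/C combination found
print(search.best_score_)   # its mean cross-validated accuracy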
-------------------------------------------------------------------------------- /Code/8-4SVC(数字样本).py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets, svm, metrics 2 | 3 | 4 | digits = datasets.load_digits() 5 | from sklearn.datasets import load_digits 6 | 7 | from sklearn.model_selection import train_test_split 8 | 9 | digits = load_digits() 10 | X, y = digits.data, digits.target 11 | 12 | 13 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 14 | 15 | 16 | clf = svm.SVC() 17 | 18 | clf.fit(X_train, y_train,) 19 | 20 | print(clf.score(X_test, y_test)) 21 | 22 | -------------------------------------------------------------------------------- /Code/8-5SVC(cifar-10).py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn import svm 3 | import pickle 4 | 5 | from sklearn.model_selection import GridSearchCV 6 | 7 | 8 | def load(filename): 9 | 10 | with open(filename, 'rb') as fo: 11 | 12 | data = pickle.load(fo, encoding='latin1') 13 | 14 | return data 15 | #读取第一个训练集——data_batch_1: 16 | train = 'cifar-10-batches-py\data_batch_' 17 | test=r'cifar-10-batches-py\test_batch' #字符串前加r防止转义字符\t 18 | print(test) 19 | 20 | 21 | 22 | #parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid', 'precomputed'), 'C':[0.1,1,10,100]} 23 | #classifier=svm.SVC() 24 | #classifier.n_jobs=-1 25 | #print("1") 26 | #clf = GridSearchCV(classifier, parameters) 27 | clf=svm.SVC() 28 | for j in range(1,6): #从文件cifar-10-batches-py中读取data集1-5 (note: SVC is refit from scratch on each fit() call, so only batch 5 ends up in the final model) 29 | d=load(train+str(j)) 30 | print("数据集"+str(j)+"训练完毕") 31 | X, y = d["data"], d["labels"] 32 | X_train, y_train = X, y 33 | clf.fit(X_train, y_train,) 34 | 35 | d=load(test)#从文件cifar-10-batches-py中读取test集 36 | X, y = d["data"], d["labels"] 37 | X_test, y_test = X, y 38 | 39 | 40 | 41 | print(clf.score(X_test, y_test)) 42 | -------------------------------------------------------------------------------- /Code/9-1bagging三种集成学习方式.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.datasets import load_iris 4 | from sklearn.model_selection import train_test_split 5 | import matplotlib.pyplot as plt 6 | 7 | # data 8 | def create_data(): 9 | iris = load_iris() 10 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 11 | df['label'] = iris.target 12 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 13 | data = np.array(df.iloc[:100, [0, 1, -1]]) 14 | for i in range(len(data)): 15 | if data[i,-1] == 0: 16 | data[i,-1] = -1 17 | # print(data) 18 | return data[:,:2], data[:,-1] 19 | 20 | 21 | X, y = create_data() 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 23 | 24 | 25 | # bagging 算法 26 | from sklearn.ensemble import BaggingClassifier 27 | from sklearn.neighbors import KNeighborsClassifier 28 | 29 | bagging = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5) 30 | bagging.fit(X_train, y_train) 31 | 32 | in_score = bagging.score(X_train, y_train) 33 | out_score = bagging.score(X_test, y_test) 34 | print(in_score,out_score) 35 | 36 | # RandomForest 算法 37 | from sklearn.ensemble import RandomForestClassifier 38 | 39 | forest = RandomForestClassifier(n_estimators=300, max_depth=2,random_state=0) #用了三百棵树 40 | forest.fit(X_train, y_train) 41 | 42 | in_score = forest.score(X_train, y_train) 43 | out_score = forest.score(X_test, y_test) 44 |
print(in_score,out_score) 45 | 46 | 47 | from sklearn.linear_model import LogisticRegression 48 | from sklearn.naive_bayes import GaussianNB 49 | from sklearn.ensemble import RandomForestClassifier 50 | 51 | from sklearn.ensemble import VotingClassifier 52 | 53 | import numpy as np 54 | 55 | clf1 = LogisticRegression(multi_class='multinomial', random_state=1) 56 | clf2 = RandomForestClassifier(n_estimators=50, random_state=1) 57 | clf3 = GaussianNB() 58 | 59 | 60 | 61 | eclf1 = VotingClassifier(estimators=[ 62 | ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') 63 | eclf1 = eclf1.fit(X_train, y_train) 64 | # print(eclf1.predict(X)) 65 | in_score = eclf1.score(X_train, y_train) 66 | out_score = eclf1.score(X_test, y_test) 67 | print(in_score,out_score) 68 | 69 | # Adaboost 算法 70 | 71 | from sklearn.ensemble import AdaBoostClassifier 72 | clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) 73 | clf.fit(X_train, y_train) 74 | in_score = clf.score(X_train, y_train) 75 | out_score = clf.score(X_test, y_test) 76 | print(in_score,out_score) 77 | 78 | """ 79 | - n_estimators: AdaBoostClassifier和AdaBoostRegressor都有,就是我们的弱学习器的最大迭代次数,或者说最大的弱学习器的个数。一般来说n_estimators太小,容易欠拟合,n_estimators太大,又容易过拟合,一般选择一个适中的数值。默认是50。在实际调参的过程中,我们常常将n_estimators和下面介绍的参数learning_rate一起考虑。 80 | 81 | - learning_rate: AdaBoostClassifier和AdaBoostRegressor都有,即每个弱学习器的权重缩减系数ν 82 | 83 | - base_estimator:AdaBoostClassifier和AdaBoostRegressor都有,即我们的弱分类学习器或者弱回归学习器。理论上可以选择任何一个分类或者回归学习器,不过需要支持样本权重。我们常用的一般是CART决策树或者神经网络MLP。 84 | """ 85 | 86 | # boosting的方式 GBDT 算法 87 | from sklearn.ensemble import GradientBoostingRegressor 88 | 89 | model = GradientBoostingRegressor(n_estimators=500,learning_rate=0.25,min_samples_leaf=9,max_depth=8,random_state=4) 90 | model.fit(X_train, y_train) 91 | in_score = model.score(X_train, y_train) 92 | out_score = model.score(X_test, y_test) 93 | print(in_score,out_score) 94 | 95 | #stackking的方式 96 | from sklearn.datasets import load_iris 97 | from sklearn.ensemble import RandomForestClassifier 98 | from sklearn.svm import LinearSVC 99 | from sklearn.linear_model import LogisticRegression 100 | from sklearn.preprocessing import StandardScaler 101 | from sklearn.pipeline import make_pipeline 102 | from sklearn.ensemble import StackingClassifier 103 | 104 | 105 | X, y = load_iris(return_X_y=True) 106 | estimators = [ 107 | ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), 108 | ('svr', make_pipeline(StandardScaler(), 109 | LinearSVC(random_state=42)))] 110 | clf = StackingClassifier( 111 | estimators=estimators, final_estimator=LogisticRegression() 112 | ) 113 | 114 | from sklearn.model_selection import train_test_split 115 | X_train, X_test, y_train, y_test = train_test_split( 116 | X, y, stratify=y, random_state=42 117 | ) 118 | clf.fit(X_train, y_train) 119 | print(clf.score(X_test, y_test)) 120 | 121 | from sklearn.ensemble import AdaBoostRegressor 122 | from sklearn.datasets import make_regression 123 | X, y = make_regression(n_features=4, n_informative=2, 124 | random_state=0, shuffle=False) 125 | regr = AdaBoostRegressor(random_state=0, n_estimators=100) 126 | regr.fit(X, y) 127 | #regr.feature_importances_ 128 | print(regr.predict([[0, 0, 0, 0]])) 129 | print(regr.score(X, y)) -------------------------------------------------------------------------------- /Code/9-2原始Adaboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import datasets 3 | 4 | 5 | digits = datasets.load_digits() 6 | 7 | class 
AdaBoost: 8 | def __init__(self, n_estimators=50, learning_rate=1.0): 9 | self.clf_num = n_estimators 10 | self.learning_rate = learning_rate 11 | 12 | def init_args(self, datasets, labels): 13 | 14 | self.X = datasets 15 | self.Y = labels 16 | self.M, self.N = datasets.shape 17 | 18 | # 弱分类器数目和集合 19 | self.clf_sets = [] 20 | 21 | # 初始化weights 22 | self.weights = [1.0 / self.M] * self.M 23 | 24 | # G(x)系数 alpha 25 | self.alpha = [] 26 | 27 | def _G(self, features, labels, weights): 28 | m = len(features) 29 | error = 100000.0 # 无穷大 30 | best_v = 0.0 31 | # 单维features 32 | features_min = min(features) 33 | features_max = max(features) 34 | n_step = (features_max - features_min + self.learning_rate) // self.learning_rate 35 | # print('n_step:{}'.format(n_step)) 36 | direct, compare_array = None, None 37 | for i in range(1, int(n_step)): 38 | v = features_min + self.learning_rate * i 39 | 40 | if v not in features: 41 | # 误分类计算 42 | compare_array_positive = np.array([1 if features[k] > v else -1 for k in range(m)]) 43 | weight_error_positive = sum([weights[k] for k in range(m) if compare_array_positive[k] != labels[k]]) 44 | 45 | compare_array_nagetive = np.array([-1 if features[k] > v else 1 for k in range(m)]) 46 | weight_error_nagetive = sum([weights[k] for k in range(m) if compare_array_nagetive[k] != labels[k]]) 47 | 48 | if weight_error_positive < weight_error_nagetive: 49 | weight_error = weight_error_positive 50 | _compare_array = compare_array_positive 51 | direct = 'positive' 52 | else: 53 | weight_error = weight_error_nagetive 54 | _compare_array = compare_array_nagetive 55 | direct = 'nagetive' 56 | 57 | # print('v:{} error:{}'.format(v, weight_error)) 58 | if weight_error < error: 59 | error = weight_error 60 | compare_array = _compare_array 61 | best_v = v 62 | return best_v, direct, error, compare_array 63 | 64 | 65 | clf = AdaBoost(n_estimators=3, learning_rate=0.5) 66 | clf.fit(X, y) 67 | clf.score(X_test, y_test) -------------------------------------------------------------------------------- /Code/9-3Adaboost与RandomForest.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | 3 | digits = datasets.load_digits() 4 | from sklearn.datasets import load_digits 5 | 6 | from sklearn.model_selection import train_test_split 7 | 8 | digits = load_digits() 9 | X, y = digits.data, digits.target 10 | 11 | 12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 13 | 14 | 15 | 16 | 17 | 18 | from sklearn.ensemble import RandomForestClassifier 19 | 20 | forest = RandomForestClassifier(n_estimators=300, max_depth=2,random_state=0) #用了三百棵树 21 | forest.fit(X_train, y_train) 22 | 23 | in_score = forest.score(X_train, y_train) 24 | out_score = forest.score(X_test, y_test) 25 | print(in_score,out_score) 26 | 27 | from sklearn.ensemble import AdaBoostClassifier 28 | clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) 29 | clf.fit(X_train, y_train) 30 | in_score = clf.score(X_train, y_train) 31 | out_score = clf.score(X_test, y_test) 32 | print(in_score,out_score) 33 | 34 | -------------------------------------------------------------------------------- /Code/9-4集成学习(酒样本).py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.model_selection import train_test_split 3 | 4 | wine= pd.read_csv("wine.csv") 5 | 6 | 7 | X_train, X_test, y_train, y_test = train_test_split(wine.iloc[:,1:], wine.iloc[:,0], test_size=0.2, random_state=0) 8 
| 9 | 10 | 11 | from sklearn.ensemble import RandomForestClassifier 12 | 13 | forest = RandomForestClassifier(n_estimators=300, max_depth=2,random_state=0) #用了三百棵树 14 | forest.fit(X_train, y_train) 15 | 16 | in_score = forest.score(X_train, y_train) 17 | out_score = forest.score(X_test, y_test) 18 | print(in_score,out_score) 19 | 20 | from sklearn.ensemble import AdaBoostClassifier 21 | clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) 22 | clf.fit(X_train, y_train) 23 | in_score = clf.score(X_train, y_train) 24 | out_score = clf.score(X_test, y_test) 25 | print(in_score,out_score) -------------------------------------------------------------------------------- /Code_2022/class10-test1.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from collections import defaultdict 5 | 6 | 7 | # euclidian distance between 2 data points. For as many data points as necessary. 8 | def euclidean_distance(a, b): 9 | return np.linalg.norm(a - b) 10 | 11 | 12 | def kmeans(data, k=3): 13 | m = data.shape[0] 14 | index = random.sample(range(m), k) 15 | mu = data[index] # 随机选择初始均值向量 16 | 17 | while True: 18 | 19 | C = defaultdict(list) 20 | 21 | for j in range(0, m): 22 | dij = [euclidean_distance(data[j], mu[i]) for i in range(k)] 23 | lambda_j = np.argmin(dij) # 选择最小的值得下标 24 | 25 | C[lambda_j].append(data[j].tolist()) 26 | 27 | new_mu = [np.mean(C[i], axis=0).tolist() for i in range(k)] 28 | 29 | if (euclidean_distance(np.array(new_mu), np.array(mu)) > 1e-9): 30 | mu = new_mu 31 | else: 32 | break 33 | 34 | return C, mu 35 | 36 | 37 | watermelon = np.array([[0.697, 0.46], 38 | [0.774, 0.376], 39 | [0.634, 0.264], 40 | [0.608, 0.318], 41 | [0.556, 0.215], 42 | [0.403, 0.237], 43 | [0.481, 0.149], 44 | [0.437, 0.211], 45 | [0.666, 0.091], 46 | [0.243, 0.267], 47 | [0.245, 0.057], 48 | [0.343, 0.099], 49 | [0.639, 0.161], 50 | [0.657, 0.198], 51 | [0.36, 0.37], 52 | [0.593, 0.042], 53 | [0.719, 0.103], 54 | [0.359, 0.188], 55 | [0.339, 0.241], 56 | [0.282, 0.257], 57 | [0.748, 0.232], 58 | [0.714, 0.346], 59 | [0.483, 0.312], 60 | [0.478, 0.437], 61 | [0.525, 0.369], 62 | [0.751, 0.489], 63 | [0.532, 0.472], 64 | [0.473, 0.376], 65 | [0.725, 0.445], 66 | [0.446, 0.459]]) 67 | 68 | k = 2 69 | res, mu = kmeans(watermelon, k) 70 | print(res) 71 | print('新的中心:', mu) 72 | 73 | for i in range(k): 74 | res_i = np.array(res[i]) 75 | plt.scatter(res_i[:, 0], res_i[:, 1]) 76 | plt.show() 77 | -------------------------------------------------------------------------------- /Code_2022/class10-test2.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | import numpy as np 4 | from scipy import linalg 5 | import matplotlib.pyplot as plt 6 | import matplotlib as mpl 7 | 8 | from sklearn import mixture 9 | from sklearn.cluster import KMeans 10 | 11 | X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]]) 12 | kmeans = KMeans(n_clusters=2, random_state=0).fit(X) 13 | print(kmeans.labels_) 14 | print(kmeans.predict([[0, 0], [4, 4]])) 15 | print(kmeans.cluster_centers_) 16 | 17 | color_iter = itertools.cycle(['navy', 'c', 'cornflowerblue', 'gold', 18 | 'darkorange']) 19 | 20 | 21 | def plot_results(X, Y_, means, covariances, index, title): 22 | splot = plt.subplot(2, 1, 1 + index) 23 | for i, (mean, covar, color) in enumerate(zip( 24 | means, covariances, color_iter)): 25 | v, w = linalg.eigh(covar) 26 | v = 2. * np.sqrt(2.) 
* np.sqrt(v) 27 | u = w[0] / linalg.norm(w[0]) 28 | # as the DP will not use every component it has access to 29 | # unless it needs it, we shouldn't plot the redundant 30 | # components. 31 | if not np.any(Y_ == i): 32 | continue 33 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 34 | 35 | # Plot an ellipse to show the Gaussian component 36 | angle = np.arctan(u[1] / u[0]) 37 | angle = 180. * angle / np.pi # convert to degrees 38 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color) 39 | ell.set_clip_box(splot.bbox) 40 | ell.set_alpha(0.5) 41 | splot.add_artist(ell) 42 | 43 | 44 | # Number of samples per component 45 | n_samples = 500 46 | 47 | # Generate random sample, two components 48 | np.random.seed(0) 49 | C = np.array([[0., -0.1], [1.7, .4]]) 50 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), 51 | .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] 52 | 53 | # Fit a Gaussian mixture with EM using five components 54 | gmm = mixture.GaussianMixture(n_components=5, covariance_type='full').fit(X) 55 | plot_results(X, gmm.predict(X), gmm.means_, gmm.covariances_, 0, 56 | 'Gaussian Mixture') 57 | 58 | # Fit a Dirichlet process Gaussian mixture using five components 59 | dpgmm = mixture.BayesianGaussianMixture(n_components=5, 60 | covariance_type='full').fit(X) 61 | plot_results(X, dpgmm.predict(X), dpgmm.means_, dpgmm.covariances_, 1, 62 | 'Bayesian Gaussian Mixture with a Dirichlet process prior') 63 | 64 | plt.show() 65 | 66 | print(gmm.weights_) 67 | print(gmm.means_) 68 | print(gmm.covariances_) 69 | -------------------------------------------------------------------------------- /Code_2022/class10-test3.py: -------------------------------------------------------------------------------- 1 | from sklearn import datasets 2 | import matplotlib.pyplot as plt 3 | from sklearn.cluster import KMeans 4 | 5 | # 加载数据集,是一个字典类似Java中的map 6 | lris_df = datasets.load_iris() 7 | 8 | # 挑选出前两个维度作为x轴和y轴,你也可以选择其他维度 9 | x_axis = lris_df.data[:, 0] 10 | y_axis = lris_df.data[:, 2] 11 | 12 | # 这里已经知道了分3类,其他分类这里的参数需要调试 13 | model = KMeans(n_clusters=3) 14 | 15 | # 训练模型 16 | model.fit(lris_df.data) 17 | 18 | # 选取行标为100的那条数据,进行预测 19 | prddicted_label = model.predict([[6.3, 3.3, 6, 2.5]]) 20 | 21 | # 预测全部150条数据 22 | all_predictions = model.predict(lris_df.data) 23 | 24 | # 打印出来对150条数据的聚类散点图 25 | plt.scatter(x_axis, y_axis, c=all_predictions) 26 | plt.show() 27 | 28 | -------------------------------------------------------------------------------- /Code_2022/class11-test1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class HiddenMarkov: 5 | def forward(self, Q, V, A, B, O, PI): # 使用前向算法 6 | N = len(Q) # 状态序列的大小 7 | M = len(O) # 观测序列的大小 8 | alphas = np.zeros((N, M)) # alpha值 9 | T = M # 有几个时刻,有几个观测序列,就有几个时刻 10 | for t in range(T): # 遍历每一时刻,算出alpha值 11 | indexOfO = V.index(O[t]) # 找出序列对应的索引 12 | for i in range(N): 13 | if t == 0: # 计算初值 14 | alphas[i][t] = PI[t][i] * B[i][indexOfO] # P176(10.15) 15 | print('alpha1(%d)=p%db%db(o1)=%f' % (i, i, i, alphas[i][t])) 16 | else: 17 | alphas[i][t] = np.dot([alpha[t - 1] for alpha in alphas], [a[i] for a in A]) * B[i][ 18 | indexOfO] # 对应P176(10.16) 19 | print('alpha%d(%d)=[sigma alpha%d(i)ai%d]b%d(o%d)=%f' % (t, i, t - 1, i, i, t, alphas[i][t])) 20 | # print(alphas) 21 | P = np.sum([alpha[M - 1] for alpha in alphas]) # P176(10.17) 22 | # alpha11 = pi[0][0] * B[0][0] #代表a1(1) 23 | # alpha12 = pi[0][1] * B[1][0] #代表a1(2) 24 | # alpha13 = pi[0][2] * B[2][0] 
#代表a1(3) 25 | 26 | def backward(self, Q, V, A, B, O, PI): # 后向算法 27 | N = len(Q) # 状态序列的大小 28 | M = len(O) # 观测序列的大小 29 | betas = np.ones((N, M)) # beta 30 | for i in range(N): 31 | print('beta%d(%d)=1' % (M, i)) 32 | for t in range(M - 2, -1, -1): 33 | indexOfO = V.index(O[t + 1]) # 找出序列对应的索引 34 | for i in range(N): 35 | betas[i][t] = np.dot(np.multiply(A[i], [b[indexOfO] for b in B]), [beta[t + 1] for beta in betas]) 36 | realT = t + 1 37 | realI = i + 1 38 | print('beta%d(%d)=[sigma a%djbj(o%d)]beta%d(j)=(' % (realT, realI, realI, realT + 1, realT + 1), 39 | end='') 40 | for j in range(N): 41 | print("%.2f*%.2f*%.2f+" % (A[i][j], B[j][indexOfO], betas[j][t + 1]), end='') 42 | print("0)=%.3f" % betas[i][t]) 43 | # print(betas) 44 | indexOfO = V.index(O[0]) 45 | P = np.dot(np.multiply(PI, [b[indexOfO] for b in B]), [beta[0] for beta in betas]) 46 | print("P(O|lambda)=", end="") 47 | for i in range(N): 48 | print("%.1f*%.1f*%.5f+" % (PI[0][i], B[i][indexOfO], betas[i][0]), end="") 49 | print("0=%f" % P) 50 | 51 | def viterbi(self, Q, V, A, B, O, PI): 52 | N = len(Q) # 状态序列的大小 53 | M = len(O) # 观测序列的大小 54 | deltas = np.zeros((N, M)) 55 | psis = np.zeros((N, M)) 56 | I = np.zeros((1, M)) 57 | for t in range(M): 58 | realT = t + 1 59 | indexOfO = V.index(O[t]) # 找出序列对应的索引 60 | for i in range(N): 61 | realI = i + 1 62 | if t == 0: 63 | deltas[i][t] = PI[0][i] * B[i][indexOfO] 64 | psis[i][t] = 0 65 | print('delta1(%d)=pi%d * b%d(o1)=%.2f * %.2f=%.2f' % ( 66 | realI, realI, realI, PI[0][i], B[i][indexOfO], deltas[i][t])) 67 | print('psis1(%d)=0' % (realI)) 68 | else: 69 | deltas[i][t] = np.max(np.multiply([delta[t - 1] for delta in deltas], [a[i] for a in A])) * B[i][ 70 | indexOfO] 71 | print('delta%d(%d)=max[delta%d(j)aj%d]b%d(o%d)=%.2f*%.2f=%.5f' % ( 72 | realT, realI, realT - 1, realI, realI, realT, 73 | np.max(np.multiply([delta[t - 1] for delta in deltas], [a[i] for a in A])), B[i][indexOfO], 74 | deltas[i][t])) 75 | psis[i][t] = np.argmax(np.multiply([delta[t - 1] for delta in deltas], [a[i] for a in A])) 76 | print('psis%d(%d)=argmax[delta%d(j)aj%d]=%d' % (realT, realI, realT - 1, realI, psis[i][t])) 77 | print(deltas) 78 | print(psis) 79 | I[0][M - 1] = np.argmax([delta[M - 1] for delta in deltas]) 80 | print('i%d=argmax[deltaT(i)]=%d' % (M, I[0][M - 1] + 1)) 81 | for t in range(M - 2, -1, -1): 82 | I[0][t] = psis[int(I[0][t + 1])][t + 1] 83 | print('i%d=psis%d(i%d)=%d' % (t + 1, t + 2, t + 2, I[0][t] + 1)) 84 | print(I) 85 | 86 | 87 | Q = [1, 2, 3] 88 | V = ['红', '白'] 89 | A = [[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]] 90 | B = [[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]] 91 | # O = ['红', '白', '红', '红', '白', '红', '白', '白'] 92 | O = ['红', '白', '红', '白'] # 习题10.1的例子 93 | PI = [[0.2, 0.4, 0.4]] 94 | 95 | HMM = HiddenMarkov() 96 | # HMM.forward(Q, V, A, B, O, PI) 97 | # HMM.backward(Q, V, A, B, O, PI) 98 | HMM.viterbi(Q, V, A, B, O, PI) 99 | print('------------------------------------------------') 100 | 101 | Q = [1, 2, 3] 102 | V = ['红', '白'] 103 | A = [[0.5, 0.2, 0.3], [0.3, 0.5, 0.2], [0.2, 0.3, 0.5]] 104 | B = [[0.5, 0.5], [0.4, 0.6], [0.7, 0.3]] 105 | O = ['红', '白', '红', '红', '白', '红', '白', '白'] 106 | PI = [[0.2, 0.3, 0.5]] 107 | 108 | HMM.forward(Q, V, A, B, O, PI) 109 | HMM.backward(Q, V, A, B, O, PI) 110 | -------------------------------------------------------------------------------- /Code_2022/class11-test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import hmmlearn.hmm as hmm 3 | 4 | states = ['盒子1', '盒子2', 
'盒子3'] 5 | obs = ['白球', '黑球'] 6 | n_states = len(states) 7 | m_obs = len(obs) 8 | 9 | model2 = hmm.MultinomialHMM(n_components=n_states, n_iter=20, tol=0.001) 10 | X2 = np.array([ 11 | [0, 1, 0, 0, 1], 12 | [0, 0, 0, 1, 1], 13 | [1, 1, 0, 1, 0], 14 | [0, 1, 0, 1, 1], 15 | [0, 0, 0, 1, 0] 16 | ]) 17 | model2.fit(X2) 18 | print("输出根据数据训练出来的π") 19 | print(model2.startprob_) 20 | print("输出根据数据训练出来的A") 21 | print(model2.transmat_) 22 | print("输出根据数据训练出来的B") 23 | print(model2.emissionprob_) 24 | 25 | status = ['盒子1', '盒子2', '盒子3'] 26 | obs = ['白球', '黑球'] 27 | n_status = len(status) 28 | m_obs = len(obs) 29 | start_probability = np.array([0.2, 0.5, 0.3]) 30 | transition_probability = np.array([ 31 | [0.5, 0.4, 0.1], 32 | [0.2, 0.2, 0.6], 33 | [0.2, 0.5, 0.3] 34 | ]) 35 | emission_probalitity = np.array([ 36 | [0.4, 0.6], 37 | [0.8, 0.2], 38 | [0.5, 0.5] 39 | ]) 40 | 41 | model = hmm.MultinomialHMM(n_components=n_status) 42 | model.startprob_ = start_probability 43 | model.transmat_ = transition_probability 44 | model.emissionprob_ = emission_probalitity 45 | 46 | se = np.array([[0, 1, 0, 0, 1]]).T 47 | logprob, box_index = model.decode(se, algorithm='viterbi') 48 | print("颜色:", end="") 49 | print(" ".join(map(lambda t: obs[t], [0, 1, 0, 0, 1]))) 50 | print("盒子:", end="") 51 | print(" ".join(map(lambda t: status[t], box_index))) 52 | print("概率值:", end="") 53 | print(np.exp(logprob)) # 这个是因为在hmmlearn底层将概率进行了对数化,防止出现乘积为0的情况 54 | 55 | status = ['盒子1', '盒子2', '盒子3'] 56 | obs = ['白球', '黑球'] 57 | n_status = len(status) 58 | m_obs = len(obs) 59 | start_probability = np.array([0.2, 0.5, 0.3]) 60 | transition_probability = np.array([ 61 | [0.5, 0.4, 0.1], 62 | [0.2, 0.2, 0.6], 63 | [0.2, 0.5, 0.3] 64 | ]) 65 | emission_probalitity = np.array([ 66 | [0.4, 0.6], 67 | [0.8, 0.2], 68 | [0.5, 0.5] 69 | ]) 70 | 71 | model = hmm.MultinomialHMM(n_components=n_status) 72 | model.startprob_ = start_probability 73 | model.transmat_ = transition_probability 74 | model.emissionprob_ = emission_probalitity 75 | 76 | # 预测问题 77 | seen = np.array([0, 1, 0]) 78 | 79 | # 观测序列的概率计算问题 80 | # score函数返回的是以自然对数为底的对数概率值 81 | # ln0.13022≈−2.0385 82 | print(model.score(seen.reshape(-1, 1))) 83 | 84 | print(np.exp(-1.81)) 85 | -------------------------------------------------------------------------------- /Code_2022/class12-test.py: -------------------------------------------------------------------------------- 1 | # Singular-value decomposition 2 | from numpy import array 3 | from scipy.linalg import svd 4 | 5 | # define a matrix 6 | A = array([[1, 2], [3, 4], [5, 6]]) 7 | print(A) 8 | # SVD 9 | U, s, VT = svd(A) 10 | print(U) 11 | print(s) 12 | print(VT) 13 | 14 | from numpy import array 15 | from numpy import diag 16 | from numpy import dot 17 | 18 | A = array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 19 | print(A) 20 | # Singular-value decomposition 21 | U, s, VT = svd(A) 22 | # create n x n Sigma matrix 23 | Sigma = diag(s) 24 | # reconstruct matrix 25 | B = U.dot(Sigma.dot(VT)) 26 | print(B) 27 | 28 | # create n x n Sigma matrix 29 | Sigma = diag(s) 30 | # reconstruct matrix 31 | B = U.dot(Sigma.dot(VT)) 32 | print(B) 33 | -------------------------------------------------------------------------------- /Code_2022/class12-test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import matplotlib.image as mpimg 4 | 5 | # import cv2 6 | import matplotlib.pyplot as plt 7 | 8 | # %matplotlib inline 9 | 10 | I = mpimg.imread('data/header.jpg') 11 | # 
Now, let's look at the size of this numpy array object img as well as plot it using imshow. 12 | print(I.shape) 13 | plt.axis('off') 14 | plt.imshow(I) 15 | plt.show() 16 | 17 | 18 | def show_img(img): 19 | plt.figure(figsize=(10, 7.5)) 20 | plt.imshow(img, cmap='gray', vmin=0, vmax=255, aspect='auto') 21 | plt.axis('off') 22 | plt.show() 23 | 24 | 25 | gray_img = I[:, :, 1] 26 | show_img(gray_img) 27 | print(gray_img.shape) 28 | U, S, V_T = np.linalg.svd(gray_img) 29 | print(U.shape, S.shape, V_T.shape) 30 | 31 | # Plot sigmas 32 | plt.figure(figsize=(9, 5)) 33 | plt.plot(np.arange(S.shape[0]), S) 34 | plt.yscale('log') 35 | plt.xlabel('Index of $\sigma$') 36 | plt.ylabel('log(value of $\sigma$)') 37 | plt.title('Singular values $\sigma_i$ vs its index') 38 | plt.show() 39 | 40 | # Plot cumsum of sigma 41 | plt.figure(figsize=(9, 5)) 42 | plt.plot(np.cumsum(S) / np.sum(S)) 43 | plt.xlabel('Index of $\sigma$') 44 | plt.ylabel('Value of $\sigma$') 45 | plt.title('Cumulative sum of $\sigma_i$ vs its index\n(Percent of explained variance)') 46 | plt.show() 47 | 48 | # Create an empty matrix to fill with sigma values (np.lialg.svd returns sigma as an array) 49 | S_full = np.zeros((U.shape[0], V_T.shape[0])) 50 | print(S_full.shape) 51 | 52 | # Populate sigma matrix 53 | S_diag = np.diag(S) 54 | print(S_diag.shape) 55 | S_full[:S_diag.shape[0], :S_diag.shape[1]] = S_diag 56 | 57 | # for i in [5, 10, 25, 50, 100, 200, U.shape[0]]: 58 | # print(str(i) + '\n') 59 | # show_img(U[:, :i].dot(S_full[:i, :i].dot(V_T[:i, :]))) 60 | # print('-' * 100 + '\n') 61 | # 62 | # print(U[:, :5]) 63 | i = 200 64 | print(U[:, :i].shape, S_full[:i, :i].shape, V_T[:i, :].shape) 65 | -------------------------------------------------------------------------------- /Code_2022/class13-test1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_iris 3 | from sklearn.decomposition import PCA 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def pca(data, n_dim): 8 | ''' 9 | pca is O(D^3) 10 | :param data: (n_samples, n_features(D)) 11 | :param n_dim: target dimensions 12 | :return: (n_samples, n_dim) 13 | ''' 14 | data = data - np.mean(data, axis=0, keepdims=True) 15 | 16 | cov = np.dot(data.T, data) 17 | 18 | eig_values, eig_vector = np.linalg.eig(cov) 19 | # print(eig_values) 20 | indexs_ = np.argsort(-eig_values)[:n_dim] 21 | picked_eig_values = eig_values[indexs_] 22 | picked_eig_vector = eig_vector[:, indexs_] 23 | data_ndim = np.dot(data, picked_eig_vector) 24 | return data_ndim 25 | 26 | 27 | data = load_iris() 28 | X = data.data 29 | Y = data.target 30 | data_2d1 = pca(X, 2) 31 | plt.figure(figsize=(8, 4)) 32 | plt.subplot(121) 33 | plt.title("my_PCA") 34 | plt.scatter(data_2d1[:, 0], data_2d1[:, 1], c=Y) 35 | plt.show() -------------------------------------------------------------------------------- /Code_2022/class13-test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.decomposition import PCA 3 | 4 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 5 | pca = PCA(n_components=2) 6 | pca.fit(X) 7 | print(pca.explained_variance_ratio_) 8 | print(pca.singular_values_) 9 | 10 | new_X = pca.fit_transform(X) 11 | print(new_X) 12 | -------------------------------------------------------------------------------- /Code_2022/class13-test3.py: -------------------------------------------------------------------------------- 1 | from 
matplotlib import pyplot as plt 2 | from sklearn.decomposition import PCA 3 | from sklearn.datasets import load_iris 4 | 5 | pca = PCA(2) 6 | print(pca) 7 | 8 | data = load_iris() 9 | X, y = data.data, data.target 10 | X_proj = pca.fit_transform(X) 11 | print(X_proj.shape) 12 | 13 | plt.scatter(X_proj[:, 0], X_proj[:, 1], c=y) 14 | plt.show() 15 | -------------------------------------------------------------------------------- /Code_2022/class13-test4.py: -------------------------------------------------------------------------------- 1 | import matplotlib.image as mpimg 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | from sklearn.decomposition import PCA 5 | 6 | img = mpimg.imread('data/header.jpg') 7 | # Now, let's look at the size of this numpy array object img as well as plot it using imshow. 8 | print(img.shape) 9 | plt.axis('off') 10 | plt.imshow(img) 11 | plt.show() 12 | 13 | img_r = np.reshape(img, (800, 3600)) 14 | print(img_r.shape) 15 | 16 | ipca = PCA(64).fit(img_r) 17 | img_c = ipca.transform(img_r) 18 | print(img_c.shape) 19 | print(np.sum(ipca.explained_variance_ratio_)) 20 | 21 | print(ipca) 22 | 23 | # OK, now to visualize how PCA has performed this compression, let's inverse transform the PCA output and 24 | # reshape for visualization using imshow. 25 | temp = ipca.inverse_transform(img_c) 26 | print(temp.shape) 27 | # reshaping 2988 back to the original 996 * 3 28 | temp = np.reshape(temp, (800, 1200, 3)) 29 | print(temp.shape) 30 | 31 | plt.axis('off') 32 | plt.imshow(temp) 33 | plt.imshow(temp.astype('uint8')) 34 | plt.show() 35 | -------------------------------------------------------------------------------- /Code_2022/class13-test5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_digits 3 | from sklearn.decomposition import PCA 4 | 5 | digits = load_digits() 6 | print(digits.keys()) 7 | 8 | # looking at data, there looks to be 64 features, what are these? 9 | print(digits.data.shape) 10 | # another available dataset is called images. Let's check this out. 
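# (digits.data holds the same pixels flattened to shape (n_samples, 64);
#  digits.images keeps each sample as its original 8x8 grid, which is what matshow below expects.)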
11 | print(digits.images.shape) 12 | 13 | import matplotlib.pyplot as plt 14 | 15 | plt.gray() 16 | plt.matshow(digits.images[0]) 17 | plt.show() 18 | 19 | X, y = digits.data, digits.target 20 | pca_digits = PCA(0.95) 21 | X_proj = pca_digits.fit_transform(X) 22 | print(X.shape, X_proj.shape) 23 | 24 | # Let's run PCA with 2 components so as to plot the data in 2D 25 | pca_digits = PCA(2) 26 | X_proj = pca_digits.fit_transform(X) 27 | print(np.sum(pca_digits.explained_variance_ratio_)) 28 | # Note we only retain about 28% of the variance by choosing 2 components 29 | 30 | print(X_proj.shape) 31 | 32 | # Let's plot the principal components as a scatter plot 33 | plt.scatter(X_proj[:, 0], X_proj[:, 1], c=y) 34 | plt.colorbar() 35 | plt.show() 36 | 37 | pca_digits = PCA(64).fit(X) 38 | plt.semilogx(np.cumsum(pca_digits.explained_variance_ratio_)) 39 | plt.xlabel('Number of Components') 40 | plt.ylabel('Variance retained') 41 | plt.ylim(0, 1) 42 | plt.show() 43 | -------------------------------------------------------------------------------- /Code_2022/class13-test6.py: -------------------------------------------------------------------------------- 1 | # SparsePCA 2 | import numpy as np 3 | from sklearn.datasets import make_friedman1, load_digits 4 | from sklearn.decomposition import SparsePCA 5 | 6 | X, _ = load_digits(return_X_y=True) 7 | transformer = SparsePCA(n_components=5, random_state=0) 8 | transformer.fit(X) 9 | X_transformed = transformer.transform(X) 10 | 11 | print(X_transformed.shape) 12 | 13 | # KernelPCA 14 | from sklearn.datasets import load_digits 15 | from sklearn.decomposition import KernelPCA 16 | 17 | X, y = load_digits(return_X_y=True) 18 | print(X.shape) 19 | transformer = KernelPCA(n_components=7, kernel='linear') 20 | X_transformed = transformer.fit_transform(X) 21 | print(X_transformed.shape) 22 | 23 | # Isomap 24 | from sklearn.manifold import Isomap 25 | 26 | isomap = Isomap(n_components=2, n_neighbors=5) 27 | new_X_isomap = isomap.fit_transform(X) 28 | print(new_X_isomap.shape) 29 | 30 | from sklearn.manifold import TSNE 31 | 32 | X_embedded = TSNE(n_components=2).fit_transform(X) 33 | print(X_embedded.shape) 34 | -------------------------------------------------------------------------------- /Code_2022/class13-test7.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from functools import partial 3 | from time import time 4 | 5 | import matplotlib.pyplot as plt 6 | from mpl_toolkits.mplot3d import Axes3D 7 | from matplotlib.ticker import NullFormatter 8 | 9 | from sklearn import manifold, datasets 10 | 11 | # Next line to silence pyflakes. This import is needed. 
12 | print(Axes3D) 13 | 14 | n_points = 1000 15 | X, color = datasets.make_s_curve(n_points, random_state=0) 16 | n_neighbors = 10 17 | n_components = 2 18 | 19 | # Create figure 20 | fig = plt.figure(figsize=(15, 8)) 21 | fig.suptitle("Manifold Learning with %i points, %i neighbors" 22 | % (1000, n_neighbors), fontsize=14) 23 | 24 | # Add 3d scatter plot 25 | ax = fig.add_subplot(251, projection='3d') 26 | ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral) 27 | ax.view_init(4, -72) 28 | 29 | # Set-up manifold methods 30 | LLE = partial(manifold.LocallyLinearEmbedding, 31 | n_neighbors, n_components, eigen_solver='auto') 32 | 33 | methods = OrderedDict() 34 | methods['LLE'] = LLE(method='standard') 35 | methods['LTSA'] = LLE(method='ltsa') 36 | methods['Hessian LLE'] = LLE(method='hessian') 37 | methods['Modified LLE'] = LLE(method='modified') 38 | methods['Isomap'] = manifold.Isomap(n_neighbors, n_components) 39 | methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1) 40 | methods['SE'] = manifold.SpectralEmbedding(n_components=n_components, 41 | n_neighbors=n_neighbors) 42 | methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca', 43 | random_state=0) 44 | 45 | # Plot results 46 | for i, (label, method) in enumerate(methods.items()): 47 | t0 = time() 48 | Y = method.fit_transform(X) 49 | t1 = time() 50 | print("%s: %.2g sec" % (label, t1 - t0)) 51 | ax = fig.add_subplot(2, 5, 2 + i + (i > 3)) 52 | ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral) 53 | ax.set_title("%s (%.2g sec)" % (label, t1 - t0)) 54 | ax.xaxis.set_major_formatter(NullFormatter()) 55 | ax.yaxis.set_major_formatter(NullFormatter()) 56 | ax.axis('tight') 57 | 58 | plt.show() 59 | -------------------------------------------------------------------------------- /Code_2022/class14-test1.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | from gensim import corpora 3 | from pprint import pprint 4 | 5 | # How to create a dictionary from a list of sentences? 
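# corpora.Dictionary assigns each unique token an integer id (see token2id below);
# doc2bow then turns a tokenized document into sparse (token_id, count) pairs, which is
# the bag-of-words corpus that TfidfModel consumes further down.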
6 | 7 | documents = ["The Saudis are preparing a report that will acknowledge that", 8 | "Saudi journalist Jamal Khashoggi's death was the result of an", 9 | "interrogation that went wrong, one that was intended to lead", 10 | "to his abduction from Turkey, according to two sources."] 11 | 12 | # Tokenize(split) the sentences into words 13 | texts = [[text for text in doc.split()] for doc in documents] 14 | 15 | # Create dictionary 16 | dictionary = corpora.Dictionary(texts) 17 | 18 | # Get information about the dictionary 19 | print(dictionary) 20 | print(dictionary.token2id) 21 | 22 | documents_2 = ["The intersection graph of paths in trees", 23 | "Graph minors IV Widths of trees and well quasi ordering", 24 | "Graph minors A survey"] 25 | 26 | texts_2 = [[text for text in doc.split()] for doc in documents_2] 27 | 28 | dictionary.add_documents(texts_2) 29 | 30 | print(dictionary.token2id) 31 | 32 | new_corpus = [dictionary.doc2bow(text) for text in texts] 33 | 34 | print(new_corpus) 35 | 36 | from gensim import models 37 | 38 | tfidf = models.TfidfModel(new_corpus) 39 | 40 | corpus_tfidf = tfidf[new_corpus] 41 | print(corpus_tfidf) 42 | 43 | for i in range(len(corpus_tfidf)): 44 | print(corpus_tfidf[i]) 45 | 46 | string = 'the i first second name' 47 | string_bow = dictionary.doc2bow(string.lower().split()) 48 | string_tfidf = tfidf[string_bow] 49 | print(string_bow) 50 | print(string_tfidf) -------------------------------------------------------------------------------- /Code_2022/class14-test2.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | from gensim.models import Word2Vec 3 | 4 | # define training data 5 | sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'], 6 | ['this', 'is', 'the', 'second', 'sentence'], 7 | ['yet', 'another', 'sentence'], 8 | ['one', 'more', 'sentence'], 9 | ['and', 'the', 'final', 'sentence']] 10 | 11 | # train model 12 | model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4) 13 | # model.build_vocab(sentences, update=True) 14 | # model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) 15 | 16 | print(model) 17 | vector = list(model.wv['first']) 18 | print(vector) 19 | 20 | sims = model.wv.most_similar('first', topn=10) # get other similar words 21 | print(sims) 22 | 23 | word_vectors = model.wv 24 | print(word_vectors) 25 | 26 | for index, word in enumerate(model.wv.index_to_key): 27 | if index == 10: 28 | break 29 | print(f"word #{index}/{len(model.wv.index_to_key)} is {word}") 30 | 31 | similarity = word_vectors.similarity('first', 'second') 32 | print(similarity) 33 | 34 | result = word_vectors.similar_by_word("first") 35 | print(result) 36 | 37 | _idx = model.wv.key_to_index["first"] 38 | print(_idx) 39 | -------------------------------------------------------------------------------- /Code_2022/class14-test3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | ''' 4 | @author: MrYx 5 | @github: https://github.com/MrYxJ 6 | ''' 7 | 8 | import jieba 9 | from gensim.models import word2vec 10 | import re 11 | 12 | with open('../class/data/三国演义.txt') as f: 13 | document = f.read() 14 | document = re.sub('[,。?!:;、“”]+', ' ', document) # 去标点 15 | document_cut = jieba.cut(document) # 结巴分词 16 | result = ' '.join(document_cut) 17 | with open('1.txt', 'w') as f2: 18 | f2.write(result) 19 | 20 | sentences = word2vec.LineSentence('1.txt') 21 | model 
= word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3) 22 | 23 | s1 = model.wv.most_similar('曹操') 24 | s2 = model.wv.most_similar('玄德') 25 | 26 | 27 | def show(s, name): 28 | print(name + ':', end=' ') 29 | for i in s: 30 | print(i[0], end=' ') 31 | print() 32 | 33 | 34 | show(s1, '曹操') 35 | show(s2, '玄德') 36 | -------------------------------------------------------------------------------- /Code_2022/class15-test1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | tqdm.pandas(desc="progress-bar") 6 | from gensim.models import Doc2Vec 7 | from sklearn import utils 8 | from sklearn.model_selection import train_test_split 9 | import gensim 10 | from sklearn.linear_model import LogisticRegression 11 | from gensim.models.doc2vec import TaggedDocument 12 | import re 13 | import seaborn as sns 14 | import matplotlib.pyplot as plt 15 | 16 | df = pd.read_csv('data/Consumer_Complaints.csv') 17 | df = df[['Consumer Complaint', 'Product']] 18 | df = df[pd.notnull(df['Consumer Complaint'])] 19 | print(df.head(10)) 20 | print(df.shape) 21 | print(df.isnull().sum()) 22 | 23 | cnt_pro = df['Product'].value_counts() 24 | 25 | plt.figure(figsize=(12, 4)) 26 | sns.barplot(cnt_pro.index, cnt_pro.values, alpha=0.8) 27 | plt.ylabel('Number of Occurrences', fontsize=12) 28 | plt.xlabel('Product', fontsize=12) 29 | plt.xticks(rotation=90) 30 | plt.show() 31 | 32 | df.rename(columns={'Consumer Complaint': 'narrative'}, inplace=True) 33 | 34 | from gensim.models import doc2vec 35 | 36 | 37 | def label_sentences(corpus, label_type): 38 | """ 39 | Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it. 40 | We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is 41 | a dummy index of the complaint narrative. 
42 | """ 43 | labeled = [] 44 | for i, v in enumerate(corpus): 45 | label = label_type + '_' + str(i) 46 | labeled.append(doc2vec.TaggedDocument(v.split(), [label])) 47 | return labeled 48 | 49 | 50 | X_train, X_test, y_train, y_test = train_test_split(df.narrative, df.Product, random_state=0, test_size=0.3) 51 | X_train = label_sentences(X_train, 'Train') 52 | X_test = label_sentences(X_test, 'Test') 53 | all_data = X_train + X_test 54 | 55 | # print(all_data[:2]) 56 | 57 | model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065) 58 | model_dbow.build_vocab([x for x in tqdm(all_data)]) 59 | 60 | # %%time 61 | for epoch in range(2): 62 | model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1) 63 | model_dbow.alpha -= 0.002 64 | model_dbow.min_alpha = model_dbow.alpha 65 | 66 | 67 | # %%time 68 | def get_vectors(model, corpus_size, vectors_size, vectors_type): 69 | """ 70 | Get vectors from trained doc2vec model 71 | :param doc2vec_model: Trained Doc2Vec model 72 | :param corpus_size: Size of the data 73 | :param vectors_size: Size of the embedding vectors 74 | :param vectors_type: Training or Testing vectors 75 | :return: list of vectors 76 | """ 77 | vectors = np.zeros((corpus_size, vectors_size)) 78 | for i in range(0, corpus_size): 79 | prefix = vectors_type + '_' + str(i) 80 | vectors[i] = model.dv[prefix] 81 | return vectors 82 | 83 | 84 | train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train') 85 | test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test') 86 | 87 | print(len(train_vectors_dbow)) 88 | print(len(test_vectors_dbow)) 89 | # %%time 90 | from sklearn.linear_model import LogisticRegression 91 | 92 | logreg = LogisticRegression() # multi_class='multinomial', solver = 'lbfgs') 93 | logreg.fit(train_vectors_dbow, y_train) 94 | print(logreg.score(test_vectors_dbow, y_test)) 95 | # %%time 96 | new_doc = model_dbow.infer_vector(['violent', 'means', 'to', 'destroy', 'the', 'organization']) 97 | print(new_doc.shape) 98 | -------------------------------------------------------------------------------- /Code_2022/class15-test2.py: -------------------------------------------------------------------------------- 1 | from gensim import corpora, models, similarities 2 | from pprint import pprint 3 | import warnings 4 | 5 | f = open('data/LDA_test.txt') 6 | stop_list = set('for a of the and to in'.split()) 7 | 8 | texts = [[ 9 | word for word in line.strip().lower().split() if word not in stop_list 10 | ] for line in f] 11 | print('Text = ') 12 | pprint(texts) 13 | 14 | dictionary = corpora.Dictionary(texts) 15 | print(dictionary) 16 | 17 | V = len(dictionary) 18 | corpus = [dictionary.doc2bow(text) for text in texts] 19 | corpus_tfidf = models.TfidfModel(corpus)[corpus] 20 | corpus_tfidf = corpus 21 | 22 | print('\nTF-IDF:') 23 | for c in corpus_tfidf: 24 | print(c) 25 | 26 | print('\nLSI Model:') 27 | lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary) 28 | topic_result = [a for a in lsi[corpus_tfidf]] 29 | pprint(topic_result) 30 | 31 | print('\nLSI Topics:') 32 | pprint(lsi.print_topics(num_topics=2, num_words=5)) 33 | 34 | print('\nLDA Model:') 35 | num_topics = 2 36 | lda = models.LdaModel( 37 | corpus_tfidf, 38 | num_topics=num_topics, 39 | id2word=dictionary, 40 | alpha='auto', 41 | eta='auto', 42 | minimum_probability=0.001, 43 | passes=10) 44 | doc_topic = [doc_t for doc_t in lda[corpus_tfidf]] 45 | print('Document-Topic:') 46 | pprint(doc_topic) 
47 | -------------------------------------------------------------------------------- /Code_2022/class2-test1.py: -------------------------------------------------------------------------------- 1 | # Importing libraries 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | from sklearn.linear_model import LinearRegression 6 | 7 | # driver code 8 | # Create dataset 9 | 10 | X = np.array([[1], [2], [3], [4], [5], [6], [7]]) 11 | Y = np.array([45000, 50000, 60000, 80000, 110000, 150000, 200000]) 12 | 13 | # Model training 14 | 15 | model = LinearRegression() 16 | model.fit(X, Y) 17 | 18 | # Prediction 19 | Y_pred = model.predict(X) 20 | 21 | print(model.coef_, model.intercept_) 22 | # Visualization 23 | plt.scatter(X, Y, color='blue') 24 | plt.plot(X, Y_pred, color='orange') 25 | plt.title('X vs Y') 26 | plt.xlabel('X') 27 | plt.ylabel('Y') 28 | plt.show() -------------------------------------------------------------------------------- /Code_2022/class2-test2.py: -------------------------------------------------------------------------------- 1 | from sklearn import linear_model 2 | from sklearn.linear_model import LinearRegression 3 | 4 | reg = linear_model.LinearRegression() 5 | reg.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) 6 | print(reg.coef_) 7 | 8 | from sklearn.preprocessing import PolynomialFeatures 9 | from sklearn.pipeline import Pipeline 10 | 11 | polynomial_features = PolynomialFeatures(degree=3, 12 | include_bias=False) 13 | linear_regression = LinearRegression() 14 | pipeline = Pipeline([("polynomial_features", polynomial_features), 15 | ("linear_regression", linear_regression)]) 16 | pipeline.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) 17 | print(linear_regression.coef_, linear_regression.intercept_) 18 | -------------------------------------------------------------------------------- /Code_2022/class2-test3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.model_selection import cross_val_score 7 | 8 | 9 | def true_fun(X): 10 | return np.cos(1.5 * np.pi * X) 11 | 12 | np.random.seed(0) 13 | 14 | n_samples = 30 15 | degrees = [1, 4, 15, 17] 16 | 17 | X = np.sort(np.random.rand(n_samples)) 18 | y = true_fun(X) + np.random.randn(n_samples) * 0.1 # 加入随机噪声 19 | 20 | plt.figure(figsize=(14, 5)) 21 | for i in range(len(degrees)): 22 | ax = plt.subplot(1, len(degrees), i + 1) # 确认行列 23 | plt.setp(ax, xticks=(), yticks=()) 24 | 25 | polynomial_features = PolynomialFeatures(degree=degrees[i], 26 | include_bias=False) 27 | # 建模,组装,拟合 28 | linear_regression = LinearRegression() 29 | pipeline = Pipeline([("polynomial_features", polynomial_features), 30 | ("linear_regression", linear_regression)]) 31 | pipeline.fit(X[:, np.newaxis], y) 32 | 33 | # Evaluate the models using crossvalidation 评分 34 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 35 | scoring="neg_mean_squared_error", cv=10) 36 | 37 | X_test = np.linspace(0, 1, 100) 38 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 39 | plt.plot(X_test, true_fun(X_test), label="True function") 40 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 41 | plt.xlabel("x") 42 | plt.ylabel("y") 43 | plt.xlim((0, 1)) 44 | plt.ylim((-2, 2)) 45 | plt.legend(loc="best") 46 | plt.title("Degree {}\nMSE = {:.2e}(+/- 
{:.2e})".format( 47 | degrees[i], -scores.mean(), scores.std())) 48 | plt.show() 49 | 50 | print(X) 51 | print(y) -------------------------------------------------------------------------------- /Code_2022/class2-test4.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from sklearn.linear_model import LinearRegression 6 | from sklearn.model_selection import cross_val_score 7 | 8 | 9 | def true_fun(X): 10 | return np.cos(1.5 * np.pi * X) 11 | 12 | 13 | np.random.seed(0) 14 | 15 | n_samples = 30 16 | degrees = [1, 4, 15] 17 | 18 | X = np.sort(np.random.rand(n_samples)) 19 | y = true_fun(X) + np.random.randn(n_samples) * 0.1 # 加入随机噪声 20 | 21 | plt.figure(figsize=(14, 5)) 22 | for i in range(len(degrees)): 23 | ax = plt.subplot(1, len(degrees), i + 1) # 确认行列 24 | plt.setp(ax, xticks=(), yticks=()) 25 | 26 | polynomial_features = PolynomialFeatures(degree=degrees[i], 27 | include_bias=False) 28 | # 建模,组装,拟合 29 | linear_regression = LinearRegression() 30 | pipeline = Pipeline([("polynomial_features", polynomial_features), 31 | ("linear_regression", linear_regression)]) 32 | pipeline.fit(X[:, np.newaxis], y) 33 | 34 | # Evaluate the models using crossvalidation 评分 35 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 36 | scoring="neg_mean_squared_error", cv=10) 37 | 38 | X_test = np.linspace(0, 1, 100) 39 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 40 | plt.plot(X_test, true_fun(X_test), label="True function") 41 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 42 | plt.xlabel("x") 43 | plt.ylabel("y") 44 | plt.xlim((0, 1)) 45 | plt.ylim((-2, 2)) 46 | plt.legend(loc="best") 47 | plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format( 48 | degrees[i], -scores.mean(), scores.std())) 49 | plt.show() 50 | 51 | print(X) 52 | print(y) 53 | 54 | from sklearn.linear_model import Lasso 55 | from sklearn.linear_model import Ridge 56 | 57 | # pipeline = Ridge(alpha = 0.5) 58 | # pipeline.fit(X[:, np.newaxis], y) 59 | from sklearn.model_selection import cross_val_score 60 | from sklearn.pipeline import Pipeline 61 | from sklearn.preprocessing import PolynomialFeatures 62 | 63 | polynomial_features = PolynomialFeatures(degree=15, # 加入岭回归,避免15次时过拟合 64 | include_bias=False) 65 | linear_regression = Ridge(alpha=0.01) 66 | 67 | pipeline = Pipeline([("polynomial_features", polynomial_features), 68 | ("linear_regression", linear_regression)]) 69 | pipeline.fit(X[:, np.newaxis], y) 70 | 71 | # Evaluate the models using crossvalidation 72 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 73 | scoring="neg_mean_squared_error", cv=10) 74 | 75 | X_test = np.linspace(0, 1, 100) 76 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 77 | plt.plot(X_test, true_fun(X_test), label="True function") 78 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 79 | plt.xlabel("x") 80 | plt.ylabel("y") 81 | plt.xlim((0, 1)) 82 | plt.ylim((-2, 2)) 83 | plt.legend(loc="best") 84 | plt.show() -------------------------------------------------------------------------------- /Code_2022/class2-test5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.pipeline import Pipeline 4 | from sklearn.preprocessing import PolynomialFeatures 5 | from 
sklearn.linear_model import LinearRegression 6 | from sklearn.model_selection import cross_val_score 7 | 8 | 9 | def true_fun(X): 10 | return np.cos(1.5 * np.pi * X) 11 | 12 | 13 | np.random.seed(0) 14 | 15 | n_samples = 30 16 | degrees = [1, 4, 15] 17 | 18 | X = np.sort(np.random.rand(n_samples)) 19 | y = true_fun(X) + np.random.randn(n_samples) * 0.1 # 加入随机噪声 20 | 21 | plt.figure(figsize=(14, 5)) 22 | for i in range(len(degrees)): 23 | ax = plt.subplot(1, len(degrees), i + 1) # 确认行列 24 | plt.setp(ax, xticks=(), yticks=()) 25 | 26 | polynomial_features = PolynomialFeatures(degree=degrees[i], 27 | include_bias=False) 28 | # 建模,组装,拟合 29 | linear_regression = LinearRegression() 30 | pipeline = Pipeline([("polynomial_features", polynomial_features), 31 | ("linear_regression", linear_regression)]) 32 | pipeline.fit(X[:, np.newaxis], y) 33 | 34 | # Evaluate the models using crossvalidation 评分 35 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 36 | scoring="neg_mean_squared_error", cv=10) 37 | 38 | X_test = np.linspace(0, 1, 100) 39 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 40 | plt.plot(X_test, true_fun(X_test), label="True function") 41 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 42 | plt.xlabel("x") 43 | plt.ylabel("y") 44 | plt.xlim((0, 1)) 45 | plt.ylim((-2, 2)) 46 | plt.legend(loc="best") 47 | plt.title("Degree {}\nMSE = {:.2e}(+/- {:.2e})".format( 48 | degrees[i], -scores.mean(), scores.std())) 49 | plt.show() 50 | 51 | print(X) 52 | print(y) 53 | 54 | from sklearn.linear_model import Lasso 55 | from sklearn.linear_model import Ridge 56 | 57 | # pipeline = Ridge(alpha = 0.5) 58 | # pipeline.fit(X[:, np.newaxis], y) 59 | from sklearn.model_selection import cross_val_score 60 | from sklearn.pipeline import Pipeline 61 | from sklearn.preprocessing import PolynomialFeatures 62 | 63 | polynomial_features = PolynomialFeatures(degree=15, # 加入岭回归,避免15次时过拟合 64 | include_bias=False) 65 | # linear_regression = Ridge(alpha = 0.01) # 替换 66 | linear_regression = Lasso(alpha=0.01) 67 | 68 | pipeline = Pipeline([("polynomial_features", polynomial_features), 69 | ("linear_regression", linear_regression)]) 70 | pipeline.fit(X[:, np.newaxis], y) 71 | 72 | # Evaluate the models using crossvalidation 73 | scores = cross_val_score(pipeline, X[:, np.newaxis], y, 74 | scoring="neg_mean_squared_error", cv=10) 75 | 76 | X_test = np.linspace(0, 1, 100) 77 | plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model") 78 | plt.plot(X_test, true_fun(X_test), label="True function") 79 | plt.scatter(X, y, edgecolor='b', s=20, label="Samples") 80 | plt.xlabel("x") 81 | plt.ylabel("y") 82 | plt.xlim((0, 1)) 83 | plt.ylim((-2, 2)) 84 | plt.legend(loc="best") 85 | plt.show() 86 | -------------------------------------------------------------------------------- /Code_2022/class3-test1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | 6 | # load data 7 | iris = load_iris() 8 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 9 | df['label'] = iris.target 10 | 11 | df.head(5) 12 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 13 | df.label.value_counts() 14 | 15 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 16 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 17 | plt.xlabel('sepal length') 18 
| plt.ylabel('sepal width') 19 | plt.legend() 20 | plt.show() 21 | 22 | data = np.array(df.iloc[:100, [0, 1, -1]]) 23 | print(data[:10, :]) 24 | X, y = data[:, :-1], data[:, -1] 25 | print(X[:10, :]) 26 | y = np.array([1 if i == 1 else -1 for i in y]) 27 | 28 | 29 | class PLA: 30 | def __init__(self, max_iter=1000, shuffle=False): 31 | self.b = 0 32 | self.lr = 0.1 33 | self.max_iter = max_iter 34 | self.iter = 0 35 | self.shuffle = shuffle 36 | 37 | def sign(self, x, w, b): 38 | return np.dot(x, w) + b 39 | 40 | def fit(self, X, y): 41 | N, M = X.shape 42 | self.w = np.ones(M) 43 | for n in range(self.max_iter): 44 | self.iter = n 45 | wrong_items = 0 46 | if self.shuffle: # 每次迭代,是否打乱 47 | idx = np.random.permutation(range(N)) 48 | X, y = X[idx], y[idx] 49 | for i in range(N): 50 | if y[i] * self.sign(X[i], self.w, self.b) <= 0: 51 | self.w += self.lr * np.dot(y[i], X[i]) 52 | self.b += self.lr * y[i] 53 | wrong_items += 1 54 | if wrong_items == 0: 55 | print("finished at iters: {}, w: {}, b: {}".format(self.iter, self.w, self.b)) 56 | return 57 | print("finished for reaching the max_iter: {}, w: {}, b: {}".format(self.max_iter, self.w, self.b)) 58 | perceptron1 = PLA() 59 | perceptron1.fit(X, y) 60 | 61 | 62 | def plot(model, tilte): 63 | x_points = np.linspace(4, 7, 10) 64 | y_ = -(model.w[0] * x_points + model.b) / model.w[1] 65 | plt.plot(x_points, y_) 66 | print(y_) 67 | 68 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='-1') 69 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 70 | plt.xlabel('sepal length') 71 | plt.ylabel('sepal width') 72 | plt.title(tilte) 73 | plt.legend() 74 | 75 | 76 | perceptron1 = PLA() 77 | perceptron1.fit(X, y) 78 | plot(perceptron1, 'PLA_dual') 79 | plt.show() 80 | 81 | #################################################### 82 | from sklearn.linear_model import Perceptron 83 | from sklearn.model_selection import train_test_split 84 | 85 | # import numpy as np 86 | # import matplotlib.pyplot as plt 87 | iris = load_iris() 88 | X = iris.data 89 | Y = iris.target 90 | 91 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 92 | 93 | # df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 94 | df['label'] = iris.target 95 | 96 | # 97 | data = np.array(df.iloc[:100, [0, 1, -1]]) 98 | 99 | x, y = data[:, :-1], data[:, -1] 100 | 101 | # print(data) 102 | X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.9) 103 | 104 | clf = Perceptron(tol=1e-3, random_state=0, max_iter=1000) 105 | 106 | clf.fit(X_train, y_train) 107 | 108 | print(clf.coef_) 109 | 110 | print(clf.intercept_) 111 | 112 | x_ponits = np.arange(4, 8) 113 | y_ = -(clf.coef_[0][0] * x_ponits + clf.intercept_) / clf.coef_[0][1] 114 | plt.plot(x_ponits, y_) 115 | 116 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='0') 117 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 118 | plt.xlabel('sepal length') 119 | plt.ylabel('sepal width') 120 | plt.legend() 121 | plt.show() 122 | 123 | ################################################### 124 | from sklearn.metrics import plot_confusion_matrix 125 | 126 | disp = plot_confusion_matrix(clf, X_train, y_train) 127 | disp.figure_.suptitle("Confusion Matrix") 128 | print("Confusion matrix:\n%s" % disp.confusion_matrix) 129 | ################################################### 130 | 131 | from sklearn.metrics import plot_precision_recall_curve 132 | 133 | pr = plot_precision_recall_curve(clf, X_test, y_test) 134 | from 
sklearn.metrics import plot_roc_curve 135 | 136 | roc = plot_roc_curve(clf, X_test, y_test) 137 | -------------------------------------------------------------------------------- /Code_2022/class4-test1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.datasets import load_iris 4 | import matplotlib.pyplot as plt 5 | 6 | # load data 7 | iris = load_iris() 8 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 9 | df['label'] = iris.target 10 | 11 | df.head(5) 12 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 13 | df.label.value_counts() 14 | 15 | plt.scatter(df[:50]['sepal length'], df[:50]['sepal width'], label='0') 16 | plt.scatter(df[50:100]['sepal length'], df[50:100]['sepal width'], label='1') 17 | plt.xlabel('sepal length') 18 | plt.ylabel('sepal width') 19 | plt.legend() 20 | plt.show() 21 | 22 | data = np.array(df.iloc[:100, [0, 1, -1]]) 23 | print(data[:10, :]) 24 | X, y = data[:, :-1], data[:, -1] 25 | print(X[:10, :]) 26 | y = np.array([1 if i == 1 else -1 for i in y]) 27 | 28 | 29 | class PLA_dual: 30 | def __init__(self, max_iter=1000): 31 | self.b = 0 32 | self.lr = 0.1 33 | self.max_iter = max_iter 34 | self.iter = 0 35 | 36 | def cal_w(self, X): 37 | w = 0 38 | for i in range(len(self.alpha)): 39 | w += self.alpha[i] * y[i] * X[i] 40 | return w 41 | 42 | def gram_matrix(self, X): 43 | return np.dot(X, X.T) 44 | 45 | def fit(self, X, y): 46 | N, M = X.shape 47 | self.alpha = np.zeros(N) 48 | gram = self.gram_matrix(X) 49 | for n in range(self.max_iter): 50 | self.iter = n 51 | wrong_items = 0 52 | for i in range(N): 53 | tmp = 0 54 | for j in range(N): 55 | tmp += self.alpha[j] * y[j] * gram[i, j] 56 | tmp += self.b 57 | if y[i] * tmp <= 0: 58 | self.alpha[i] += self.lr 59 | self.b += self.lr * y[i] 60 | wrong_items += 1 61 | if wrong_items == 0: 62 | self.w = self.cal_w(X) 63 | print("finished at iters: {}, w: {}, b: {}".format(self.iter, self.w, self.b)) 64 | return 65 | self.w = self.cal_w(X) 66 | print("finished for reaching the max_iter: {}, w: {}, b: {}".format(self.max_iter, self.w, self.b)) 67 | return 68 | 69 | 70 | perceptron3 = PLA_dual() 71 | perceptron3.fit(X, y) 72 | 73 | 74 | def plot(model, tilte): 75 | x_points = np.linspace(4, 7, 10) 76 | y_ = -(model.w[0] * x_points + model.b) / model.w[1] 77 | plt.plot(x_points, y_) 78 | print(y_) 79 | 80 | plt.plot(data[:50, 0], data[:50, 1], 'bo', color='blue', label='-1') 81 | plt.plot(data[50:100, 0], data[50:100, 1], 'bo', color='orange', label='1') 82 | plt.xlabel('sepal length') 83 | plt.ylabel('sepal width') 84 | plt.title(tilte) 85 | plt.legend() 86 | plt.show() 87 | 88 | 89 | plot(perceptron3, 'PLA_dual') 90 | 91 | from sklearn.datasets import load_iris 92 | from sklearn.model_selection import train_test_split 93 | from sklearn.neural_network import MLPClassifier 94 | 95 | iris = load_iris() 96 | X = iris.data 97 | Y = iris.target 98 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3) 99 | 100 | # clf=MLPClassifier(activation='logistic',max_iter=1000)# 构造分类器实例 101 | clf = MLPClassifier(solver='sgd', alpha=1e-5, 102 | hidden_layer_sizes=(20, 20, 20), random_state=1, max_iter=10000) 103 | # 4*20*20*20个参数 104 | 105 | clf.fit(X_train, y_train) # 训练分类器 106 | print(clf.score(X_test, y_test)) # 查看在训练集上的评价预测精度 107 | -------------------------------------------------------------------------------- /Code_2022/class4-test2.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | r = 1 6 | 7 | linestyle = ['b-', 'k-', 'm-', 'r-', 'y-'] 8 | p_values = (0.25, 0.5, 1, 2, 4, 100) 9 | 10 | for i, p in enumerate(p_values): 11 | x = np.arange(-r, r + 1e-5, 1 / 128.0) 12 | y = (r ** p - (abs(x) ** p)) ** (1.0 / p) 13 | plt.plot(x, y, x, -y) 14 | 15 | ax = plt.gca() 16 | ax.set_aspect(1) 17 | plt.show() 18 | 19 | ##### 20 | X = [[0], [1], [2], [3]] 21 | y = [0, 0, 1, 1] 22 | from sklearn.neighbors import KNeighborsClassifier 23 | neigh = KNeighborsClassifier(n_neighbors=3) 24 | neigh.fit(X, y) 25 | 26 | print(neigh.predict([[1.1]])) 27 | print(neigh.predict_proba([[0.9]])) 28 | -------------------------------------------------------------------------------- /Code_2022/class4-test3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.colors import ListedColormap 4 | from sklearn import neighbors, datasets 5 | 6 | irisData = datasets.load_iris() 7 | 8 | X = irisData.data[:, :4] 9 | y = irisData.target 10 | 11 | weights = 'uniform' 12 | n_neighbors=15 13 | # we create an instance of Neighbours Classifier and fit the data. 14 | classifier = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) 15 | classifier.fit(X, y) 16 | 17 | print('KNN classifier accuracy - "%s" - %.3f' % (weights, classifier.score(X, y))) 18 | -------------------------------------------------------------------------------- /Code_2022/class5-test1.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | class NaiveBayes: 5 | def __init__(self): 6 | self.model = None 7 | 8 | # 数学期望 9 | @staticmethod 10 | def mean(X): 11 | return sum(X) / float(len(X)) 12 | 13 | # 标准差(方差) 14 | def stdev(self, X): 15 | avg = self.mean(X) 16 | return math.sqrt(sum([pow(x - avg, 2) for x in X]) / float(len(X))) 17 | 18 | # 概率密度函数 19 | def gaussian_probability(self, x, mean, stdev): 20 | exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2)))) 21 | return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent 22 | 23 | # 处理X_train 24 | def summarize(self, train_data): 25 | summaries = [(self.mean(i), self.stdev(i)) for i in zip(*train_data)] 26 | return summaries 27 | 28 | # 分类别求出数学期望和标准差 29 | def fit(self, X, y): 30 | labels = list(set(y)) 31 | data = {label: [] for label in labels} 32 | for f, label in zip(X, y): 33 | data[label].append(f) 34 | self.model = {label: self.summarize(value) for label, value in data.items()} 35 | return 'GaussianNB train done!' 
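    # After fit(), self.model maps each class label to a list of (mean, stdev) pairs, one pair per feature.
    # predict() below scores a sample by multiplying the per-feature Gaussian densities and returns the label
    # with the largest product, i.e. Gaussian naive Bayes with the class prior P(y) left out (treated as uniform).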
36 | 37 | # 计算概率 38 | def calculate_probabilities(self, input_data): 39 | # summaries:{0.0: [(5.0, 0.37),(3.42, 0.40)], 1.0: [(5.8, 0.449),(2.7, 0.27)]} 40 | # input_data:[1.1, 2.2] 41 | probabilities = {} 42 | for label, value in self.model.items(): 43 | probabilities[label] = 1 44 | for i in range(len(value)): 45 | mean, stdev = value[i] 46 | probabilities[label] *= self.gaussian_probability(input_data[i], mean, stdev) 47 | return probabilities 48 | 49 | # 类别 50 | def predict(self, X_test): 51 | # {0.0: 2.9680340789325763e-27, 1.0: 3.5749783019849535e-26} 52 | label = sorted(self.calculate_probabilities(X_test).items(), key=lambda x: x[-1])[-1][0] 53 | return label 54 | 55 | def score(self, X_test, y_test): 56 | right = 0 57 | for X, y in zip(X_test, y_test): 58 | label = self.predict(X) 59 | if label == y: 60 | right += 1 61 | 62 | return right / float(len(X_test)) 63 | 64 | 65 | import numpy as np 66 | import pandas as pd 67 | 68 | import matplotlib.pyplot as plt 69 | # %matplotlib inline 70 | 71 | from sklearn.datasets import load_iris 72 | from sklearn.model_selection import train_test_split 73 | 74 | iris = load_iris() 75 | X = iris.data 76 | Y = iris.target 77 | ''' 78 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 79 | df['label'] = iris.target 80 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 81 | data = np.array(df.iloc[:100, :]) 82 | # print(data) 83 | ''' 84 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3) 85 | model = NaiveBayes() 86 | model.fit(X_train, y_train) 87 | print(model.score(X_test, y_test)) 88 | print(model.predict([4.4, 3.2, 1.3, 0.2])) 89 | 90 | from sklearn.naive_bayes import GaussianNB 91 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 92 | Y = np.array([1, 1, 1, 2, 2, 2]) 93 | Y = np.array(['a', 'a', 'a', 'b', 'b', 'b']) 94 | 95 | clf = GaussianNB(priors=None, var_smoothing=1e-09) 96 | clf.fit(X, Y) 97 | print(clf.predict([[-0.8, -1]])) 98 | 99 | from sklearn import datasets 100 | iris = datasets.load_iris() 101 | 102 | from sklearn.naive_bayes import MultinomialNB 103 | clf = MultinomialNB() 104 | clf = clf.fit(iris.data, iris.target) 105 | y_pred=clf.predict(iris.data) 106 | print("多项分布朴素贝叶斯,样本总数: %d 错误样本数 : %d" % (iris.data.shape[0],(iris.target != y_pred).sum())) -------------------------------------------------------------------------------- /Code_2022/class5-test2.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn.naive_bayes import GaussianNB 3 | #from sklearn.cross_validation import train_test_split 4 | from sklearn.model_selection import train_test_split 5 | 6 | from sklearn.naive_bayes import BernoulliNB, MultinomialNB # 伯努利模型和多项式模型 7 | 8 | iris = load_iris() 9 | X = iris.data 10 | Y = iris.target 11 | 12 | X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0) 13 | 14 | nb = GaussianNB() 15 | nb.fit(X_train, y_train) 16 | 17 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 18 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 19 | 20 | print("Naive Gausian bayes score (sklearn): " +str(nb.score(X_test, y_test))) 21 | 22 | nb = MultinomialNB() 23 | nb.fit(X_train, y_train) 24 | 25 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 26 | print("Number of mislabeled points out of a total %d points : %d"% 
(iris.data.shape[0],(iris.target != y_pred).sum())) 27 | 28 | print("Multinomial naive Bayes score (sklearn): " +str(nb.score(X_test, y_test))) 29 | 30 | nb = BernoulliNB() 31 | nb.fit(X_train, y_train) 32 | 33 | y_pred = nb.fit(iris.data, iris.target).predict(iris.data) 34 | print("Number of mislabeled points out of a total %d points : %d"% (iris.data.shape[0],(iris.target != y_pred).sum())) 35 | 36 | print("Bernoulli naive Bayes score (sklearn): " +str(nb.score(X_test, y_test))) 37 | 38 | print(nb) -------------------------------------------------------------------------------- /Code_2022/class6-test1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | # %matplotlib inline 4 | 5 | import math 6 | 7 | p = np.linspace(0.01, 1, num=50, endpoint=False) 8 | 9 | entropy = -p * np.log2(p) - (1 - p) * np.log2(1 - p) 10 | 11 | # plt.plot(b) 12 | plt.plot(p, entropy) 13 | plt.grid(True) 14 | plt.xlabel('p') 15 | plt.ylabel('Entropy(bit)') 16 | # plt.plot(p,gini) 17 | 18 | max_en = 2 * (-(1 / 2) * np.log2(1 / 2)) 19 | print(max_en) 20 | 21 | d = np.linspace(0.01, 100, num=50, endpoint=False) 22 | ld = np.log2(d) 23 | plt.show() 24 | -------------------------------------------------------------------------------- /Code_2022/class6-test2.py: -------------------------------------------------------------------------------- 1 | def create_data(): 2 | datasets = [[1, 'Sunny', 'Hot', 'High', 'Weak', 'No'], 3 | [2, 'Sunny', 'Hot', 'High', 'Strong', 'No'], 4 | [3, 'Overcast', 'Hot', 'High', 'Weak', 'Yes'], 5 | [4, 'Rainy', 'Mild', 'High', 'Weak', 'Yes'], 6 | [5, 'Rainy', 'Cool', 'Normal', 'Weak', 'Yes'], 7 | [6, 'Rainy', 'Cool', 'Normal', 'Strong', 'No'], 8 | [7, 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'], 9 | [8, 'Sunny', 'Mild', 'High', 'Weak', 'No'], 10 | [9, 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'], 11 | [10, 'Rainy', 'Mild', 'Normal', 'Weak', 'Yes'], 12 | [11, 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'], 13 | [12, 'Overcast', 'Mild', 'High', 'Strong', 'Yes'], 14 | [13, 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'], 15 | [14, 'Rainy', 'Mild', 'High', 'Strong', 'No'], 16 | ] 17 | 18 | labels = ['Day', 'OutLook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis'] 19 | return datasets, labels 20 | # return the dataset and the name of each column 21 | 22 | 23 | import pandas as pd 24 | from math import log2 25 | 26 | datasets, labels = create_data() 27 | 28 | train_data = pd.DataFrame(datasets, columns=labels) 29 | 30 | print(train_data) 31 | 32 | # entropy of PlayTennis within each Outlook branch 33 | En_Sunny = -(2 / 5) * log2(2 / 5) - (3 / 5) * log2(3 / 5) 34 | En_Overcast = -(4 / 4) * log2(4 / 4) 35 | En_Rainy = -(3 / 5) * log2(3 / 5) - (2 / 5) * log2(2 / 5) 36 | 37 | # conditional entropy of PlayTennis given Outlook 38 | En_Outlook = 5 / 14 * En_Sunny + 4 / 14 * En_Overcast + 5 / 14 * En_Rainy 39 | 40 | print(En_Outlook) 41 | 42 | from sklearn import tree 43 | X = [[0, 0], [1, 1]] 44 | Y = [0, 1] 45 | clf = tree.DecisionTreeClassifier() 46 | clf = clf.fit(X, Y) 47 | 48 | print(clf.predict([[2., 2.]])) 49 | print(clf.predict_proba([[2., 2.]])) -------------------------------------------------------------------------------- /Code_2022/class7-test1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | # %matplotlib inline 4 | 5 | import math 6 | 7 | def sigmoid(x): 8 | return 1 / (1 + np.exp(-x)) 9 | 10 | 11 | x = np.arange(-10, 10., 0.1) 12 | y = sigmoid(x) 13 | 14 | plt.plot(x, y) 15 | plt.grid(True) 16 | plt.show() 17 |
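# Supplementary sketch (toy numbers, not from the course files): the sigmoid above maps a linear
# score w.x + b to a probability P(y=1|x); logistic regression, implemented from scratch in
# class7-test2.py, fits w and b by gradient ascent on the log-likelihood with the per-sample update
#   w <- w + lr * (y - P(y=1|x)) * x,   b <- b + lr * (y - P(y=1|x))
w, b, lr = np.zeros(2), 0.0, 0.1
x_i, y_i = np.array([0.5, -1.0]), 1                        # one made-up training sample
p_i = 1 / (1 + np.exp(-(np.dot(w, x_i) + b)))              # predicted P(y=1|x_i), 0.5 at the start
w, b = w + lr * (y_i - p_i) * x_i, b + lr * (y_i - p_i)    # single gradient-ascent step
print(w, b)                                                # -> [ 0.025 -0.05 ] 0.05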
-------------------------------------------------------------------------------- /Code_2022/class7-test2.py: -------------------------------------------------------------------------------- 1 | from math import exp 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | # %matplotlib inline 6 | 7 | from sklearn.datasets import load_iris 8 | from sklearn.model_selection import train_test_split 9 | 10 | 11 | class LogisticReressionClassifier: 12 | def __init__(self, max_iter=200, learning_rate=0.01): 13 | self.max_iter = max_iter 14 | self.learning_rate = learning_rate 15 | 16 | def sigmoid(self, x): 17 | return 1 / (1 + exp(-x)) 18 | 19 | def data_matrix(self, X): 20 | data_mat = [] 21 | for d in X: 22 | data_mat.append([1.0, *d]) 23 | return data_mat 24 | 25 | def fit(self, X, y): 26 | # label = np.mat(y) 27 | data_mat = self.data_matrix(X) # m*n 28 | self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32) 29 | 30 | for iter_ in range(self.max_iter): 31 | for i in range(len(X)): 32 | result = self.sigmoid(np.dot(data_mat[i], self.weights)) 33 | error = y[i] - result 34 | self.weights += self.learning_rate * error * np.transpose([data_mat[i]]) 35 | print('LogisticRegression Model(learning_rate={},max_iter={})'.format(self.learning_rate, self.max_iter)) 36 | 37 | # def f(self, x): 38 | # return -(self.weights[0] + self.weights[1] * x) / self.weights[2] 39 | 40 | def score(self, X_test, y_test): 41 | right = 0 42 | X_test = self.data_matrix(X_test) 43 | for x, y in zip(X_test, y_test): 44 | result = np.dot(x, self.weights) 45 | if (result > 0 and y == 1) or (result < 0 and y == 0): 46 | right += 1 47 | return right / len(X_test) 48 | 49 | 50 | def create_data(): 51 | iris = load_iris() 52 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 53 | df['label'] = iris.target 54 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 55 | data = np.array(df.iloc[:100, [0, 1, -1]]) 56 | # print(data) 57 | return data[:, :2], data[:, -1] 58 | 59 | 60 | X, y = create_data() 61 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 62 | lr_clf = LogisticReressionClassifier() 63 | lr_clf.fit(X_train, y_train) 64 | print(lr_clf.score(X_test, y_test)) 65 | 66 | x_ponits = np.arange(4, 8) 67 | y_ = -(lr_clf.weights[1] * x_ponits + lr_clf.weights[0]) / lr_clf.weights[2] 68 | plt.plot(x_ponits, y_) 69 | 70 | # lr_clf.show_graph() 71 | plt.scatter(X[:50, 0], X[:50, 1], label='0') 72 | plt.scatter(X[50:, 0], X[50:, 1], label='1') 73 | plt.legend() 74 | plt.show() 75 | -------------------------------------------------------------------------------- /Code_2022/class7-test3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from matplotlib import pyplot as plt 4 | from sklearn.datasets import load_iris 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.model_selection import train_test_split 7 | 8 | 9 | def create_data(): 10 | iris = load_iris() 11 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 12 | df['label'] = iris.target 13 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 14 | data = np.array(df.iloc[:100, [0, 1, -1]]) 15 | # print(data) 16 | return data[:, :2], data[:, -1] 17 | 18 | 19 | X, y = create_data() 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 21 | clf = LogisticRegression(max_iter=200, solver='liblinear') 22 | 
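# The liblinear solver fits a regularized two-class logistic regression on the 70/30 split below;
# the plotted line is the decision boundary coef_[0][0]*x0 + coef_[0][1]*x1 + intercept_ = 0,
# solved for x1 over x0 in [4, 8) and overlaid on the two iris classes.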
clf.fit(X_train, y_train) 23 | clf.score(X_test, y_test) 24 | print(clf.coef_, clf.intercept_) 25 | 26 | x_ponits = np.arange(4, 8) 27 | y_ = -(clf.coef_[0][0] * x_ponits + clf.intercept_) / clf.coef_[0][1] 28 | plt.plot(x_ponits, y_) 29 | 30 | plt.plot(X[:50, 0], X[:50, 1], 'bo', color='blue', label='0') 31 | plt.plot(X[50:, 0], X[50:, 1], 'bo', color='orange', label='1') 32 | plt.xlabel('sepal length') 33 | plt.ylabel('sepal width') 34 | plt.legend() 35 | plt.show() 36 | -------------------------------------------------------------------------------- /Code_2022/class7-test4.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn import datasets 5 | 6 | # import some data to play with 7 | iris = datasets.load_iris() 8 | X = iris.data[:, :2] # we only take the first two features. 9 | Y = iris.target 10 | 11 | logreg = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial') 12 | 13 | # Create an instance of Logistic Regression Classifier and fit the data. 14 | logreg.fit(X, Y) 15 | 16 | # Plot the decision boundary. For that, we will assign a color to each 17 | # point in the mesh [x_min, x_max]x[y_min, y_max]. 18 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 19 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 20 | h = .02 # step size in the mesh 21 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 22 | Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()]) 23 | 24 | # Put the result into a color plo 25 | Z = Z.reshape(xx.shape) 26 | plt.figure(1, figsize=(4, 3)) 27 | plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired) 28 | 29 | # Plot also the training points 30 | plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired) 31 | plt.xlabel('Sepal length') 32 | plt.ylabel('Sepal width') 33 | 34 | plt.xlim(xx.min(), xx.max()) 35 | plt.ylim(yy.min(), yy.max()) 36 | plt.xticks(()) 37 | plt.yticks(()) 38 | plt.show() 39 | -------------------------------------------------------------------------------- /Code_2022/class8-test1.py: -------------------------------------------------------------------------------- 1 | # Example 1 2 | import numpy as np 3 | 4 | X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) 5 | y = np.array([1, 1, 2, 2]) 6 | 7 | from sklearn.svm import SVC 8 | 9 | # clf = SVC(gamma='auto') 10 | 11 | clf = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, 12 | decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear', 13 | max_iter=-1, probability=False, random_state=None, shrinking=True, 14 | tol=0.001, verbose=False) # 可以根据前面介绍的参数,做出相应改变观察结果变化 15 | 16 | clf.fit(X, y) 17 | print(clf.predict([[-0.8, -1]])) 18 | 19 | print(clf.support_vectors_) 20 | print(clf.dual_coef_, clf.coef_, clf.intercept_) 21 | -------------------------------------------------------------------------------- /Code_2022/class8-test2.py: -------------------------------------------------------------------------------- 1 | # Example 2 2 | from sklearn import svm 3 | from sklearn import datasets 4 | from sklearn.model_selection import train_test_split as ts 5 | 6 | # import our data 7 | iris = datasets.load_iris() 8 | X = iris.data 9 | y = iris.target 10 | 11 | # split the data to 7:3 12 | X_train, X_test, y_train, y_test = ts(X, y, test_size=0.3) 13 | 14 | # select different type of kernel function and compare the score 15 | 16 | # kernel = 'rbf' 17 | clf_rbf = 
svm.SVC(kernel='rbf', gamma='auto') 18 | clf_rbf.fit(X_train, y_train) 19 | score_rbf = clf_rbf.score(X_test, y_test) 20 | print("The score of rbf is : %f" % score_rbf) 21 | 22 | # kernel = 'linear' 23 | clf_linear = svm.SVC(kernel='linear', gamma='auto') 24 | clf_linear.fit(X_train, y_train) 25 | score_linear = clf_linear.score(X_test, y_test) 26 | print("The score of linear is : %f" % score_linear) 27 | 28 | # kernel = 'poly' 29 | clf_poly = svm.SVC(kernel='poly', gamma='auto') 30 | clf_poly.fit(X_train, y_train) 31 | score_poly = clf_poly.score(X_test, y_test) 32 | print("The score of poly is : %f" % score_poly) 33 | 34 | print(clf_linear.coef_, clf_linear.intercept_) 35 | 36 | # print(clf.predict([[4.9, 3., 1.4, 0.2]])) 37 | -------------------------------------------------------------------------------- /Code_2022/class8-test3.py: -------------------------------------------------------------------------------- 1 | from sklearn import svm 2 | from sklearn.svm import SVR 3 | 4 | X = [[0, 0], [2, 2]] 5 | y = [0.5, 2.5] 6 | clf = svm.SVR() 7 | clf.fit(X, y) 8 | SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, 9 | gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True, 10 | tol=0.001, verbose=False) 11 | print(clf.predict([[1, 1]])) 12 | 13 | from sklearn.svm import LinearSVR 14 | 15 | regr = LinearSVR(random_state=0, tol=1e-5) 16 | regr.fit(X, y) 17 | print(regr.coef_) 18 | 19 | print(regr.intercept_) 20 | print(regr.predict([[1, 1]])) 21 | -------------------------------------------------------------------------------- /Code_2022/class8-test4.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import LinearSVR 2 | 3 | regr = LinearSVR(random_state=0, tol=1e-5) 4 | regr.fit(X, y) 5 | print(regr.coef_) 6 | 7 | print(regr.intercept_) 8 | print(regr.predict([[1, 1]])) 9 | -------------------------------------------------------------------------------- /Code_2022/class9-test1.py: -------------------------------------------------------------------------------- 1 | class AdaBoost: 2 | def __init__(self, n_estimators=50, learning_rate=1.0): 3 | self.clf_num = n_estimators 4 | self.learning_rate = learning_rate 5 | 6 | def init_args(self, datasets, labels): 7 | 8 | self.X = datasets 9 | self.Y = labels 10 | self.M, self.N = datasets.shape 11 | 12 | # 弱分类器数目和集合 13 | self.clf_sets = [] 14 | 15 | # 初始化weights 16 | self.weights = [1.0 / self.M] * self.M 17 | 18 | # G(x)系数 alpha 19 | self.alpha = [] 20 | 21 | def _G(self, features, labels, weights): 22 | m = len(features) 23 | error = 100000.0 # 无穷大 24 | best_v = 0.0 25 | # 单维features 26 | features_min = min(features) 27 | features_max = max(features) 28 | n_step = (features_max - features_min + self.learning_rate) // self.learning_rate 29 | # print('n_step:{}'.format(n_step)) 30 | direct, compare_array = None, None 31 | for i in range(1, int(n_step)): 32 | v = features_min + self.learning_rate * i 33 | 34 | if v not in features: 35 | # 误分类计算 36 | compare_array_positive = np.array([1 if features[k] > v else -1 for k in range(m)]) 37 | weight_error_positive = sum([weights[k] for k in range(m) if compare_array_positive[k] != labels[k]]) 38 | 39 | compare_array_nagetive = np.array([-1 if features[k] > v else 1 for k in range(m)]) 40 | weight_error_nagetive = sum([weights[k] for k in range(m) if compare_array_nagetive[k] != labels[k]]) 41 | 42 | if weight_error_positive < weight_error_nagetive: 43 | weight_error = weight_error_positive 44 | _compare_array = 
compare_array_positive 45 | direct = 'positive' 46 | else: 47 | weight_error = weight_error_nagetive 48 | _compare_array = compare_array_nagetive 49 | direct = 'nagetive' 50 | 51 | # print('v:{} error:{}'.format(v, weight_error)) 52 | if weight_error < error: 53 | error = weight_error 54 | compare_array = _compare_array 55 | best_v = v 56 | return best_v, direct, error, compare_array 57 | 58 | # 计算alpha 59 | def _alpha(self, error): 60 | return 0.5 * np.log((1 - error) / error) 61 | 62 | # 规范化因子 63 | def _Z(self, weights, a, clf): 64 | return sum([weights[i] * np.exp(-1 * a * self.Y[i] * clf[i]) for i in range(self.M)]) 65 | 66 | # 权值更新 67 | def _w(self, a, clf, Z): 68 | for i in range(self.M): 69 | self.weights[i] = self.weights[i] * np.exp(-1 * a * self.Y[i] * clf[i]) / Z 70 | 71 | # G(x)的线性组合 72 | def _f(self, alpha, clf_sets): 73 | pass 74 | 75 | def G(self, x, v, direct): 76 | if direct == 'positive': 77 | return 1 if x > v else -1 78 | else: 79 | return -1 if x > v else 1 80 | 81 | def fit(self, X, y): 82 | self.init_args(X, y) 83 | 84 | for epoch in range(self.clf_num): 85 | best_clf_error, best_v, clf_result = 100000, None, None 86 | # 根据特征维度, 选择误差最小的 87 | for j in range(self.N): 88 | features = self.X[:, j] 89 | # 分类阈值,分类误差,分类结果 90 | v, direct, error, compare_array = self._G(features, self.Y, self.weights) 91 | 92 | if error < best_clf_error: 93 | best_clf_error = error 94 | best_v = v 95 | final_direct = direct 96 | clf_result = compare_array 97 | axis = j 98 | 99 | # print('epoch:{}/{} feature:{} error:{} v:{}'.format(epoch, self.clf_num, j, error, best_v)) 100 | if best_clf_error == 0: 101 | break 102 | 103 | # 计算G(x)系数a 104 | a = self._alpha(best_clf_error) 105 | self.alpha.append(a) 106 | # 记录分类器 107 | self.clf_sets.append((axis, best_v, final_direct)) 108 | # 规范化因子 109 | Z = self._Z(self.weights, a, clf_result) 110 | # 权值更新 111 | self._w(a, clf_result, Z) 112 | 113 | # print('classifier:{}/{} error:{:.3f} v:{} direct:{} a:{:.5f}'.format(epoch+1, self.clf_num, error, best_v, final_direct, a)) 114 | # print('weight:{}'.format(self.weights)) 115 | # print('\n') 116 | 117 | def predict(self, feature): 118 | result = 0.0 119 | for i in range(len(self.clf_sets)): 120 | axis, clf_v, direct = self.clf_sets[i] 121 | f_input = feature[axis] 122 | result += self.alpha[i] * self.G(f_input, clf_v, direct) 123 | # sign 124 | return 1 if result > 0 else -1 125 | 126 | def score(self, X_test, y_test): 127 | right_count = 0 128 | for i in range(len(X_test)): 129 | feature = X_test[i] 130 | if self.predict(feature) == y_test[i]: 131 | right_count += 1 132 | 133 | return right_count / len(X_test) 134 | 135 | 136 | import numpy as np 137 | import pandas as pd 138 | from sklearn.datasets import load_iris 139 | from sklearn.model_selection import train_test_split 140 | import matplotlib.pyplot as plt 141 | 142 | 143 | # %matplotlib inline 144 | 145 | def create_data(): 146 | iris = load_iris() 147 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 148 | df['label'] = iris.target 149 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 150 | data = np.array(df.iloc[:100, [0, 1, -1]]) 151 | for i in range(len(data)): 152 | if data[i, -1] == 0: 153 | data[i, -1] = -1 154 | # print(data) 155 | return data[:, :2], data[:, -1] 156 | 157 | 158 | X, y = create_data() 159 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 160 | clf = AdaBoost(n_estimators=3, learning_rate=0.5) 161 | clf.fit(X_train, y_train) 162 | print(clf.score(X_test, 
y_test)) 163 | -------------------------------------------------------------------------------- /Code_2022/class9-test2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.datasets import load_iris 4 | from sklearn.ensemble import BaggingClassifier 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.neighbors import KNeighborsClassifier 7 | 8 | 9 | def create_data(): 10 | iris = load_iris() 11 | df = pd.DataFrame(iris.data, columns=iris.feature_names) 12 | df['label'] = iris.target 13 | df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label'] 14 | data = np.array(df.iloc[:100, [0, 1, -1]]) 15 | for i in range(len(data)): 16 | if data[i, -1] == 0: 17 | data[i, -1] = -1 18 | # print(data) 19 | return data[:, :2], data[:, -1] 20 | 21 | 22 | X, y = create_data() 23 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 24 | # bagging 算法 25 | bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5) 26 | bagging.fit(X_train, y_train) 27 | 28 | in_score = bagging.score(X_train, y_train) 29 | out_score = bagging.score(X_test, y_test) 30 | print(in_score, out_score) 31 | 32 | # RandomForest 算法 33 | from sklearn.ensemble import RandomForestClassifier 34 | 35 | forest = RandomForestClassifier(n_estimators=300, max_depth=2, random_state=0) 36 | forest.fit(X_train, y_train) 37 | 38 | in_score = forest.score(X_train, y_train) 39 | out_score = forest.score(X_test, y_test) 40 | print(in_score, out_score) 41 | 42 | # Adaboost 算法 43 | 44 | from sklearn.ensemble import AdaBoostClassifier 45 | 46 | clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) 47 | clf.fit(X_train, y_train) 48 | in_score = clf.score(X_train, y_train) 49 | out_score = clf.score(X_test, y_test) 50 | print(in_score, out_score) 51 | 52 | # 投票分类器 53 | from sklearn.linear_model import LogisticRegression 54 | from sklearn.naive_bayes import GaussianNB 55 | from sklearn.ensemble import RandomForestClassifier 56 | 57 | from sklearn.ensemble import VotingClassifier 58 | 59 | import numpy as np 60 | 61 | clf1 = LogisticRegression(multi_class='multinomial', random_state=1) 62 | clf2 = RandomForestClassifier(n_estimators=50, random_state=1) 63 | clf3 = GaussianNB() 64 | 65 | X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) 66 | y = np.array([1, 1, 1, 2, 2, 2]) 67 | eclf1 = VotingClassifier(estimators=[ 68 | ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard') 69 | eclf1 = eclf1.fit(X_train, y_train) 70 | # print(eclf1.predict(X)) 71 | in_score = eclf1.score(X_train, y_train) 72 | out_score = eclf1.score(X_test, y_test) 73 | print(in_score, out_score) 74 | 75 | 76 | from sklearn.datasets import load_iris 77 | from sklearn.ensemble import RandomForestClassifier 78 | from sklearn.svm import LinearSVC 79 | from sklearn.linear_model import LogisticRegression 80 | from sklearn.preprocessing import StandardScaler 81 | from sklearn.pipeline import make_pipeline 82 | from sklearn.ensemble import StackingClassifier 83 | 84 | 85 | X, y = load_iris(return_X_y=True) 86 | estimators = [ 87 | ('rf', RandomForestClassifier(n_estimators=10, random_state=42)), 88 | ('svr', make_pipeline(StandardScaler(), 89 | LinearSVC(random_state=42)))] 90 | clf = StackingClassifier( 91 | estimators=estimators, final_estimator=LogisticRegression() 92 | ) 93 | 94 | from sklearn.model_selection import train_test_split 95 | X_train, X_test, y_train, y_test 
= train_test_split( 96 | X, y, stratify=y, random_state=42 97 | ) 98 | clf.fit(X_train, y_train) 99 | print(clf.score(X_test, y_test)) -------------------------------------------------------------------------------- /Code_2022/class9-test3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import matplotlib.gridspec as gridspec 4 | import itertools 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.naive_bayes import GaussianNB 7 | from sklearn.neighbors import KNeighborsClassifier 8 | from sklearn.svm import SVC 9 | from sklearn.ensemble import RandomForestClassifier, StackingClassifier 10 | 11 | from mlxtend.classifier import EnsembleVoteClassifier 12 | from mlxtend.data import iris_data 13 | from mlxtend.plotting import plot_decision_regions 14 | 15 | # Initializing Classifiers 16 | clf1 = LogisticRegression(random_state=0) 17 | clf2 = RandomForestClassifier(random_state=0) 18 | clf3 = SVC(random_state=0, probability=True) 19 | eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], 20 | weights=[2, 1, 1], voting='soft') 21 | 22 | # clf1 = KNeighborsClassifier(n_neighbors=1) 23 | # clf2 = RandomForestClassifier(random_state=1) 24 | # clf3 = GaussianNB() 25 | # lr = LogisticRegression() 26 | # sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 27 | # meta_classifier=lr) 28 | 29 | # Loading some example data 30 | X, y = iris_data() 31 | X = X[:, [0, 2]] 32 | 33 | # Plotting Decision Regions 34 | 35 | gs = gridspec.GridSpec(2, 2) 36 | fig = plt.figure(figsize=(10, 8)) 37 | 38 | labels = ['Logistic Regression', 39 | 'Random Forest', 40 | 'RBF kernel SVM', 41 | 'Ensemble'] 42 | 43 | for clf, lab, grd in zip([clf1, clf2, clf3, eclf], 44 | labels, 45 | itertools.product([0, 1], 46 | repeat=2)): 47 | clf.fit(X, y) 48 | ax = plt.subplot(gs[grd[0], grd[1]]) 49 | fig = plot_decision_regions(X=X, y=y, 50 | clf=clf, legend=2) 51 | plt.title(lab) 52 | 53 | plt.show() 54 | -------------------------------------------------------------------------------- /Code_2022/readme: -------------------------------------------------------------------------------- 1 | 2022 春季学期代码整理。 - 2023.2.8 上传 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17th, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Statistical-Learning-Slides-Code 2 | 3 | ## 2025 - Slides and reference code for the course 《数据挖掘技术》 (Data Mining Techniques) 4 | 5 | Last updated: 2025.02.25 6 | 7 | Update history: 8 | - 2024.02.26 9 | - 2023.02.08 10 | 11 | #### Textbook: 《机器学习方法》 (Machine Learning Methods), Li Hang 12 | 13 | Previous textbooks: 14 | - 《机器学习方法》, Li Hang 15 | - 《统计学习方法》 (Statistical Learning Methods, 2nd edition), Li Hang 16 | - 《统计学习方法》 (Statistical Learning Methods, 1st edition), Li Hang 17 | 18 | ## Contents 19 | 20 | #### Code 21 | - Code: reference code organized in 2020 22 | - Code_2022: code organized in 2022 23 | - Thanks to the student with ID 20195298 24 | 25 | #### Slides 26 | - CH00.pdf (2022.02.20 11:28) 27 | - CH01 Statistical Learning.pdf 28 | - CH02 Perceptron.pdf 29 | - CH03 KNN.pdf 30 | - CH04 NaiveBayes.pdf 31 | - CH05 DecisionTree.pdf (2022.03.26 22:04) 32 | - CH06 LogicRegression and Maximum Entropy Model.pdf (2022.03.26 22:04) 33 | - CH07 SVM.pdf (2022.03.26 22:04) 34 | - CH08 Boosting.pdf (2022.03.26 22:04) 35 | - CH09 EM.pdf (2022.03.26 22:04) 36 | - CH10 Hidden Markov Model.pdf (2022.03.26 22:04) 37 | - CH15 SVD.pdf (2022.03.26 22:04) 38 | - CH16 PCA.pdf (2022.03.26 22:04) 39 | - CH17 LSA.pdf (2022.03.26 22:04) 40 | - CH21 PageRank (2021.05.02 16:52) 41 | - CH22 Transformer.pdf (2025.02.25) 42 | - CHX0 Summary (2021.05.17 16:21) 43 | - CHX5 NN-CNN.pdf (2025.02.25) 44 | 45 | ------- 46 | ## History versions: 47 | 48 | - CH00.pdf (2021.03.23 16:03) 49 | - CH01 Statistical Learning.pdf 50 | - CH02 Perceptron.pdf 51 | - CH03 KNN.pdf 52 | - CH04 NaiveBayes.pdf 53 | - CH05 DecisionTree.pdf 54 | - CH06 LogicRegression and Maximum Entropy Model.pdf 55 | - CH07 SVM.pdf 56 | - CH08 Boosting.pdf (2021.03.26 22:04) 57 | - CH09 EM.pdf (2021.03.30 16:47) 58 | - CH10 Hidden Markov Model.pdf (2021.04.06 16:14) 59 | - CH15 SVD.pdf (2021.04.12 12:09) 60 | - CH16 PCA.pdf (2021.04.16 10:00) 61 | - CH17 LSA.pdf (2021.04.25 14:32) 62 | 63 | ...... End ......
64 | -------------------------------------------------------------------------------- /Slides/A Step by Step Backpropagation Example for Regression using an One-hot Encoded Categorical Variable .pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/A Step by Step Backpropagation Example for Regression using an One-hot Encoded Categorical Variable .pdf -------------------------------------------------------------------------------- /Slides/CH00 OverView.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH00 OverView.pdf -------------------------------------------------------------------------------- /Slides/CH01 Statistical Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH01 Statistical Learning.pdf -------------------------------------------------------------------------------- /Slides/CH02 Perceptron.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH02 Perceptron.pdf -------------------------------------------------------------------------------- /Slides/CH03 KNN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH03 KNN.pdf -------------------------------------------------------------------------------- /Slides/CH04 NaiveBayes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH04 NaiveBayes.pdf -------------------------------------------------------------------------------- /Slides/CH05 DecisionTree.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH05 DecisionTree.pdf -------------------------------------------------------------------------------- /Slides/CH06 LogicRegression and Maximum Entropy Model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH06 LogicRegression and Maximum Entropy Model.pdf -------------------------------------------------------------------------------- /Slides/CH07 SVM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH07 SVM.pdf -------------------------------------------------------------------------------- /Slides/CH08 Boosting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH08 Boosting.pdf 
-------------------------------------------------------------------------------- /Slides/CH09 EM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH09 EM.pdf -------------------------------------------------------------------------------- /Slides/CH10 Hidden Markov Model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH10 Hidden Markov Model.pdf -------------------------------------------------------------------------------- /Slides/CH16 PCA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH16 PCA.pdf -------------------------------------------------------------------------------- /Slides/CH21 PageRank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH21 PageRank.pdf -------------------------------------------------------------------------------- /Slides/CH22 Transformer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CH22 Transformer.pdf -------------------------------------------------------------------------------- /Slides/CHX0 Summary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CHX0 Summary.pdf -------------------------------------------------------------------------------- /Slides/CHX5 NN-CNN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wjssx/Statistical-Learning-Slides-Code/e8bd3a28a58a8102ede42671d96780f657c50cd7/Slides/CHX5 NN-CNN.pdf --------------------------------------------------------------------------------